Home | History | Annotate | Download | only in priv
      1 
      2 /*--------------------------------------------------------------------*/
      3 /*--- begin                                     guest_amd64_toIR.c ---*/
      4 /*--------------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Valgrind, a dynamic binary instrumentation
      8    framework.
      9 
     10    Copyright (C) 2004-2012 OpenWorks LLP
     11       info (at) open-works.net
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     26    02110-1301, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 
     30    Neither the names of the U.S. Department of Energy nor the
     31    University of California nor the names of its contributors may be
     32    used to endorse or promote products derived from this software
     33    without prior written permission.
     34 */
     35 
     36 /* Translates AMD64 code to IR. */
     37 
     38 /* TODO:
     39 
     40    All Puts to CC_OP/CC_DEP1/CC_DEP2/CC_NDEP should really be checked
     41    to ensure a 64-bit value is being written.
     42 
     43    x87 FP Limitations:
     44 
     45    * all arithmetic done at 64 bits
     46 
     47    * no FP exceptions, except for handling stack over/underflow
     48 
     49    * FP rounding mode observed only for float->int conversions and
     50      int->float conversions which could lose accuracy, and for
     51      float-to-float rounding.  For all other operations,
     52      round-to-nearest is used, regardless.
     53 
     54    * FP sin/cos/tan/sincos: C2 flag is always cleared.  IOW the
     55      simulation claims the argument is in-range (-2^63 <= arg <= 2^63)
     56      even when it isn't.
     57 
     58    * some of the FCOM cases could do with testing -- not convinced
     59      that the args are the right way round.
     60 
     61    * FSAVE does not re-initialise the FPU; it should do
     62 
     63    * FINIT not only initialises the FPU environment, it also zeroes
     64      all the FP registers.  It should leave the registers unchanged.
     65 
     66     RDTSC returns zero, always.
     67 
     68     SAHF should cause eflags[1] == 1, and in fact it produces 0.  As
     69     per Intel docs this bit has no meaning anyway.  Since PUSHF is the
     70     only way to observe eflags[1], a proper fix would be to make that
     71     bit be set by PUSHF.
     72 
     73     This module uses global variables and so is not MT-safe (if that
     74     should ever become relevant).
     75 */
     76 
     77 /* Notes re address size overrides (0x67).
     78 
     79    According to the AMD documentation (24594 Rev 3.09, Sept 2003,
     80    "AMD64 Architecture Programmer's Manual Volume 3: General-Purpose
     81    and System Instructions"), Section 1.2.3 ("Address-Size Override
     82    Prefix"):
     83 
     84    0x67 applies to all explicit memory references, causing the top
     85    32 bits of the effective address to become zero.
     86 
     87    0x67 has no effect on stack references (push/pop); these always
     88    use a 64-bit address.
     89 
     90    0x67 changes the interpretation of instructions which implicitly
     91    reference RCX/RSI/RDI, so that in fact ECX/ESI/EDI are used
     92    instead.  These are:
     93 
     94       cmp{s,sb,sw,sd,sq}
     95       in{s,sb,sw,sd}
     96       jcxz, jecxz, jrcxz
     97       lod{s,sb,sw,sd,sq}
     98       loop{,e,bz,be,z}
     99       mov{s,sb,sw,sd,sq}
    100       out{s,sb,sw,sd}
    101       rep{,e,ne,nz}
    102       sca{s,sb,sw,sd,sq}
    103       sto{s,sb,sw,sd,sq}
    104       xlat{,b} */
    105 
    106 /* "Special" instructions.
    107 
    108    This instruction decoder can decode three special instructions
    109    which mean nothing natively (are no-ops as far as regs/mem are
    110    concerned) but have meaning for supporting Valgrind.  A special
    111    instruction is flagged by the 16-byte preamble 48C1C703 48C1C70D
    112    48C1C73D 48C1C733 (in the standard interpretation, that means: rolq
    113    $3, %rdi; rolq $13, %rdi; rolq $61, %rdi; rolq $51, %rdi).
    114    Following that, one of the following 3 are allowed (standard
    115    interpretation in parentheses):
    116 
    117       4887DB (xchgq %rbx,%rbx)   %RDX = client_request ( %RAX )
    118       4887C9 (xchgq %rcx,%rcx)   %RAX = guest_NRADDR
    119       4887D2 (xchgq %rdx,%rdx)   call-noredir *%RAX
    120 
    121    Any other bytes following the 16-byte preamble are illegal and
    122    constitute a failure in instruction decoding.  This all assumes
    123    that the preamble will never occur except in specific code
    124    fragments designed for Valgrind to catch.
    125 
    126    No prefixes may precede a "Special" instruction.
    127 */
    128 
    129 /* casLE (implementation of lock-prefixed insns) and rep-prefixed
    130    insns: the side-exit back to the start of the insn is done with
    131    Ijk_Boring.  This is quite wrong, it should be done with
    132    Ijk_NoRedir, since otherwise the side exit, which is intended to
    133    restart the instruction for whatever reason, could go somewhere
    134    entirely else.  Doing it right (with Ijk_NoRedir jumps) would make
    135    no-redir jumps performance critical, at least for rep-prefixed
    136    instructions, since all iterations thereof would involve such a
    137    jump.  It's not such a big deal with casLE since the side exit is
    138    only taken if the CAS fails, that is, the location is contended,
    139    which is relatively unlikely.
    140 
    141    Note also, the test for CAS success vs failure is done using
    142    Iop_CasCmp{EQ,NE}{8,16,32,64} rather than the ordinary
    143    Iop_Cmp{EQ,NE} equivalents.  This is so as to tell Memcheck that it
    144    shouldn't definedness-check these comparisons.  See
    145    COMMENT_ON_CasCmpEQ in memcheck/mc_translate.c for
    146    background/rationale.
    147 */
    148 
    149 /* LOCK prefixed instructions.  These are translated using IR-level
    150    CAS statements (IRCAS) and are believed to preserve atomicity, even
    151    from the point of view of some other process racing against a
    152    simulated one (presumably they communicate via a shared memory
    153    segment).
    154 
    155    Handlers which are aware of LOCK prefixes are:
    156       dis_op2_G_E      (add, or, adc, sbb, and, sub, xor)
    157       dis_cmpxchg_G_E  (cmpxchg)
    158       dis_Grp1         (add, or, adc, sbb, and, sub, xor)
    159       dis_Grp3         (not, neg)
    160       dis_Grp4         (inc, dec)
    161       dis_Grp5         (inc, dec)
    162       dis_Grp8_Imm     (bts, btc, btr)
    163       dis_bt_G_E       (bts, btc, btr)
    164       dis_xadd_G_E     (xadd)
    165 */
    166 
    167 
    168 #include "libvex_basictypes.h"
    169 #include "libvex_ir.h"
    170 #include "libvex.h"
    171 #include "libvex_guest_amd64.h"
    172 
    173 #include "main_util.h"
    174 #include "main_globals.h"
    175 #include "guest_generic_bb_to_IR.h"
    176 #include "guest_generic_x87.h"
    177 #include "guest_amd64_defs.h"
    178 
    179 
    180 /*------------------------------------------------------------*/
    181 /*--- Globals                                              ---*/
    182 /*------------------------------------------------------------*/
    183 
    184 /* These are set at the start of the translation of an insn, right
    185    down in disInstr_AMD64, so that we don't have to pass them around
    186    endlessly.  They are all constant during the translation of any
    187    given insn. */
    188 
    189 /* These are set at the start of the translation of a BB, so
    190    that we don't have to pass them around endlessly. */
    191 
    192 /* We need to know this to do sub-register accesses correctly. */
    193 static Bool host_is_bigendian;
    194 
    195 /* Pointer to the guest code area (points to start of BB, not to the
    196    insn being processed). */
    197 static UChar* guest_code;
    198 
    199 /* The guest address corresponding to guest_code[0]. */
    200 static Addr64 guest_RIP_bbstart;
    201 
    202 /* The guest address for the instruction currently being
    203    translated. */
    204 static Addr64 guest_RIP_curr_instr;
    205 
    206 /* The IRSB* into which we're generating code. */
    207 static IRSB* irsb;
    208 
    209 /* For ensuring that %rip-relative addressing is done right.  A read
    210    of %rip generates the address of the next instruction.  It may be
    211    that we don't conveniently know that inside disAMode().  For sanity
    212    checking, if the next insn %rip is needed, we make a guess at what
    213    it is, record that guess here, and set the accompanying Bool to
    214    indicate that -- after this insn's decode is finished -- that guess
    215    needs to be checked.  */
    216 
    217 /* At the start of each insn decode, is set to (0, False).
    218    After the decode, if _mustcheck is now True, _assumed is
    219    checked. */
    220 
    221 static Addr64 guest_RIP_next_assumed;
    222 static Bool   guest_RIP_next_mustcheck;
    223 
    224 
    225 /*------------------------------------------------------------*/
    226 /*--- Helpers for constructing IR.                         ---*/
    227 /*------------------------------------------------------------*/
    228 
/* Generate a new temporary of the given type. */
static IRTemp newTemp ( IRType ty )
{
   vassert(isPlausibleIRType(ty));
   return newIRTemp( irsb->tyenv, ty );
}

/* Add a statement to the list held by "irsb". */
static void stmt ( IRStmt* st )
{
   addStmtToIRSB( irsb, st );
}

/* Generate a statement "dst := e". */
static void assign ( IRTemp dst, IRExpr* e )
{
   stmt( IRStmt_WrTmp(dst, e) );
}

/* Build a unary-operator expression node. */
static IRExpr* unop ( IROp op, IRExpr* a )
{
   return IRExpr_Unop(op, a);
}

/* Build a binary-operator expression node. */
static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
{
   return IRExpr_Binop(op, a1, a2);
}

/* Build a ternary-operator expression node. */
static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
{
   return IRExpr_Triop(op, a1, a2, a3);
}

/* Build an expression which reads the given temporary. */
static IRExpr* mkexpr ( IRTemp tmp )
{
   return IRExpr_RdTmp(tmp);
}
    267 
    268 static IRExpr* mkU8 ( ULong i )
    269 {
    270    vassert(i < 256);
    271    return IRExpr_Const(IRConst_U8( (UChar)i ));
    272 }
    273 
    274 static IRExpr* mkU16 ( ULong i )
    275 {
    276    vassert(i < 0x10000ULL);
    277    return IRExpr_Const(IRConst_U16( (UShort)i ));
    278 }
    279 
    280 static IRExpr* mkU32 ( ULong i )
    281 {
    282    vassert(i < 0x100000000ULL);
    283    return IRExpr_Const(IRConst_U32( (UInt)i ));
    284 }
    285 
    286 static IRExpr* mkU64 ( ULong i )
    287 {
    288    return IRExpr_Const(IRConst_U64(i));
    289 }
    290 
    291 static IRExpr* mkU ( IRType ty, ULong i )
    292 {
    293    switch (ty) {
    294       case Ity_I8:  return mkU8(i);
    295       case Ity_I16: return mkU16(i);
    296       case Ity_I32: return mkU32(i);
    297       case Ity_I64: return mkU64(i);
    298       default: vpanic("mkU(amd64)");
    299    }
    300 }
    301 
/* Emit a little-endian store of 'data' to address 'addr'. */
static void storeLE ( IRExpr* addr, IRExpr* data )
{
   stmt( IRStmt_Store(Iend_LE, addr, data) );
}

/* Build a little-endian load of type 'ty' from address 'addr'. */
static IRExpr* loadLE ( IRType ty, IRExpr* addr )
{
   return IRExpr_Load(Iend_LE, ty, addr);
}
    311 
    312 static IROp mkSizedOp ( IRType ty, IROp op8 )
    313 {
    314    vassert(op8 == Iop_Add8 || op8 == Iop_Sub8
    315            || op8 == Iop_Mul8
    316            || op8 == Iop_Or8 || op8 == Iop_And8 || op8 == Iop_Xor8
    317            || op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8
    318            || op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8
    319            || op8 == Iop_CasCmpNE8
    320            || op8 == Iop_Not8 );
    321    switch (ty) {
    322       case Ity_I8:  return 0 +op8;
    323       case Ity_I16: return 1 +op8;
    324       case Ity_I32: return 2 +op8;
    325       case Ity_I64: return 3 +op8;
    326       default: vpanic("mkSizedOp(amd64)");
    327    }
    328 }
    329 
    330 static
    331 IRExpr* doScalarWidening ( Int szSmall, Int szBig, Bool signd, IRExpr* src )
    332 {
    333    if (szSmall == 1 && szBig == 4) {
    334       return unop(signd ? Iop_8Sto32 : Iop_8Uto32, src);
    335    }
    336    if (szSmall == 1 && szBig == 2) {
    337       return unop(signd ? Iop_8Sto16 : Iop_8Uto16, src);
    338    }
    339    if (szSmall == 2 && szBig == 4) {
    340       return unop(signd ? Iop_16Sto32 : Iop_16Uto32, src);
    341    }
    342    if (szSmall == 1 && szBig == 8 && !signd) {
    343       return unop(Iop_8Uto64, src);
    344    }
    345    if (szSmall == 1 && szBig == 8 && signd) {
    346       return unop(Iop_8Sto64, src);
    347    }
    348    if (szSmall == 2 && szBig == 8 && !signd) {
    349       return unop(Iop_16Uto64, src);
    350    }
    351    if (szSmall == 2 && szBig == 8 && signd) {
    352       return unop(Iop_16Sto64, src);
    353    }
    354    vpanic("doScalarWidening(amd64)");
    355 }
    356 
    357 
    358 
    359 /*------------------------------------------------------------*/
    360 /*--- Debugging output                                     ---*/
    361 /*------------------------------------------------------------*/
    362 
/* Bomb out if we can't handle something.  'str' names the missing
   feature; vpanic does not return. */
__attribute__ ((noreturn))
static void unimplemented ( HChar* str )
{
   vex_printf("amd64toIR: unimplemented feature\n");
   vpanic(str);
}
    370 
    371 #define DIP(format, args...)           \
    372    if (vex_traceflags & VEX_TRACE_FE)  \
    373       vex_printf(format, ## args)
    374 
    375 #define DIS(buf, format, args...)      \
    376    if (vex_traceflags & VEX_TRACE_FE)  \
    377       vex_sprintf(buf, format, ## args)
    378 
    379 
    380 /*------------------------------------------------------------*/
    381 /*--- Offsets of various parts of the amd64 guest state.   ---*/
    382 /*------------------------------------------------------------*/
    383 
    384 #define OFFB_RAX       offsetof(VexGuestAMD64State,guest_RAX)
    385 #define OFFB_RBX       offsetof(VexGuestAMD64State,guest_RBX)
    386 #define OFFB_RCX       offsetof(VexGuestAMD64State,guest_RCX)
    387 #define OFFB_RDX       offsetof(VexGuestAMD64State,guest_RDX)
    388 #define OFFB_RSP       offsetof(VexGuestAMD64State,guest_RSP)
    389 #define OFFB_RBP       offsetof(VexGuestAMD64State,guest_RBP)
    390 #define OFFB_RSI       offsetof(VexGuestAMD64State,guest_RSI)
    391 #define OFFB_RDI       offsetof(VexGuestAMD64State,guest_RDI)
    392 #define OFFB_R8        offsetof(VexGuestAMD64State,guest_R8)
    393 #define OFFB_R9        offsetof(VexGuestAMD64State,guest_R9)
    394 #define OFFB_R10       offsetof(VexGuestAMD64State,guest_R10)
    395 #define OFFB_R11       offsetof(VexGuestAMD64State,guest_R11)
    396 #define OFFB_R12       offsetof(VexGuestAMD64State,guest_R12)
    397 #define OFFB_R13       offsetof(VexGuestAMD64State,guest_R13)
    398 #define OFFB_R14       offsetof(VexGuestAMD64State,guest_R14)
    399 #define OFFB_R15       offsetof(VexGuestAMD64State,guest_R15)
    400 
    401 #define OFFB_RIP       offsetof(VexGuestAMD64State,guest_RIP)
    402 
    403 #define OFFB_FS_ZERO   offsetof(VexGuestAMD64State,guest_FS_ZERO)
    404 #define OFFB_GS_0x60   offsetof(VexGuestAMD64State,guest_GS_0x60)
    405 
    406 #define OFFB_CC_OP     offsetof(VexGuestAMD64State,guest_CC_OP)
    407 #define OFFB_CC_DEP1   offsetof(VexGuestAMD64State,guest_CC_DEP1)
    408 #define OFFB_CC_DEP2   offsetof(VexGuestAMD64State,guest_CC_DEP2)
    409 #define OFFB_CC_NDEP   offsetof(VexGuestAMD64State,guest_CC_NDEP)
    410 
    411 #define OFFB_FPREGS    offsetof(VexGuestAMD64State,guest_FPREG[0])
    412 #define OFFB_FPTAGS    offsetof(VexGuestAMD64State,guest_FPTAG[0])
    413 #define OFFB_DFLAG     offsetof(VexGuestAMD64State,guest_DFLAG)
    414 #define OFFB_ACFLAG    offsetof(VexGuestAMD64State,guest_ACFLAG)
    415 #define OFFB_IDFLAG    offsetof(VexGuestAMD64State,guest_IDFLAG)
    416 #define OFFB_FTOP      offsetof(VexGuestAMD64State,guest_FTOP)
    417 #define OFFB_FC3210    offsetof(VexGuestAMD64State,guest_FC3210)
    418 #define OFFB_FPROUND   offsetof(VexGuestAMD64State,guest_FPROUND)
    419 
    420 #define OFFB_SSEROUND  offsetof(VexGuestAMD64State,guest_SSEROUND)
    421 #define OFFB_YMM0      offsetof(VexGuestAMD64State,guest_YMM0)
    422 #define OFFB_YMM1      offsetof(VexGuestAMD64State,guest_YMM1)
    423 #define OFFB_YMM2      offsetof(VexGuestAMD64State,guest_YMM2)
    424 #define OFFB_YMM3      offsetof(VexGuestAMD64State,guest_YMM3)
    425 #define OFFB_YMM4      offsetof(VexGuestAMD64State,guest_YMM4)
    426 #define OFFB_YMM5      offsetof(VexGuestAMD64State,guest_YMM5)
    427 #define OFFB_YMM6      offsetof(VexGuestAMD64State,guest_YMM6)
    428 #define OFFB_YMM7      offsetof(VexGuestAMD64State,guest_YMM7)
    429 #define OFFB_YMM8      offsetof(VexGuestAMD64State,guest_YMM8)
    430 #define OFFB_YMM9      offsetof(VexGuestAMD64State,guest_YMM9)
    431 #define OFFB_YMM10     offsetof(VexGuestAMD64State,guest_YMM10)
    432 #define OFFB_YMM11     offsetof(VexGuestAMD64State,guest_YMM11)
    433 #define OFFB_YMM12     offsetof(VexGuestAMD64State,guest_YMM12)
    434 #define OFFB_YMM13     offsetof(VexGuestAMD64State,guest_YMM13)
    435 #define OFFB_YMM14     offsetof(VexGuestAMD64State,guest_YMM14)
    436 #define OFFB_YMM15     offsetof(VexGuestAMD64State,guest_YMM15)
    437 #define OFFB_YMM16     offsetof(VexGuestAMD64State,guest_YMM16)
    438 
    439 #define OFFB_EMWARN    offsetof(VexGuestAMD64State,guest_EMWARN)
    440 #define OFFB_TISTART   offsetof(VexGuestAMD64State,guest_TISTART)
    441 #define OFFB_TILEN     offsetof(VexGuestAMD64State,guest_TILEN)
    442 
    443 #define OFFB_NRADDR    offsetof(VexGuestAMD64State,guest_NRADDR)
    444 
    445 
    446 /*------------------------------------------------------------*/
    447 /*--- Helper bits and pieces for deconstructing the        ---*/
    448 /*--- amd64 insn stream.                                   ---*/
    449 /*------------------------------------------------------------*/
    450 
    451 /* This is the AMD64 register encoding -- integer regs. */
    452 #define R_RAX 0
    453 #define R_RCX 1
    454 #define R_RDX 2
    455 #define R_RBX 3
    456 #define R_RSP 4
    457 #define R_RBP 5
    458 #define R_RSI 6
    459 #define R_RDI 7
    460 #define R_R8  8
    461 #define R_R9  9
    462 #define R_R10 10
    463 #define R_R11 11
    464 #define R_R12 12
    465 #define R_R13 13
    466 #define R_R14 14
    467 #define R_R15 15
    468 
    469 /* This is the Intel register encoding -- segment regs. */
    470 #define R_ES 0
    471 #define R_CS 1
    472 #define R_SS 2
    473 #define R_DS 3
    474 #define R_FS 4
    475 #define R_GS 5
    476 
    477 
    478 /* Various simple conversions */
    479 
    480 static ULong extend_s_8to64 ( UChar x )
    481 {
    482    return (ULong)((((Long)x) << 56) >> 56);
    483 }
    484 
    485 static ULong extend_s_16to64 ( UShort x )
    486 {
    487    return (ULong)((((Long)x) << 48) >> 48);
    488 }
    489 
    490 static ULong extend_s_32to64 ( UInt x )
    491 {
    492    return (ULong)((((Long)x) << 32) >> 32);
    493 }
    494 
    495 /* Figure out whether the mod and rm parts of a modRM byte refer to a
    496    register or memory.  If so, the byte will have the form 11XXXYYY,
    497    where YYY is the register number. */
    498 inline
    499 static Bool epartIsReg ( UChar mod_reg_rm )
    500 {
    501    return toBool(0xC0 == (mod_reg_rm & 0xC0));
    502 }
    503 
/* Extract the 'g' field from a modRM byte.  This only produces 3
   bits, which is not a complete register number.  You should avoid
   this function if at all possible. */
inline
static Int gregLO3ofRM ( UChar mod_reg_rm )
{
   /* g is bits 5..3 of the modRM byte; the fourth (high) bit of the
      register number comes from REX.R and is not supplied here. */
   return (Int)( (mod_reg_rm >> 3) & 7 );
}
    512 
/* Ditto the 'e' field of a modRM byte. */
inline
static Int eregLO3ofRM ( UChar mod_reg_rm )
{
   /* e is bits 2..0; the high bit of the register number comes from
      REX.B and is not supplied here. */
   return (Int)(mod_reg_rm & 0x7);
}
    519 
    520 /* Get a 8/16/32-bit unsigned value out of the insn stream. */
    521 
    522 static inline UChar getUChar ( Long delta )
    523 {
    524    UChar v = guest_code[delta+0];
    525    return v;
    526 }
    527 
    528 static UInt getUDisp16 ( Long delta )
    529 {
    530    UInt v = guest_code[delta+1]; v <<= 8;
    531    v |= guest_code[delta+0];
    532    return v & 0xFFFF;
    533 }
    534 
    535 //.. static UInt getUDisp ( Int size, Long delta )
    536 //.. {
    537 //..    switch (size) {
    538 //..       case 4: return getUDisp32(delta);
    539 //..       case 2: return getUDisp16(delta);
    540 //..       case 1: return getUChar(delta);
    541 //..       default: vpanic("getUDisp(x86)");
    542 //..    }
    543 //..    return 0; /*notreached*/
    544 //.. }
    545 
    546 
/* Get a byte value out of the insn stream and sign-extend to 64
   bits. */
static Long getSDisp8 ( Long delta )
{
   return extend_s_8to64( guest_code[delta] );
}
    553 
    554 /* Get a 16-bit value out of the insn stream and sign-extend to 64
    555    bits. */
    556 static Long getSDisp16 ( Long delta )
    557 {
    558    UInt v = guest_code[delta+1]; v <<= 8;
    559    v |= guest_code[delta+0];
    560    return extend_s_16to64( (UShort)v );
    561 }
    562 
    563 /* Get a 32-bit value out of the insn stream and sign-extend to 64
    564    bits. */
    565 static Long getSDisp32 ( Long delta )
    566 {
    567    UInt v = guest_code[delta+3]; v <<= 8;
    568    v |= guest_code[delta+2]; v <<= 8;
    569    v |= guest_code[delta+1]; v <<= 8;
    570    v |= guest_code[delta+0];
    571    return extend_s_32to64( v );
    572 }
    573 
    574 /* Get a 64-bit value out of the insn stream. */
    575 static Long getDisp64 ( Long delta )
    576 {
    577    ULong v = 0;
    578    v |= guest_code[delta+7]; v <<= 8;
    579    v |= guest_code[delta+6]; v <<= 8;
    580    v |= guest_code[delta+5]; v <<= 8;
    581    v |= guest_code[delta+4]; v <<= 8;
    582    v |= guest_code[delta+3]; v <<= 8;
    583    v |= guest_code[delta+2]; v <<= 8;
    584    v |= guest_code[delta+1]; v <<= 8;
    585    v |= guest_code[delta+0];
    586    return v;
    587 }
    588 
    589 /* Note: because AMD64 doesn't allow 64-bit literals, it is an error
    590    if this is called with size==8.  Should not happen. */
    591 static Long getSDisp ( Int size, Long delta )
    592 {
    593    switch (size) {
    594       case 4: return getSDisp32(delta);
    595       case 2: return getSDisp16(delta);
    596       case 1: return getSDisp8(delta);
    597       default: vpanic("getSDisp(amd64)");
    598   }
    599 }
    600 
    601 static ULong mkSizeMask ( Int sz )
    602 {
    603    switch (sz) {
    604       case 1: return 0x00000000000000FFULL;
    605       case 2: return 0x000000000000FFFFULL;
    606       case 4: return 0x00000000FFFFFFFFULL;
    607       case 8: return 0xFFFFFFFFFFFFFFFFULL;
    608       default: vpanic("mkSzMask(amd64)");
    609    }
    610 }
    611 
    612 static Int imin ( Int a, Int b )
    613 {
    614    return (a < b) ? a : b;
    615 }
    616 
    617 static IRType szToITy ( Int n )
    618 {
    619    switch (n) {
    620       case 1: return Ity_I8;
    621       case 2: return Ity_I16;
    622       case 4: return Ity_I32;
    623       case 8: return Ity_I64;
    624       default: vex_printf("\nszToITy(%d)\n", n);
    625                vpanic("szToITy(amd64)");
    626    }
    627 }
    628 
    629 
    630 /*------------------------------------------------------------*/
    631 /*--- For dealing with prefixes.                           ---*/
    632 /*------------------------------------------------------------*/
    633 
    634 /* The idea is to pass around an int holding a bitmask summarising
    635    info from the prefixes seen on the current instruction, including
    636    info from the REX byte.  This info is used in various places, but
    637    most especially when making sense of register fields in
    638    instructions.
    639 
    640    The top 8 bits of the prefix are 0x55, just as a hacky way to
    641    ensure it really is a valid prefix.
    642 
    643    Things you can safely assume about a well-formed prefix:
    644    * at most one segment-override bit (CS,DS,ES,FS,GS,SS) is set.
    645    * if REX is not present then REXW,REXR,REXX,REXB will read
    646      as zero.
    647    * F2 and F3 will not both be 1.
    648 */
    649 
    650 typedef UInt  Prefix;
    651 
    652 #define PFX_ASO    (1<<0)    /* address-size override present (0x67) */
    653 #define PFX_66     (1<<1)    /* operand-size override-to-16 present (0x66) */
    654 #define PFX_REX    (1<<2)    /* REX byte present (0x40 to 0x4F) */
    655 #define PFX_REXW   (1<<3)    /* REX W bit, if REX present, else 0 */
    656 #define PFX_REXR   (1<<4)    /* REX R bit, if REX present, else 0 */
    657 #define PFX_REXX   (1<<5)    /* REX X bit, if REX present, else 0 */
    658 #define PFX_REXB   (1<<6)    /* REX B bit, if REX present, else 0 */
    659 #define PFX_LOCK   (1<<7)    /* bus LOCK prefix present (0xF0) */
#define PFX_F2     (1<<8)    /* REPNE/REPNZ prefix present (0xF2) */
#define PFX_F3     (1<<9)    /* REP/REPE/REPZ prefix present (0xF3) */
    662 #define PFX_CS     (1<<10)   /* CS segment prefix present (0x2E) */
    663 #define PFX_DS     (1<<11)   /* DS segment prefix present (0x3E) */
    664 #define PFX_ES     (1<<12)   /* ES segment prefix present (0x26) */
    665 #define PFX_FS     (1<<13)   /* FS segment prefix present (0x64) */
    666 #define PFX_GS     (1<<14)   /* GS segment prefix present (0x65) */
    667 #define PFX_SS     (1<<15)   /* SS segment prefix present (0x36) */
    668 #define PFX_VEX    (1<<16)   /* VEX prefix present (0xC4 or 0xC5) */
    669 #define PFX_VEXL   (1<<17)   /* VEX L bit, if VEX present, else 0 */
    670 /* The extra register field VEX.vvvv is encoded (after not-ing it) as
    671    PFX_VEXnV3 .. PFX_VEXnV0, so these must occupy adjacent bit
    672    positions. */
    673 #define PFX_VEXnV0 (1<<18)   /* ~VEX vvvv[0], if VEX present, else 0 */
    674 #define PFX_VEXnV1 (1<<19)   /* ~VEX vvvv[1], if VEX present, else 0 */
    675 #define PFX_VEXnV2 (1<<20)   /* ~VEX vvvv[2], if VEX present, else 0 */
    676 #define PFX_VEXnV3 (1<<21)   /* ~VEX vvvv[3], if VEX present, else 0 */
    677 
    678 
    679 #define PFX_EMPTY 0x55000000
    680 
/* Sanity check: a well-formed Prefix always carries 0x55 in its top
   byte (see PFX_EMPTY). */
static Bool IS_VALID_PFX ( Prefix pfx ) {
   return toBool((pfx & 0xFF000000) == PFX_EMPTY);
}
    684 
/* True iff a REX byte (0x40..0x4F) was present. */
static Bool haveREX ( Prefix pfx ) {
   return toBool(pfx & PFX_REX);
}
    688 
/* Extract the REX.W bit as 0 or 1; reads as 0 if no REX was present. */
static Int getRexW ( Prefix pfx ) {
   return (pfx & PFX_REXW) ? 1 : 0;
}
/* Extract the REX.R bit as 0 or 1; reads as 0 if no REX was present. */
static Int getRexR ( Prefix pfx ) {
   return (pfx & PFX_REXR) ? 1 : 0;
}
/* Extract the REX.X bit as 0 or 1; reads as 0 if no REX was present. */
static Int getRexX ( Prefix pfx ) {
   return (pfx & PFX_REXX) ? 1 : 0;
}
/* Extract the REX.B bit as 0 or 1; reads as 0 if no REX was present. */
static Int getRexB ( Prefix pfx ) {
   return (pfx & PFX_REXB) ? 1 : 0;
}
    701 
/* Check a prefix doesn't have F2 or F3 set in it, since usually that
   completely changes what instruction it really is. */
static Bool haveF2orF3 ( Prefix pfx ) {
   return toBool((pfx & (PFX_F2|PFX_F3)) > 0);
}
/* True iff the 0xF2 prefix was seen. */
static Bool haveF2 ( Prefix pfx ) {
   return toBool((pfx & PFX_F2) > 0);
}
/* True iff the 0xF3 prefix was seen. */
static Bool haveF3 ( Prefix pfx ) {
   return toBool((pfx & PFX_F3) > 0);
}
    713 
/* True iff the operand-size override (0x66) was seen. */
static Bool have66 ( Prefix pfx ) {
   return toBool((pfx & PFX_66) > 0);
}
/* True iff the address-size override (0x67) was seen. */
static Bool haveASO ( Prefix pfx ) {
   return toBool((pfx & PFX_ASO) > 0);
}
    720 
    721 /* Return True iff pfx has 66 set and F2 and F3 clear */
    722 static Bool have66noF2noF3 ( Prefix pfx )
    723 {
    724   return
    725      toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_66);
    726 }
    727 
    728 /* Return True iff pfx has F2 set and 66 and F3 clear */
    729 static Bool haveF2no66noF3 ( Prefix pfx )
    730 {
    731   return
    732      toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_F2);
    733 }
    734 
    735 /* Return True iff pfx has F3 set and 66 and F2 clear */
    736 static Bool haveF3no66noF2 ( Prefix pfx )
    737 {
    738   return
    739      toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_F3);
    740 }
    741 
    742 /* Return True iff pfx has F3 set and F2 clear */
    743 static Bool haveF3noF2 ( Prefix pfx )
    744 {
    745   return
    746      toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F3);
    747 }
    748 
    749 /* Return True iff pfx has F2 set and F3 clear */
    750 static Bool haveF2noF3 ( Prefix pfx )
    751 {
    752   return
    753      toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F2);
    754 }
    755 
    756 /* Return True iff pfx has 66, F2 and F3 clear */
    757 static Bool haveNo66noF2noF3 ( Prefix pfx )
    758 {
    759   return
    760      toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == 0);
    761 }
    762 
    763 /* Return True iff pfx has any of 66, F2 and F3 set */
    764 static Bool have66orF2orF3 ( Prefix pfx )
    765 {
    766   return toBool( ! haveNo66noF2noF3(pfx) );
    767 }
    768 
    769 /* Return True iff pfx has 66 or F2 set */
    770 static Bool have66orF2 ( Prefix pfx )
    771 {
    772    return toBool((pfx & (PFX_66|PFX_F2)) > 0);
    773 }
    774 
/* Clear all the segment-override bits in a prefix.  The 0x55 sanity
   marker in the top byte is unaffected. */
static Prefix clearSegBits ( Prefix p )
{
   return
      p & ~(PFX_CS | PFX_DS | PFX_ES | PFX_FS | PFX_GS | PFX_SS);
}
    781 
/* Get the (inverted, hence back to "normal") VEX.vvvv field.  The
   PFX_VEXnV0..PFX_VEXnV3 bits are adjacent, so dividing by
   PFX_VEXnV0 shifts them down into bits 3..0. */
static UInt getVexNvvvv ( Prefix pfx ) {
   UInt r = (UInt)pfx;
   r /= (UInt)PFX_VEXnV0; /* pray this turns into a shift */
   return r & 0xF;
}
    788 
/* True iff a VEX prefix (0xC4 or 0xC5) was present. */
static Bool haveVEX ( Prefix pfx ) {
   return toBool(pfx & PFX_VEX);
}

/* Extract the VEX.L bit as 0 or 1; reads as 0 if no VEX was present. */
static Int getVexL ( Prefix pfx ) {
   return (pfx & PFX_VEXL) ? 1 : 0;
}
    796 
    797 
    798 /*------------------------------------------------------------*/
    799 /*--- For dealing with escapes                             ---*/
    800 /*------------------------------------------------------------*/
    801 
    802 
    803 /* Escapes come after the prefixes, but before the primary opcode
    804    byte.  They escape the primary opcode byte into a bigger space.
    805    The 0xF0000000 isn't significant, except so as to make it not
    806    overlap valid Prefix values, for sanity checking.
    807 */
    808 
/* Which escape sequence (if any) preceded the primary opcode byte.
   The 0xF0000000 base keeps these values disjoint from valid Prefix
   values, for sanity checking (see comment above). */
typedef
   enum {
      ESC_NONE=0xF0000000, // none
      ESC_0F,              // 0F
      ESC_0F38,            // 0F 38
      ESC_0F3A             // 0F 3A
   }
   Escape;
    817 
    818 
    819 /*------------------------------------------------------------*/
    820 /*--- For dealing with integer registers                   ---*/
    821 /*------------------------------------------------------------*/
    822 
    823 /* This is somewhat complex.  The rules are:
    824 
    825    For 64, 32 and 16 bit register references, the e or g fields in the
    826    modrm bytes supply the low 3 bits of the register number.  The
    827    fourth (most-significant) bit of the register number is supplied by
    828    the REX byte, if it is present; else that bit is taken to be zero.
    829 
    830    The REX.R bit supplies the high bit corresponding to the g register
    831    field, and the REX.B bit supplies the high bit corresponding to the
    832    e register field (when the mod part of modrm indicates that modrm's
    833    e component refers to a register and not to memory).
    834 
    835    The REX.X bit supplies a high register bit for certain registers
    836    in SIB address modes, and is generally rarely used.
    837 
    838    For 8 bit register references, the presence of the REX byte itself
    839    has significance.  If there is no REX present, then the 3-bit
    840    number extracted from the modrm e or g field is treated as an index
    841    into the sequence %al %cl %dl %bl %ah %ch %dh %bh -- that is, the
    842    old x86 encoding scheme.
    843 
    844    But if there is a REX present, the register reference is
    845    interpreted in the same way as for 64/32/16-bit references: a high
    846    bit is extracted from REX, giving a 4-bit number, and the denoted
    847    register is the lowest 8 bits of the 16 integer registers denoted
    848    by the number.  In particular, values 3 through 7 of this sequence
    849    do not refer to %ah %ch %dh %bh but instead to the lowest 8 bits of
    850    %rsp %rbp %rsi %rdi.
    851 
    852    The REX.W bit has no bearing at all on register numbers.  Instead
    853    its presence indicates that the operand size is to be overridden
    854    from its default value (32 bits) to 64 bits instead.  This is in
    855    the same fashion that an 0x66 prefix indicates the operand size is
    856    to be overridden from 32 bits down to 16 bits.  When both REX.W and
    857    0x66 are present there is a conflict, and REX.W takes precedence.
    858 
    859    Rather than try to handle this complexity using a single huge
    860    function, several smaller ones are provided.  The aim is to make it
    861    as difficult as possible to screw up register decoding in a subtle
    862    and hard-to-track-down way.
    863 
    864    Because these routines fish around in the host's memory (that is,
    865    in the guest state area) for sub-parts of guest registers, their
    866    correctness depends on the host's endianness.  So far these
    867    routines only work for little-endian hosts.  Those for which
    868    endianness is important have assertions to ensure sanity.
    869 */
    870 
    871 
    872 /* About the simplest question you can ask: where do the 64-bit
    873    integer registers live (in the guest state) ? */
    874 
static Int integerGuestReg64Offset ( UInt reg )
{
   /* Map an architectural register number (R_RAX .. R_R15, i.e.
      0 .. 15) to the byte offset of that 64-bit register in the
      guest state.  Panics on out-of-range numbers. */
   switch (reg) {
      case R_RAX: return OFFB_RAX;
      case R_RCX: return OFFB_RCX;
      case R_RDX: return OFFB_RDX;
      case R_RBX: return OFFB_RBX;
      case R_RSP: return OFFB_RSP;
      case R_RBP: return OFFB_RBP;
      case R_RSI: return OFFB_RSI;
      case R_RDI: return OFFB_RDI;
      case R_R8:  return OFFB_R8;
      case R_R9:  return OFFB_R9;
      case R_R10: return OFFB_R10;
      case R_R11: return OFFB_R11;
      case R_R12: return OFFB_R12;
      case R_R13: return OFFB_R13;
      case R_R14: return OFFB_R14;
      case R_R15: return OFFB_R15;
      default: vpanic("integerGuestReg64Offset(amd64)");
   }
}
    897 
    898 
    899 /* Produce the name of an integer register, for printing purposes.
    900    reg is a number in the range 0 .. 15 that has been generated from a
    901    3-bit reg-field number and a REX extension bit.  irregular denotes
    902    the case where sz==1 and no REX byte is present. */
    903 
static
HChar* nameIReg ( Int sz, UInt reg, Bool irregular )
{
   /* Lookup tables indexed by the 4-bit register number, one per
      operand size.  ireg8_irregular covers the no-REX byte encoding,
      where numbers 4..7 denote %ah %ch %dh %bh rather than
      %spl %bpl %sil %dil. */
   static HChar* ireg64_names[16]
     = { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
         "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" };
   static HChar* ireg32_names[16]
     = { "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
         "%r8d", "%r9d", "%r10d","%r11d","%r12d","%r13d","%r14d","%r15d" };
   static HChar* ireg16_names[16]
     = { "%ax",  "%cx",  "%dx",  "%bx",  "%sp",  "%bp",  "%si",  "%di",
         "%r8w", "%r9w", "%r10w","%r11w","%r12w","%r13w","%r14w","%r15w" };
   static HChar* ireg8_names[16]
     = { "%al",  "%cl",  "%dl",  "%bl",  "%spl", "%bpl", "%sil", "%dil",
         "%r8b", "%r9b", "%r10b","%r11b","%r12b","%r13b","%r14b","%r15b" };
   static HChar* ireg8_irregular[8]
     = { "%al", "%cl", "%dl", "%bl", "%ah", "%ch", "%dh", "%bh" };

   /* 'irregular' is only meaningful (and only allowed) for sz == 1. */
   vassert(reg < 16);
   if (sz == 1) {
      if (irregular)
         vassert(reg < 8);
   } else {
      vassert(irregular == False);
   }

   switch (sz) {
      case 8: return ireg64_names[reg];
      case 4: return ireg32_names[reg];
      case 2: return ireg16_names[reg];
      case 1: if (irregular) {
                 return ireg8_irregular[reg];
              } else {
                 return ireg8_names[reg];
              }
      default: vpanic("nameIReg(amd64)");
   }
}
    942 
    943 /* Using the same argument conventions as nameIReg, produce the
    944    guest state offset of an integer register. */
    945 
static
Int offsetIReg ( Int sz, UInt reg, Bool irregular )
{
   /* 'irregular' is only meaningful (and only allowed) for sz == 1. */
   vassert(reg < 16);
   if (sz == 1) {
      if (irregular)
         vassert(reg < 8);
   } else {
      vassert(irregular == False);
   }

   /* Deal with irregular case -- sz==1 and no REX present */
   if (sz == 1 && irregular) {
      /* %ah %ch %dh %bh are byte 1 of %rax %rcx %rdx %rbx, hence the
         "1+" -- correct for little-endian hosts only. */
      switch (reg) {
         case R_RSP: return 1+ OFFB_RAX;
         case R_RBP: return 1+ OFFB_RCX;
         case R_RSI: return 1+ OFFB_RDX;
         case R_RDI: return 1+ OFFB_RBX;
         default:    break; /* use the normal case */
      }
   }

   /* Normal case */
   return integerGuestReg64Offset(reg);
}
    971 
    972 
    973 /* Read the %CL register :: Ity_I8, for shift/rotate operations. */
    974 
static IRExpr* getIRegCL ( void )
{
   /* %cl is the least-significant byte of %rcx, so on a
      little-endian host it lives at offset OFFB_RCX. */
   vassert(!host_is_bigendian);
   return IRExpr_Get( OFFB_RCX, Ity_I8 );
}
    980 
    981 
    982 /* Write to the %AH register. */
    983 
static void putIRegAH ( IRExpr* e )
{
   /* %ah is byte 1 of %rax on a little-endian host.  'e' must be
      an Ity_I8 expression. */
   vassert(!host_is_bigendian);
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I8);
   stmt( IRStmt_Put( OFFB_RAX+1, e ) );
}
    990 
    991 
    992 /* Read/write various widths of %RAX, as it has various
    993    special-purpose uses. */
    994 
/* Name of the sz-byte-wide view of %rax; panics on other sizes. */
static HChar* nameIRegRAX ( Int sz )
{
   switch (sz) {
      case 1: return "%al";
      case 2: return "%ax";
      case 4: return "%eax";
      case 8: return "%rax";
      default: vpanic("nameIRegRAX(amd64)");
   }
}
   1005 
/* Read the low sz bytes of %rax, at the matching IR type. */
static IRExpr* getIRegRAX ( Int sz )
{
   vassert(!host_is_bigendian);
   switch (sz) {
      case 1: return IRExpr_Get( OFFB_RAX, Ity_I8 );
      case 2: return IRExpr_Get( OFFB_RAX, Ity_I16 );
      /* sz==4 reads the whole 64-bit register and narrows, rather
         than doing a 32-bit Get. */
      case 4: return unop(Iop_64to32, IRExpr_Get( OFFB_RAX, Ity_I64 ));
      case 8: return IRExpr_Get( OFFB_RAX, Ity_I64 );
      default: vpanic("getIRegRAX(amd64)");
   }
}
   1017 
/* Write the low sz bytes of %rax.  A 32-bit write is zero-extended
   to 64 bits (amd64 semantics); 16- and 8-bit writes touch only the
   low bytes. */
static void putIRegRAX ( Int sz, IRExpr* e )
{
   IRType ty = typeOfIRExpr(irsb->tyenv, e);
   vassert(!host_is_bigendian);
   switch (sz) {
      case 8: vassert(ty == Ity_I64);
              stmt( IRStmt_Put( OFFB_RAX, e ));
              break;
      case 4: vassert(ty == Ity_I32);
              stmt( IRStmt_Put( OFFB_RAX, unop(Iop_32Uto64,e) ));
              break;
      case 2: vassert(ty == Ity_I16);
              stmt( IRStmt_Put( OFFB_RAX, e ));
              break;
      case 1: vassert(ty == Ity_I8);
              stmt( IRStmt_Put( OFFB_RAX, e ));
              break;
      default: vpanic("putIRegRAX(amd64)");
   }
}
   1038 
   1039 
   1040 /* Read/write various widths of %RDX, as it has various
   1041    special-purpose uses. */
   1042 
/* Name of the sz-byte-wide view of %rdx; panics on other sizes. */
static HChar* nameIRegRDX ( Int sz )
{
   switch (sz) {
      case 1: return "%dl";
      case 2: return "%dx";
      case 4: return "%edx";
      case 8: return "%rdx";
      default: vpanic("nameIRegRDX(amd64)");
   }
}
   1053 
/* Read the low sz bytes of %rdx, at the matching IR type. */
static IRExpr* getIRegRDX ( Int sz )
{
   vassert(!host_is_bigendian);
   switch (sz) {
      case 1: return IRExpr_Get( OFFB_RDX, Ity_I8 );
      case 2: return IRExpr_Get( OFFB_RDX, Ity_I16 );
      /* sz==4 reads the whole 64-bit register and narrows. */
      case 4: return unop(Iop_64to32, IRExpr_Get( OFFB_RDX, Ity_I64 ));
      case 8: return IRExpr_Get( OFFB_RDX, Ity_I64 );
      default: vpanic("getIRegRDX(amd64)");
   }
}
   1065 
/* Write the low sz bytes of %rdx; 32-bit writes zero-extend. */
static void putIRegRDX ( Int sz, IRExpr* e )
{
   vassert(!host_is_bigendian);
   vassert(typeOfIRExpr(irsb->tyenv, e) == szToITy(sz));
   switch (sz) {
      case 8: stmt( IRStmt_Put( OFFB_RDX, e ));
              break;
      case 4: stmt( IRStmt_Put( OFFB_RDX, unop(Iop_32Uto64,e) ));
              break;
      case 2: stmt( IRStmt_Put( OFFB_RDX, e ));
              break;
      case 1: stmt( IRStmt_Put( OFFB_RDX, e ));
              break;
      default: vpanic("putIRegRDX(amd64)");
   }
}
   1082 
   1083 
   1084 /* Simplistic functions to deal with the integer registers as a
   1085    straightforward bank of 16 64-bit regs. */
   1086 
/* Read integer register 'regno' (0..15) as a 64-bit value. */
static IRExpr* getIReg64 ( UInt regno )
{
   return IRExpr_Get( integerGuestReg64Offset(regno),
                      Ity_I64 );
}
   1092 
/* Write all 64 bits of integer register 'regno' (0..15). */
static void putIReg64 ( UInt regno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   stmt( IRStmt_Put( integerGuestReg64Offset(regno), e ) );
}
   1098 
/* Printable name of the full 64-bit register 'regno'. */
static HChar* nameIReg64 ( UInt regno )
{
   return nameIReg( 8, regno, False );
}
   1103 
   1104 
   1105 /* Simplistic functions to deal with the lower halves of integer
   1106    registers as a straightforward bank of 16 32-bit regs. */
   1107 
/* Read the low 32 bits of integer register 'regno', by fetching the
   whole 64-bit register and narrowing. */
static IRExpr* getIReg32 ( UInt regno )
{
   vassert(!host_is_bigendian);
   return unop(Iop_64to32,
               IRExpr_Get( integerGuestReg64Offset(regno),
                           Ity_I64 ));
}
   1115 
/* Write the low 32 bits of integer register 'regno', zero-extending
   to 64 bits per amd64 semantics. */
static void putIReg32 ( UInt regno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   stmt( IRStmt_Put( integerGuestReg64Offset(regno),
                     unop(Iop_32Uto64,e) ) );
}
   1122 
/* Printable name of the 32-bit view of register 'regno'. */
static HChar* nameIReg32 ( UInt regno )
{
   return nameIReg( 4, regno, False );
}
   1127 
   1128 
   1129 /* Simplistic functions to deal with the lower quarters of integer
   1130    registers as a straightforward bank of 16 16-bit regs. */
   1131 
/* Read the low 16 bits of integer register 'regno' directly (no
   narrowing needed -- a 16-bit Get at the register's offset). */
static IRExpr* getIReg16 ( UInt regno )
{
   vassert(!host_is_bigendian);
   return IRExpr_Get( integerGuestReg64Offset(regno),
                      Ity_I16 );
}
   1138 
/* Write the low 16 bits of integer register 'regno'.  NOTE(review):
   unlike putIReg32, this widens with Iop_16Uto64 and so writes all
   64 bits, zeroing the upper 48 -- confirm callers expect that
   rather than a partial 16-bit update. */
static void putIReg16 ( UInt regno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
   stmt( IRStmt_Put( integerGuestReg64Offset(regno),
                     unop(Iop_16Uto64,e) ) );
}
   1145 
/* Printable name of the 16-bit view of register 'regno'. */
static HChar* nameIReg16 ( UInt regno )
{
   return nameIReg( 2, regno, False );
}
   1150 
   1151 
   1152 /* Sometimes what we know is a 3-bit register number, a REX byte, and
   1153    which field of the REX byte is to be used to extend to a 4-bit
   1154    number.  These functions cater for that situation.
   1155 */
/* Read the 64-bit register whose low 3 bits are 'lo3bits' and whose
   high bit comes from REX.X. */
static IRExpr* getIReg64rexX ( Prefix pfx, UInt lo3bits )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   return getIReg64( lo3bits | (getRexX(pfx) << 3) );
}
   1162 
/* Name of the 64-bit register selected by lo3bits extended with
   REX.X. */
static HChar* nameIReg64rexX ( Prefix pfx, UInt lo3bits )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   return nameIReg( 8, lo3bits | (getRexX(pfx) << 3), False );
}
   1169 
/* Name of the sz-byte register selected by lo3bits extended with
   REX.B.  The sz==1/no-REX combination triggers the irregular
   %ah..%bh naming. */
static HChar* nameIRegRexB ( Int sz, Prefix pfx, UInt lo3bits )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   return nameIReg( sz, lo3bits | (getRexB(pfx) << 3),
                        toBool(sz==1 && !haveREX(pfx)) );
}
   1178 
   1179 static IRExpr* getIRegRexB ( Int sz, Prefix pfx, UInt lo3bits )
   1180 {
   1181    vassert(lo3bits < 8);
   1182    vassert(IS_VALID_PFX(pfx));
   1183    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   1184    if (sz == 4) {
   1185       sz = 8;
   1186       return unop(Iop_64to32,
   1187                   IRExpr_Get(
   1188                      offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
   1189                                      toBool(sz==1 && !haveREX(pfx)) ),
   1190                      szToITy(sz)
   1191                  )
   1192              );
   1193    } else {
   1194       return IRExpr_Get(
   1195                 offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
   1196                                 toBool(sz==1 && !haveREX(pfx)) ),
   1197                 szToITy(sz)
   1198              );
   1199    }
   1200 }
   1201 
/* Write the sz-byte register selected by lo3bits extended with
   REX.B.  32-bit writes are zero-extended to 64 bits. */
static void putIRegRexB ( Int sz, Prefix pfx, UInt lo3bits, IRExpr* e )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   vassert(typeOfIRExpr(irsb->tyenv, e) == szToITy(sz));
   stmt( IRStmt_Put(
            offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
                            toBool(sz==1 && !haveREX(pfx)) ),
            sz==4 ? unop(Iop_32Uto64,e) : e
   ));
}
   1214 
   1215 
   1216 /* Functions for getting register numbers from modrm bytes and REX
   1217    when we don't have to consider the complexities of integer subreg
   1218    accesses.
   1219 */
   1220 /* Extract the g reg field from a modRM byte, and augment it using the
   1221    REX.R bit from the supplied REX byte.  The R bit usually is
   1222    associated with the g register field.
   1223 */
   1224 static UInt gregOfRexRM ( Prefix pfx, UChar mod_reg_rm )
   1225 {
   1226    Int reg = (Int)( (mod_reg_rm >> 3) & 7 );
   1227    reg += (pfx & PFX_REXR) ? 8 : 0;
   1228    return reg;
   1229 }
   1230 
   1231 /* Extract the e reg field from a modRM byte, and augment it using the
   1232    REX.B bit from the supplied REX byte.  The B bit usually is
   1233    associated with the e register field (when modrm indicates e is a
   1234    register, that is).
   1235 */
   1236 static UInt eregOfRexRM ( Prefix pfx, UChar mod_reg_rm )
   1237 {
   1238    Int rm;
   1239    vassert(epartIsReg(mod_reg_rm));
   1240    rm = (Int)(mod_reg_rm & 0x7);
   1241    rm += (pfx & PFX_REXB) ? 8 : 0;
   1242    return rm;
   1243 }
   1244 
   1245 
   1246 /* General functions for dealing with integer register access. */
   1247 
   1248 /* Produce the guest state offset for a reference to the 'g' register
   1249    field in a modrm byte, taking into account REX (or its absence),
   1250    and the size of the access.
   1251 */
/* Guest-state offset of the 'g' register denoted by modrm+REX, for
   an access of sz bytes. */
static UInt offsetIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   UInt reg;
   vassert(!host_is_bigendian);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   reg = gregOfRexRM( pfx, mod_reg_rm );
   return offsetIReg( sz, reg, toBool(sz == 1 && !haveREX(pfx)) );
}
   1261 
/* Read the 'g' register denoted by modrm+REX.  For sz==4 the whole
   64-bit register is read and narrowed to 32 bits. */
static
IRExpr* getIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   if (sz == 4) {
      sz = 8;
      return unop(Iop_64to32,
                  IRExpr_Get( offsetIRegG( sz, pfx, mod_reg_rm ),
                              szToITy(sz) ));
   } else {
      return IRExpr_Get( offsetIRegG( sz, pfx, mod_reg_rm ),
                         szToITy(sz) );
   }
}
   1275 
/* Write the 'g' register denoted by modrm+REX.  32-bit writes are
   zero-extended to 64 bits. */
static
void putIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   if (sz == 4) {
      e = unop(Iop_32Uto64,e);
   }
   stmt( IRStmt_Put( offsetIRegG( sz, pfx, mod_reg_rm ), e ) );
}
   1285 
/* Printable name of the 'g' register denoted by modrm+REX. */
static
HChar* nameIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   return nameIReg( sz, gregOfRexRM(pfx,mod_reg_rm),
                        toBool(sz==1 && !haveREX(pfx)) );
}
   1292 
   1293 
   1294 /* Produce the guest state offset for a reference to the 'e' register
   1295    field in a modrm byte, taking into account REX (or its absence),
   1296    and the size of the access.  eregOfRexRM will assert if mod_reg_rm
   1297    denotes a memory access rather than a register access.
   1298 */
/* Guest-state offset of the 'e' register denoted by modrm+REX, for
   an access of sz bytes.  Asserts (via eregOfRexRM) that modrm
   denotes a register, not memory. */
static UInt offsetIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   UInt reg;
   vassert(!host_is_bigendian);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   reg = eregOfRexRM( pfx, mod_reg_rm );
   return offsetIReg( sz, reg, toBool(sz == 1 && !haveREX(pfx)) );
}
   1308 
/* Read the 'e' register denoted by modrm+REX.  For sz==4 the whole
   64-bit register is read and narrowed to 32 bits. */
static
IRExpr* getIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   if (sz == 4) {
      sz = 8;
      return unop(Iop_64to32,
                  IRExpr_Get( offsetIRegE( sz, pfx, mod_reg_rm ),
                              szToITy(sz) ));
   } else {
      return IRExpr_Get( offsetIRegE( sz, pfx, mod_reg_rm ),
                         szToITy(sz) );
   }
}
   1322 
/* Write the 'e' register denoted by modrm+REX.  32-bit writes are
   zero-extended to 64 bits. */
static
void putIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   if (sz == 4) {
      e = unop(Iop_32Uto64,e);
   }
   stmt( IRStmt_Put( offsetIRegE( sz, pfx, mod_reg_rm ), e ) );
}
   1332 
/* Printable name of the 'e' register denoted by modrm+REX. */
static
HChar* nameIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   return nameIReg( sz, eregOfRexRM(pfx,mod_reg_rm),
                        toBool(sz==1 && !haveREX(pfx)) );
}
   1339 
   1340 
   1341 /*------------------------------------------------------------*/
   1342 /*--- For dealing with XMM registers                       ---*/
   1343 /*------------------------------------------------------------*/
   1344 
/* Map a YMM register number (0..15) to its guest-state byte offset;
   panics on out-of-range numbers. */
static Int ymmGuestRegOffset ( UInt ymmreg )
{
   switch (ymmreg) {
      case 0:  return OFFB_YMM0;
      case 1:  return OFFB_YMM1;
      case 2:  return OFFB_YMM2;
      case 3:  return OFFB_YMM3;
      case 4:  return OFFB_YMM4;
      case 5:  return OFFB_YMM5;
      case 6:  return OFFB_YMM6;
      case 7:  return OFFB_YMM7;
      case 8:  return OFFB_YMM8;
      case 9:  return OFFB_YMM9;
      case 10: return OFFB_YMM10;
      case 11: return OFFB_YMM11;
      case 12: return OFFB_YMM12;
      case 13: return OFFB_YMM13;
      case 14: return OFFB_YMM14;
      case 15: return OFFB_YMM15;
      default: vpanic("ymmGuestRegOffset(amd64)");
   }
}
   1367 
/* XMM reg N occupies the low-address (least-significant, on a
   little-endian host) half of YMM reg N, so it shares the same
   offset. */
static Int xmmGuestRegOffset ( UInt xmmreg )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   return ymmGuestRegOffset( xmmreg );
}
   1374 
   1375 /* Lanes of vector registers are always numbered from zero being the
   1376    least significant lane (rightmost in the register).  */
   1377 
/* Offset of 16-bit lane 'laneno' (0..7, lane 0 least significant)
   of an XMM register. */
static Int xmmGuestRegLane16offset ( UInt xmmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 8);
   return xmmGuestRegOffset( xmmreg ) + 2 * laneno;
}
   1385 
/* Offset of 32-bit lane 'laneno' (0..3) of an XMM register. */
static Int xmmGuestRegLane32offset ( UInt xmmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 4);
   return xmmGuestRegOffset( xmmreg ) + 4 * laneno;
}
   1393 
/* Offset of 64-bit lane 'laneno' (0..1) of an XMM register. */
static Int xmmGuestRegLane64offset ( UInt xmmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 2);
   return xmmGuestRegOffset( xmmreg ) + 8 * laneno;
}
   1401 
/* Offset of 128-bit lane 'laneno' (0..1) of a YMM register. */
static Int ymmGuestRegLane128offset ( UInt ymmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 2);
   return ymmGuestRegOffset( ymmreg ) + 16 * laneno;
}
   1409 
/* Offset of 64-bit lane 'laneno' (0..3) of a YMM register. */
static Int ymmGuestRegLane64offset ( UInt ymmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 4);
   return ymmGuestRegOffset( ymmreg ) + 8 * laneno;
}
   1417 
/* Offset of 32-bit lane 'laneno' (0..7) of a YMM register. */
static Int ymmGuestRegLane32offset ( UInt ymmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 8);
   return ymmGuestRegOffset( ymmreg ) + 4 * laneno;
}
   1425 
/* Read a whole XMM register as a V128. */
static IRExpr* getXMMReg ( UInt xmmreg )
{
   return IRExpr_Get( xmmGuestRegOffset(xmmreg), Ity_V128 );
}
   1430 
/* Read 64-bit lane 'laneno' of an XMM register as Ity_I64. */
static IRExpr* getXMMRegLane64 ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_I64 );
}
   1435 
/* Read 64-bit lane 'laneno' of an XMM register as Ity_F64. */
static IRExpr* getXMMRegLane64F ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_F64 );
}
   1440 
/* Read 32-bit lane 'laneno' of an XMM register as Ity_I32. */
static IRExpr* getXMMRegLane32 ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_I32 );
}
   1445 
/* Read 32-bit lane 'laneno' of an XMM register as Ity_F32. */
static IRExpr* getXMMRegLane32F ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_F32 );
}
   1450 
/* Read 16-bit lane 'laneno' of an XMM register as Ity_I16. */
static IRExpr* getXMMRegLane16 ( UInt xmmreg, Int laneno )
{
  return IRExpr_Get( xmmGuestRegLane16offset(xmmreg,laneno), Ity_I16 );
}
   1455 
/* Write a whole XMM register; 'e' must be a V128 expression. */
static void putXMMReg ( UInt xmmreg, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
   stmt( IRStmt_Put( xmmGuestRegOffset(xmmreg), e ) );
}
   1461 
/* Write 64-bit lane 'laneno' of an XMM register (Ity_I64 value). */
static void putXMMRegLane64 ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
}
   1467 
/* Write 64-bit lane 'laneno' of an XMM register (Ity_F64 value). */
static void putXMMRegLane64F ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
   stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
}
   1473 
/* Write 32-bit lane 'laneno' of an XMM register (Ity_F32 value). */
static void putXMMRegLane32F ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
   stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
}
   1479 
/* Write 32-bit lane 'laneno' of an XMM register (Ity_I32 value). */
static void putXMMRegLane32 ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
}
   1485 
/* Read a whole YMM register as a V256.  (Parameter is a YMM
   register number, despite the 'xmmreg' name.) */
static IRExpr* getYMMReg ( UInt xmmreg )
{
   return IRExpr_Get( ymmGuestRegOffset(xmmreg), Ity_V256 );
}
   1490 
/* Read 128-bit lane 'laneno' of a YMM register as Ity_V128. */
static IRExpr* getYMMRegLane128 ( UInt ymmreg, Int laneno )
{
   return IRExpr_Get( ymmGuestRegLane128offset(ymmreg,laneno), Ity_V128 );
}
   1495 
/* Read 64-bit lane 'laneno' of a YMM register as Ity_I64. */
static IRExpr* getYMMRegLane64 ( UInt ymmreg, Int laneno )
{
   return IRExpr_Get( ymmGuestRegLane64offset(ymmreg,laneno), Ity_I64 );
}
   1500 
/* Read 32-bit lane 'laneno' of a YMM register as Ity_I32. */
static IRExpr* getYMMRegLane32 ( UInt ymmreg, Int laneno )
{
   return IRExpr_Get( ymmGuestRegLane32offset(ymmreg,laneno), Ity_I32 );
}
   1505 
/* Write a whole YMM register; 'e' must be a V256 expression. */
static void putYMMReg ( UInt ymmreg, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V256);
   stmt( IRStmt_Put( ymmGuestRegOffset(ymmreg), e ) );
}
   1511 
/* Write 128-bit lane 'laneno' of a YMM register (Ity_V128 value). */
static void putYMMRegLane128 ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
   stmt( IRStmt_Put( ymmGuestRegLane128offset(ymmreg,laneno), e ) );
}
   1517 
/* Write 64-bit lane 'laneno' of a YMM register (Ity_F64 value). */
static void putYMMRegLane64F ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
   stmt( IRStmt_Put( ymmGuestRegLane64offset(ymmreg,laneno), e ) );
}
   1523 
/* Write 64-bit lane 'laneno' of a YMM register (Ity_I64 value). */
static void putYMMRegLane64 ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   stmt( IRStmt_Put( ymmGuestRegLane64offset(ymmreg,laneno), e ) );
}
   1529 
/* Write 32-bit lane 'laneno' of a YMM register (Ity_F32 value). */
static void putYMMRegLane32F ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
   stmt( IRStmt_Put( ymmGuestRegLane32offset(ymmreg,laneno), e ) );
}
   1535 
/* Write 32-bit lane 'laneno' of a YMM register (Ity_I32 value). */
static void putYMMRegLane32 ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   stmt( IRStmt_Put( ymmGuestRegLane32offset(ymmreg,laneno), e ) );
}
   1541 
/* Build a V128 constant from a 16-bit mask (see IRConst_V128);
   mkV128(0) is used below as the all-zeroes vector. */
static IRExpr* mkV128 ( UShort mask )
{
   return IRExpr_Const(IRConst_V128(mask));
}
   1546 
/* Write the low half of a YMM reg and zero out the upper half. */
static void putYMMRegLoAndZU ( UInt ymmreg, IRExpr* e )
{
   putYMMRegLane128( ymmreg, 0, e );        /* low 128 bits := e */
   putYMMRegLane128( ymmreg, 1, mkV128(0) ); /* high 128 bits := 0 */
}
   1553 
   1554 static IRExpr* mkAnd1 ( IRExpr* x, IRExpr* y )
   1555 {
   1556    vassert(typeOfIRExpr(irsb->tyenv,x) == Ity_I1);
   1557    vassert(typeOfIRExpr(irsb->tyenv,y) == Ity_I1);
   1558    return unop(Iop_64to1,
   1559                binop(Iop_And64,
   1560                      unop(Iop_1Uto64,x),
   1561                      unop(Iop_1Uto64,y)));
   1562 }
   1563 
   1564 /* Generate a compare-and-swap operation, operating on memory at
   1565    'addr'.  The expected value is 'expVal' and the new value is
   1566    'newVal'.  If the operation fails, then transfer control (with a
   1567    no-redir jump (XXX no -- see comment at top of this file)) to
   1568    'restart_point', which is presumably the address of the guest
   1569    instruction again -- retrying, essentially. */
static void casLE ( IRExpr* addr, IRExpr* expVal, IRExpr* newVal,
                    Addr64 restart_point )
{
   IRCAS* cas;
   IRType tyE    = typeOfIRExpr(irsb->tyenv, expVal);
   IRType tyN    = typeOfIRExpr(irsb->tyenv, newVal);
   IRTemp oldTmp = newTemp(tyE);
   IRTemp expTmp = newTemp(tyE);
   /* Expected and new values must have the same (integer) type. */
   vassert(tyE == tyN);
   vassert(tyE == Ity_I64 || tyE == Ity_I32
           || tyE == Ity_I16 || tyE == Ity_I8);
   assign(expTmp, expVal);
   /* Single (non-double) little-endian CAS; oldTmp receives the
      value that was at 'addr' before the operation. */
   cas = mkIRCAS( IRTemp_INVALID, oldTmp, Iend_LE, addr,
                  NULL, mkexpr(expTmp), NULL, newVal );
   stmt( IRStmt_CAS(cas) );
   /* If the old value differs from the expected one, the store did
      not happen; jump back to 'restart_point' so the guest
      instruction is retried. */
   stmt( IRStmt_Exit(
            binop( mkSizedOp(tyE,Iop_CasCmpNE8),
                   mkexpr(oldTmp), mkexpr(expTmp) ),
            Ijk_Boring, /*Ijk_NoRedir*/
            IRConst_U64( restart_point ),
            OFFB_RIP
         ));
}
   1593 
   1594 
   1595 /*------------------------------------------------------------*/
   1596 /*--- Helpers for %rflags.                                 ---*/
   1597 /*------------------------------------------------------------*/
   1598 
   1599 /* -------------- Evaluating the flags-thunk. -------------- */
   1600 
   1601 /* Build IR to calculate all the eflags from stored
   1602    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
   1603    Ity_I64. */
static IRExpr* mk_amd64g_calculate_rflags_all ( void )
{
   /* Args, in order: CC_OP (index 0), CC_DEP1, CC_DEP2,
      CC_NDEP (index 3). */
   IRExpr** args
      = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
                       IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   IRExpr* call
      = mkIRExprCCall(
           Ity_I64,
           0/*regparm*/,
           "amd64g_calculate_rflags_all", &amd64g_calculate_rflags_all,
           args
        );
   /* Exclude OP and NDEP from definedness checking.  We're only
      interested in DEP1 and DEP2.  The mask bits correspond to the
      argument indices above. */
   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   return call;
}
   1623 
/* Build IR to calculate some particular condition from stored
   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
   Ity_I1. */
   1627 static IRExpr* mk_amd64g_calculate_condition ( AMD64Condcode cond )
   1628 {
   1629    IRExpr** args
   1630       = mkIRExprVec_5( mkU64(cond),
   1631                        IRExpr_Get(OFFB_CC_OP,   Ity_I64),
   1632                        IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
   1633                        IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
   1634                        IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   1635    IRExpr* call
   1636       = mkIRExprCCall(
   1637            Ity_I64,
   1638            0/*regparm*/,
   1639            "amd64g_calculate_condition", &amd64g_calculate_condition,
   1640            args
   1641         );
   1642    /* Exclude the requested condition, OP and NDEP from definedness
   1643       checking.  We're only interested in DEP1 and DEP2. */
   1644    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<1) | (1<<4);
   1645    return unop(Iop_64to1, call);
   1646 }
   1647 
   1648 /* Build IR to calculate just the carry flag from stored
   1649    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression :: Ity_I64. */
   1650 static IRExpr* mk_amd64g_calculate_rflags_c ( void )
   1651 {
   1652    IRExpr** args
   1653       = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
   1654                        IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
   1655                        IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
   1656                        IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   1657    IRExpr* call
   1658       = mkIRExprCCall(
   1659            Ity_I64,
   1660            0/*regparm*/,
   1661            "amd64g_calculate_rflags_c", &amd64g_calculate_rflags_c,
   1662            args
   1663         );
   1664    /* Exclude OP and NDEP from definedness checking.  We're only
   1665       interested in DEP1 and DEP2. */
   1666    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   1667    return call;
   1668 }
   1669 
   1670 
   1671 /* -------------- Building the flags-thunk. -------------- */
   1672 
   1673 /* The machinery in this section builds the flag-thunk following a
   1674    flag-setting operation.  Hence the various setFlags_* functions.
   1675 */
   1676 
   1677 static Bool isAddSub ( IROp op8 )
   1678 {
   1679    return toBool(op8 == Iop_Add8 || op8 == Iop_Sub8);
   1680 }
   1681 
   1682 static Bool isLogic ( IROp op8 )
   1683 {
   1684    return toBool(op8 == Iop_And8 || op8 == Iop_Or8 || op8 == Iop_Xor8);
   1685 }
   1686 
   1687 /* U-widen 8/16/32/64 bit int expr to 64. */
   1688 static IRExpr* widenUto64 ( IRExpr* e )
   1689 {
   1690    switch (typeOfIRExpr(irsb->tyenv,e)) {
   1691       case Ity_I64: return e;
   1692       case Ity_I32: return unop(Iop_32Uto64, e);
   1693       case Ity_I16: return unop(Iop_16Uto64, e);
   1694       case Ity_I8:  return unop(Iop_8Uto64, e);
   1695       default: vpanic("widenUto64");
   1696    }
   1697 }
   1698 
/* S-widen 8/16/32/64 bit int expr to 64. */
   1700 static IRExpr* widenSto64 ( IRExpr* e )
   1701 {
   1702    switch (typeOfIRExpr(irsb->tyenv,e)) {
   1703       case Ity_I64: return e;
   1704       case Ity_I32: return unop(Iop_32Sto64, e);
   1705       case Ity_I16: return unop(Iop_16Sto64, e);
   1706       case Ity_I8:  return unop(Iop_8Sto64, e);
   1707       default: vpanic("widenSto64");
   1708    }
   1709 }
   1710 
   1711 /* Narrow 8/16/32/64 bit int expr to 8/16/32/64.  Clearly only some
   1712    of these combinations make sense. */
   1713 static IRExpr* narrowTo ( IRType dst_ty, IRExpr* e )
   1714 {
   1715    IRType src_ty = typeOfIRExpr(irsb->tyenv,e);
   1716    if (src_ty == dst_ty)
   1717       return e;
   1718    if (src_ty == Ity_I32 && dst_ty == Ity_I16)
   1719       return unop(Iop_32to16, e);
   1720    if (src_ty == Ity_I32 && dst_ty == Ity_I8)
   1721       return unop(Iop_32to8, e);
   1722    if (src_ty == Ity_I64 && dst_ty == Ity_I32)
   1723       return unop(Iop_64to32, e);
   1724    if (src_ty == Ity_I64 && dst_ty == Ity_I16)
   1725       return unop(Iop_64to16, e);
   1726    if (src_ty == Ity_I64 && dst_ty == Ity_I8)
   1727       return unop(Iop_64to8, e);
   1728 
   1729    vex_printf("\nsrc, dst tys are: ");
   1730    ppIRType(src_ty);
   1731    vex_printf(", ");
   1732    ppIRType(dst_ty);
   1733    vex_printf("\n");
   1734    vpanic("narrowTo(amd64)");
   1735 }
   1736 
   1737 
   1738 /* Set the flags thunk OP, DEP1 and DEP2 fields.  The supplied op is
   1739    auto-sized up to the real op. */
   1740 
   1741 static
   1742 void setFlags_DEP1_DEP2 ( IROp op8, IRTemp dep1, IRTemp dep2, IRType ty )
   1743 {
   1744    Int ccOp = 0;
   1745    switch (ty) {
   1746       case Ity_I8:  ccOp = 0; break;
   1747       case Ity_I16: ccOp = 1; break;
   1748       case Ity_I32: ccOp = 2; break;
   1749       case Ity_I64: ccOp = 3; break;
   1750       default: vassert(0);
   1751    }
   1752    switch (op8) {
   1753       case Iop_Add8: ccOp += AMD64G_CC_OP_ADDB;   break;
   1754       case Iop_Sub8: ccOp += AMD64G_CC_OP_SUBB;   break;
   1755       default:       ppIROp(op8);
   1756                      vpanic("setFlags_DEP1_DEP2(amd64)");
   1757    }
   1758    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   1759    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dep1))) );
   1760    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(dep2))) );
   1761 }
   1762 
   1763 
   1764 /* Set the OP and DEP1 fields only, and write zero to DEP2. */
   1765 
   1766 static
   1767 void setFlags_DEP1 ( IROp op8, IRTemp dep1, IRType ty )
   1768 {
   1769    Int ccOp = 0;
   1770    switch (ty) {
   1771       case Ity_I8:  ccOp = 0; break;
   1772       case Ity_I16: ccOp = 1; break;
   1773       case Ity_I32: ccOp = 2; break;
   1774       case Ity_I64: ccOp = 3; break;
   1775       default: vassert(0);
   1776    }
   1777    switch (op8) {
   1778       case Iop_Or8:
   1779       case Iop_And8:
   1780       case Iop_Xor8: ccOp += AMD64G_CC_OP_LOGICB; break;
   1781       default:       ppIROp(op8);
   1782                      vpanic("setFlags_DEP1(amd64)");
   1783    }
   1784    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   1785    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dep1))) );
   1786    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
   1787 }
   1788 
   1789 
   1790 /* For shift operations, we put in the result and the undershifted
   1791    result.  Except if the shift amount is zero, the thunk is left
   1792    unchanged. */
   1793 
static void setFlags_DEP1_DEP2_shift ( IROp    op64,
                                       IRTemp  res,
                                       IRTemp  resUS,
                                       IRType  ty,
                                       IRTemp  guard )
{
   /* Build a shift-op flags thunk, but only conditionally: each field
      is updated only when 'guard' is nonzero (guard is the
      shift-amount-is-nonzero condition; a zero shift leaves the
      flags, and hence the thunk, unchanged). */
   Int ccOp = 0;
   /* Size adjustment added to the B-sized base op. */
   switch (ty) {
      case Ity_I8:  ccOp = 0; break;
      case Ity_I16: ccOp = 1; break;
      case Ity_I32: ccOp = 2; break;
      case Ity_I64: ccOp = 3; break;
      default: vassert(0);
   }

   vassert(guard);

   /* Both kinds of right shifts are handled by the same thunk
      operation. */
   switch (op64) {
      case Iop_Shr64:
      case Iop_Sar64: ccOp += AMD64G_CC_OP_SHRB; break;
      case Iop_Shl64: ccOp += AMD64G_CC_OP_SHLB; break;
      default:        ppIROp(op64);
                      vpanic("setFlags_DEP1_DEP2_shift(amd64)");
   }

   /* DEP1 contains the result, DEP2 contains the undershifted value.
      Each Mux0X keeps the old thunk field when guard == 0 and
      installs the new value otherwise. */
   stmt( IRStmt_Put( OFFB_CC_OP,
                     IRExpr_Mux0X( mkexpr(guard),
                                   IRExpr_Get(OFFB_CC_OP,Ity_I64),
                                   mkU64(ccOp))) );
   stmt( IRStmt_Put( OFFB_CC_DEP1,
                     IRExpr_Mux0X( mkexpr(guard),
                                   IRExpr_Get(OFFB_CC_DEP1,Ity_I64),
                                   widenUto64(mkexpr(res)))) );
   stmt( IRStmt_Put( OFFB_CC_DEP2,
                     IRExpr_Mux0X( mkexpr(guard),
                                   IRExpr_Get(OFFB_CC_DEP2,Ity_I64),
                                   widenUto64(mkexpr(resUS)))) );
}
   1835 
   1836 
   1837 /* For the inc/dec case, we store in DEP1 the result value and in NDEP
   1838    the former value of the carry flag, which unfortunately we have to
   1839    compute. */
   1840 
   1841 static void setFlags_INC_DEC ( Bool inc, IRTemp res, IRType ty )
   1842 {
   1843    Int ccOp = inc ? AMD64G_CC_OP_INCB : AMD64G_CC_OP_DECB;
   1844 
   1845    switch (ty) {
   1846       case Ity_I8:  ccOp += 0; break;
   1847       case Ity_I16: ccOp += 1; break;
   1848       case Ity_I32: ccOp += 2; break;
   1849       case Ity_I64: ccOp += 3; break;
   1850       default: vassert(0);
   1851    }
   1852 
   1853    /* This has to come first, because calculating the C flag
   1854       may require reading all four thunk fields. */
   1855    stmt( IRStmt_Put( OFFB_CC_NDEP, mk_amd64g_calculate_rflags_c()) );
   1856    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   1857    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(res))) );
   1858    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
   1859 }
   1860 
   1861 
   1862 /* Multiplies are pretty much like add and sub: DEP1 and DEP2 hold the
   1863    two arguments. */
   1864 
   1865 static
   1866 void setFlags_MUL ( IRType ty, IRTemp arg1, IRTemp arg2, ULong base_op )
   1867 {
   1868    switch (ty) {
   1869       case Ity_I8:
   1870          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+0) ) );
   1871          break;
   1872       case Ity_I16:
   1873          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+1) ) );
   1874          break;
   1875       case Ity_I32:
   1876          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+2) ) );
   1877          break;
   1878       case Ity_I64:
   1879          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+3) ) );
   1880          break;
   1881       default:
   1882          vpanic("setFlags_MUL(amd64)");
   1883    }
   1884    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(arg1)) ));
   1885    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(arg2)) ));
   1886 }
   1887 
   1888 
   1889 /* -------------- Condition codes. -------------- */
   1890 
   1891 /* Condition codes, using the AMD encoding.  */
   1892 
static HChar* name_AMD64Condcode ( AMD64Condcode cond )
{
   /* Printable mnemonic for a condition code.  Where a code has two
      conventional spellings, the more common synonym is used and the
      literal one is kept in a comment (e.g. "e" rather than "z"). */
   switch (cond) {
      case AMD64CondO:      return "o";
      case AMD64CondNO:     return "no";
      case AMD64CondB:      return "b";
      case AMD64CondNB:     return "ae"; /*"nb";*/
      case AMD64CondZ:      return "e"; /*"z";*/
      case AMD64CondNZ:     return "ne"; /*"nz";*/
      case AMD64CondBE:     return "be";
      case AMD64CondNBE:    return "a"; /*"nbe";*/
      case AMD64CondS:      return "s";
      case AMD64CondNS:     return "ns";
      case AMD64CondP:      return "p";
      case AMD64CondNP:     return "np";
      case AMD64CondL:      return "l";
      case AMD64CondNL:     return "ge"; /*"nl";*/
      case AMD64CondLE:     return "le";
      case AMD64CondNLE:    return "g"; /*"nle";*/
      case AMD64CondAlways: return "ALWAYS";
      default: vpanic("name_AMD64Condcode");
   }
}
   1916 
   1917 static
   1918 AMD64Condcode positiveIse_AMD64Condcode ( AMD64Condcode  cond,
   1919                                           /*OUT*/Bool*   needInvert )
   1920 {
   1921    vassert(cond >= AMD64CondO && cond <= AMD64CondNLE);
   1922    if (cond & 1) {
   1923       *needInvert = True;
   1924       return cond-1;
   1925    } else {
   1926       *needInvert = False;
   1927       return cond;
   1928    }
   1929 }
   1930 
   1931 
   1932 /* -------------- Helpers for ADD/SUB with carry. -------------- */
   1933 
   1934 /* Given ta1, ta2 and tres, compute tres = ADC(ta1,ta2) and set flags
   1935    appropriately.
   1936 
   1937    Optionally, generate a store for the 'tres' value.  This can either
   1938    be a normal store, or it can be a cas-with-possible-failure style
   1939    store:
   1940 
   1941    if taddr is IRTemp_INVALID, then no store is generated.
   1942 
   1943    if taddr is not IRTemp_INVALID, then a store (using taddr as
   1944    the address) is generated:
   1945 
   1946      if texpVal is IRTemp_INVALID then a normal store is
   1947      generated, and restart_point must be zero (it is irrelevant).
   1948 
   1949      if texpVal is not IRTemp_INVALID then a cas-style store is
   1950      generated.  texpVal is the expected value, restart_point
   1951      is the restart point if the store fails, and texpVal must
   1952      have the same type as tres.
   1953 
   1954 */
static void helper_ADC ( Int sz,
                         IRTemp tres, IRTemp ta1, IRTemp ta2,
                         /* info about optional store: */
                         IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
{
   /* tres := ta1 + ta2 + old_carry, with the ADC flags thunk set up
      and an optional (plain or cas-style) store of the result.  See
      the block comment above for the taddr/texpVal/restart_point
      protocol. */
   UInt    thunkOp;
   IRType  ty    = szToITy(sz);
   IRTemp  oldc  = newTemp(Ity_I64);   /* old carry, 64-bit, 0 or 1 */
   IRTemp  oldcn = newTemp(ty);        /* old carry, narrowed to ty */
   IROp    plus  = mkSizedOp(ty, Iop_Add8);
   IROp    xor   = mkSizedOp(ty, Iop_Xor8);

   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);

   /* Size-specific ADC thunk opcode. */
   switch (sz) {
      case 8:  thunkOp = AMD64G_CC_OP_ADCQ; break;
      case 4:  thunkOp = AMD64G_CC_OP_ADCL; break;
      case 2:  thunkOp = AMD64G_CC_OP_ADCW; break;
      case 1:  thunkOp = AMD64G_CC_OP_ADCB; break;
      default: vassert(0);
   }

   /* oldc = old carry flag, 0 or 1.  Must be read from the current
      thunk before it is overwritten below. */
   assign( oldc,  binop(Iop_And64,
                        mk_amd64g_calculate_rflags_c(),
                        mkU64(1)) );

   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );

   assign( tres, binop(plus,
                       binop(plus,mkexpr(ta1),mkexpr(ta2)),
                       mkexpr(oldcn)) );

   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
      start of this function. */
   if (taddr != IRTemp_INVALID) {
      if (texpVal == IRTemp_INVALID) {
         vassert(restart_point == 0);
         storeLE( mkexpr(taddr), mkexpr(tres) );
      } else {
         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
         /* .. and hence 'texpVal' has the same type as 'tres'. */
         casLE( mkexpr(taddr),
                mkexpr(texpVal), mkexpr(tres), restart_point );
      }
   }

   /* Thunk layout for ADC: DEP1 = arg1, DEP2 = arg2 ^ old_carry,
      NDEP = old_carry. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(thunkOp) ) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1))  ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
                                                         mkexpr(oldcn)) )) );
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
}
   2008 
   2009 
   2010 /* Given ta1, ta2 and tres, compute tres = SBB(ta1,ta2) and set flags
   2011    appropriately.  As with helper_ADC, possibly generate a store of
   2012    the result -- see comments on helper_ADC for details.
   2013 */
static void helper_SBB ( Int sz,
                         IRTemp tres, IRTemp ta1, IRTemp ta2,
                         /* info about optional store: */
                         IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
{
   /* tres := ta1 - ta2 - old_carry, with the SBB flags thunk set up
      and an optional (plain or cas-style) store of the result.
      Mirror image of helper_ADC; see its comments for the store
      protocol. */
   UInt    thunkOp;
   IRType  ty    = szToITy(sz);
   IRTemp  oldc  = newTemp(Ity_I64);   /* old carry, 64-bit, 0 or 1 */
   IRTemp  oldcn = newTemp(ty);        /* old carry, narrowed to ty */
   IROp    minus = mkSizedOp(ty, Iop_Sub8);
   IROp    xor   = mkSizedOp(ty, Iop_Xor8);

   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);

   /* Size-specific SBB thunk opcode. */
   switch (sz) {
      case 8:  thunkOp = AMD64G_CC_OP_SBBQ; break;
      case 4:  thunkOp = AMD64G_CC_OP_SBBL; break;
      case 2:  thunkOp = AMD64G_CC_OP_SBBW; break;
      case 1:  thunkOp = AMD64G_CC_OP_SBBB; break;
      default: vassert(0);
   }

   /* oldc = old carry flag, 0 or 1.  Must be read from the current
      thunk before it is overwritten below. */
   assign( oldc, binop(Iop_And64,
                       mk_amd64g_calculate_rflags_c(),
                       mkU64(1)) );

   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );

   assign( tres, binop(minus,
                       binop(minus,mkexpr(ta1),mkexpr(ta2)),
                       mkexpr(oldcn)) );

   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
      start of this function. */
   if (taddr != IRTemp_INVALID) {
      if (texpVal == IRTemp_INVALID) {
         vassert(restart_point == 0);
         storeLE( mkexpr(taddr), mkexpr(tres) );
      } else {
         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
         /* .. and hence 'texpVal' has the same type as 'tres'. */
         casLE( mkexpr(taddr),
                mkexpr(texpVal), mkexpr(tres), restart_point );
      }
   }

   /* Thunk layout for SBB: DEP1 = arg1, DEP2 = arg2 ^ old_carry,
      NDEP = old_carry. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(thunkOp) ) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1) )) );
   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
                                                         mkexpr(oldcn)) )) );
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
}
   2067 
   2068 
   2069 /* -------------- Helpers for disassembly printing. -------------- */
   2070 
   2071 static HChar* nameGrp1 ( Int opc_aux )
   2072 {
   2073    static HChar* grp1_names[8]
   2074      = { "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" };
   2075    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp1(amd64)");
   2076    return grp1_names[opc_aux];
   2077 }
   2078 
   2079 static HChar* nameGrp2 ( Int opc_aux )
   2080 {
   2081    static HChar* grp2_names[8]
   2082      = { "rol", "ror", "rcl", "rcr", "shl", "shr", "shl", "sar" };
   2083    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp2(amd64)");
   2084    return grp2_names[opc_aux];
   2085 }
   2086 
   2087 static HChar* nameGrp4 ( Int opc_aux )
   2088 {
   2089    static HChar* grp4_names[8]
   2090      = { "inc", "dec", "???", "???", "???", "???", "???", "???" };
   2091    if (opc_aux < 0 || opc_aux > 1) vpanic("nameGrp4(amd64)");
   2092    return grp4_names[opc_aux];
   2093 }
   2094 
   2095 static HChar* nameGrp5 ( Int opc_aux )
   2096 {
   2097    static HChar* grp5_names[8]
   2098      = { "inc", "dec", "call*", "call*", "jmp*", "jmp*", "push", "???" };
   2099    if (opc_aux < 0 || opc_aux > 6) vpanic("nameGrp5(amd64)");
   2100    return grp5_names[opc_aux];
   2101 }
   2102 
   2103 static HChar* nameGrp8 ( Int opc_aux )
   2104 {
   2105    static HChar* grp8_names[8]
   2106       = { "???", "???", "???", "???", "bt", "bts", "btr", "btc" };
   2107    if (opc_aux < 4 || opc_aux > 7) vpanic("nameGrp8(amd64)");
   2108    return grp8_names[opc_aux];
   2109 }
   2110 
   2111 //.. static HChar* nameSReg ( UInt sreg )
   2112 //.. {
   2113 //..    switch (sreg) {
   2114 //..       case R_ES: return "%es";
   2115 //..       case R_CS: return "%cs";
   2116 //..       case R_SS: return "%ss";
   2117 //..       case R_DS: return "%ds";
   2118 //..       case R_FS: return "%fs";
   2119 //..       case R_GS: return "%gs";
   2120 //..       default: vpanic("nameSReg(x86)");
   2121 //..    }
   2122 //.. }
   2123 
   2124 static HChar* nameMMXReg ( Int mmxreg )
   2125 {
   2126    static HChar* mmx_names[8]
   2127      = { "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" };
   2128    if (mmxreg < 0 || mmxreg > 7) vpanic("nameMMXReg(amd64,guest)");
   2129    return mmx_names[mmxreg];
   2130 }
   2131 
   2132 static HChar* nameXMMReg ( Int xmmreg )
   2133 {
   2134    static HChar* xmm_names[16]
   2135      = { "%xmm0",  "%xmm1",  "%xmm2",  "%xmm3",
   2136          "%xmm4",  "%xmm5",  "%xmm6",  "%xmm7",
   2137          "%xmm8",  "%xmm9",  "%xmm10", "%xmm11",
   2138          "%xmm12", "%xmm13", "%xmm14", "%xmm15" };
   2139    if (xmmreg < 0 || xmmreg > 15) vpanic("nameXMMReg(amd64)");
   2140    return xmm_names[xmmreg];
   2141 }
   2142 
   2143 static HChar* nameMMXGran ( Int gran )
   2144 {
   2145    switch (gran) {
   2146       case 0: return "b";
   2147       case 1: return "w";
   2148       case 2: return "d";
   2149       case 3: return "q";
   2150       default: vpanic("nameMMXGran(amd64,guest)");
   2151    }
   2152 }
   2153 
   2154 static HChar nameISize ( Int size )
   2155 {
   2156    switch (size) {
   2157       case 8: return 'q';
   2158       case 4: return 'l';
   2159       case 2: return 'w';
   2160       case 1: return 'b';
   2161       default: vpanic("nameISize(amd64)");
   2162    }
   2163 }
   2164 
   2165 static HChar* nameYMMReg ( Int ymmreg )
   2166 {
   2167    static HChar* ymm_names[16]
   2168      = { "%ymm0",  "%ymm1",  "%ymm2",  "%ymm3",
   2169          "%ymm4",  "%ymm5",  "%ymm6",  "%ymm7",
   2170          "%ymm8",  "%ymm9",  "%ymm10", "%ymm11",
   2171          "%ymm12", "%ymm13", "%ymm14", "%ymm15" };
   2172    if (ymmreg < 0 || ymmreg > 15) vpanic("nameYMMReg(amd64)");
   2173    return ymm_names[ymmreg];
   2174 }
   2175 
   2176 
   2177 /*------------------------------------------------------------*/
   2178 /*--- JMP helpers                                          ---*/
   2179 /*------------------------------------------------------------*/
   2180 
   2181 static void jmp_lit( /*MOD*/DisResult* dres,
   2182                      IRJumpKind kind, Addr64 d64 )
   2183 {
   2184    vassert(dres->whatNext    == Dis_Continue);
   2185    vassert(dres->len         == 0);
   2186    vassert(dres->continueAt  == 0);
   2187    vassert(dres->jk_StopHere == Ijk_INVALID);
   2188    dres->whatNext    = Dis_StopHere;
   2189    dres->jk_StopHere = kind;
   2190    stmt( IRStmt_Put( OFFB_RIP, mkU64(d64) ) );
   2191 }
   2192 
   2193 static void jmp_treg( /*MOD*/DisResult* dres,
   2194                       IRJumpKind kind, IRTemp t )
   2195 {
   2196    vassert(dres->whatNext    == Dis_Continue);
   2197    vassert(dres->len         == 0);
   2198    vassert(dres->continueAt  == 0);
   2199    vassert(dres->jk_StopHere == Ijk_INVALID);
   2200    dres->whatNext    = Dis_StopHere;
   2201    dres->jk_StopHere = kind;
   2202    stmt( IRStmt_Put( OFFB_RIP, mkexpr(t) ) );
   2203 }
   2204 
static
void jcc_01 ( /*MOD*/DisResult* dres,
              AMD64Condcode cond, Addr64 d64_false, Addr64 d64_true )
{
   /* End the block with a conditional branch: go to d64_true if cond
      holds, else to d64_false.  The condition is first canonicalised
      to its positive form, so the generated side exit always tests a
      positive condition; when the original condition was the negated
      form, the two targets are swapped instead. */
   Bool          invert;
   AMD64Condcode condPos;
   vassert(dres->whatNext    == Dis_Continue);
   vassert(dres->len         == 0);
   vassert(dres->continueAt  == 0);
   vassert(dres->jk_StopHere == Ijk_INVALID);
   dres->whatNext    = Dis_StopHere;
   dres->jk_StopHere = Ijk_Boring;
   condPos = positiveIse_AMD64Condcode ( cond, &invert );
   if (invert) {
      /* Side exit taken when the positive condition holds -> the
         original (negated) condition fails -> branch to d64_false. */
      stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
                         Ijk_Boring,
                         IRConst_U64(d64_false),
                         OFFB_RIP ) );
      stmt( IRStmt_Put( OFFB_RIP, mkU64(d64_true) ) );
   } else {
      /* Side exit taken when the condition holds -> branch to
         d64_true; otherwise fall through to d64_false. */
      stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
                         Ijk_Boring,
                         IRConst_U64(d64_true),
                         OFFB_RIP ) );
      stmt( IRStmt_Put( OFFB_RIP, mkU64(d64_false) ) );
   }
}
   2232 
   2233 /* Let new_rsp be the %rsp value after a call/return.  Let nia be the
   2234    guest address of the next instruction to be executed.
   2235 
   2236    This function generates an AbiHint to say that -128(%rsp)
   2237    .. -1(%rsp) should now be regarded as uninitialised.
   2238 */
   2239 static
   2240 void make_redzone_AbiHint ( VexAbiInfo* vbi,
   2241                             IRTemp new_rsp, IRTemp nia, HChar* who )
   2242 {
   2243    Int szB = vbi->guest_stack_redzone_size;
   2244    vassert(szB >= 0);
   2245 
   2246    /* A bit of a kludge.  Currently the only AbI we've guested AMD64
   2247       for is ELF.  So just check it's the expected 128 value
   2248       (paranoia). */
   2249    vassert(szB == 128);
   2250 
   2251    if (0) vex_printf("AbiHint: %s\n", who);
   2252    vassert(typeOfIRTemp(irsb->tyenv, new_rsp) == Ity_I64);
   2253    vassert(typeOfIRTemp(irsb->tyenv, nia) == Ity_I64);
   2254    if (szB > 0)
   2255       stmt( IRStmt_AbiHint(
   2256                binop(Iop_Sub64, mkexpr(new_rsp), mkU64(szB)),
   2257                szB,
   2258                mkexpr(nia)
   2259             ));
   2260 }
   2261 
   2262 
   2263 /*------------------------------------------------------------*/
   2264 /*--- Disassembling addressing modes                       ---*/
   2265 /*------------------------------------------------------------*/
   2266 
static
HChar* segRegTxt ( Prefix pfx )
{
   /* Printable text for a segment-override prefix, or "" if none.
      Bits are tested in this fixed order, so if more than one
      override bit is somehow set, the first match below wins. */
   if (pfx & PFX_CS) return "%cs:";
   if (pfx & PFX_DS) return "%ds:";
   if (pfx & PFX_ES) return "%es:";
   if (pfx & PFX_FS) return "%fs:";
   if (pfx & PFX_GS) return "%gs:";
   return ""; /* no override */
}
   2278 
   2279 
   2280 /* 'virtual' is an IRExpr* holding a virtual address.  Convert it to a
   2281    linear address by adding any required segment override as indicated
   2282    by sorb, and also dealing with any address size override
   2283    present. */
static
IRExpr* handleAddrOverrides ( VexAbiInfo* vbi,
                              Prefix pfx, IRExpr* virtual )
{
   /* --- segment overrides --- */
   if (pfx & PFX_FS) {
      if (vbi->guest_amd64_assume_fs_is_zero) {
         /* Note that this is a linux-kernel specific hack that relies
            on the assumption that %fs is always zero. */
         /* return virtual + guest_FS_ZERO. */
         virtual = binop(Iop_Add64, virtual,
                                    IRExpr_Get(OFFB_FS_ZERO, Ity_I64));
      } else {
         unimplemented("amd64 %fs segment override");
      }
   }

   if (pfx & PFX_GS) {
      if (vbi->guest_amd64_assume_gs_is_0x60) {
         /* Note that this is a darwin-kernel specific hack that relies
            on the assumption that %gs is always 0x60. */
         /* return virtual + guest_GS_0x60. */
         virtual = binop(Iop_Add64, virtual,
                                    IRExpr_Get(OFFB_GS_0x60, Ity_I64));
      } else {
         unimplemented("amd64 %gs segment override");
      }
   }

   /* cs, ds, es and ss are simply ignored in 64-bit mode. */

   /* --- address size override --- */
   /* An address-size override truncates the address to its low 32
      bits, zero-extended back to 64. */
   if (haveASO(pfx))
      virtual = unop(Iop_32Uto64, unop(Iop_64to32, virtual));

   return virtual;
}
   2321 
   2322 //.. {
   2323 //..    Int    sreg;
   2324 //..    IRType hWordTy;
   2325 //..    IRTemp ldt_ptr, gdt_ptr, seg_selector, r64;
   2326 //..
   2327 //..    if (sorb == 0)
   2328 //..       /* the common case - no override */
   2329 //..       return virtual;
   2330 //..
   2331 //..    switch (sorb) {
   2332 //..       case 0x3E: sreg = R_DS; break;
   2333 //..       case 0x26: sreg = R_ES; break;
   2334 //..       case 0x64: sreg = R_FS; break;
   2335 //..       case 0x65: sreg = R_GS; break;
   2336 //..       default: vpanic("handleAddrOverrides(x86,guest)");
   2337 //..    }
   2338 //..
   2339 //..    hWordTy = sizeof(HWord)==4 ? Ity_I32 : Ity_I64;
   2340 //..
   2341 //..    seg_selector = newTemp(Ity_I32);
   2342 //..    ldt_ptr      = newTemp(hWordTy);
   2343 //..    gdt_ptr      = newTemp(hWordTy);
   2344 //..    r64          = newTemp(Ity_I64);
   2345 //..
   2346 //..    assign( seg_selector, unop(Iop_16Uto32, getSReg(sreg)) );
   2347 //..    assign( ldt_ptr, IRExpr_Get( OFFB_LDT, hWordTy ));
   2348 //..    assign( gdt_ptr, IRExpr_Get( OFFB_GDT, hWordTy ));
   2349 //..
   2350 //..    /*
   2351 //..    Call this to do the translation and limit checks:
   2352 //..    ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
   2353 //..                                  UInt seg_selector, UInt virtual_addr )
   2354 //..    */
   2355 //..    assign(
   2356 //..       r64,
   2357 //..       mkIRExprCCall(
   2358 //..          Ity_I64,
   2359 //..          0/*regparms*/,
   2360 //..          "x86g_use_seg_selector",
   2361 //..          &x86g_use_seg_selector,
   2362 //..          mkIRExprVec_4( mkexpr(ldt_ptr), mkexpr(gdt_ptr),
   2363 //..                         mkexpr(seg_selector), virtual)
   2364 //..       )
   2365 //..    );
   2366 //..
   2367 //..    /* If the high 32 of the result are non-zero, there was a
   2368 //..       failure in address translation.  In which case, make a
   2369 //..       quick exit.
   2370 //..    */
   2371 //..    stmt(
   2372 //..       IRStmt_Exit(
   2373 //..          binop(Iop_CmpNE32, unop(Iop_64HIto32, mkexpr(r64)), mkU32(0)),
   2374 //..          Ijk_MapFail,
   2375 //..          IRConst_U32( guest_eip_curr_instr )
   2376 //..       )
   2377 //..    );
   2378 //..
   2379 //..    /* otherwise, here's the translated result. */
   2380 //..    return unop(Iop_64to32, mkexpr(r64));
   2381 //.. }
   2382 
   2383 
   2384 /* Generate IR to calculate an address indicated by a ModRM and
   2385    following SIB bytes.  The expression, and the number of bytes in
   2386    the address mode, are returned (the latter in *len).  Note that
   2387    this fn should not be called if the R/M part of the address denotes
   2388    a register instead of memory.  If print_codegen is true, text of
   2389    the addressing mode is placed in buf.
   2390 
   2391    The computed address is stored in a new tempreg, and the
   2392    identity of the tempreg is returned.
   2393 
   2394    extra_bytes holds the number of bytes after the amode, as supplied
   2395    by the caller.  This is needed to make sense of %rip-relative
   2396    addresses.  Note that the value that *len is set to is only the
   2397    length of the amode itself and does not include the value supplied
   2398    in extra_bytes.
   2399  */
   2400 
   2401 static IRTemp disAMode_copy2tmp ( IRExpr* addr64 )
   2402 {
   2403    IRTemp tmp = newTemp(Ity_I64);
   2404    assign( tmp, addr64 );
   2405    return tmp;
   2406 }
   2407 
static
IRTemp disAMode ( /*OUT*/Int* len,
                  VexAbiInfo* vbi, Prefix pfx, Long delta,
                  /*OUT*/HChar* buf, Int extra_bytes )
{
   UChar mod_reg_rm = getUChar(delta);
   delta++;

   /* Start with an empty amode text; DIS fills it in below. */
   buf[0] = (UChar)0;
   /* extra_bytes only matters for the %rip-relative case below; a
      bound of 10 comfortably covers any trailing immediate. */
   vassert(extra_bytes >= 0 && extra_bytes < 10);

   /* squeeze out the reg field from mod_reg_rm, since a 256-entry
      jump table seems a bit excessive.
   */
   mod_reg_rm &= 0xC7;                         /* is now XX000YYY */
   mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
                                               /* is now XX0XXYYY */
   mod_reg_rm &= 0x1F;                         /* is now 000XXYYY */
   switch (mod_reg_rm) {

      /* REX.B==0: (%rax) .. (%rdi), not including (%rsp) or (%rbp).
         REX.B==1: (%r8)  .. (%r15), not including (%r12) or (%r13).
      */
      case 0x00: case 0x01: case 0x02: case 0x03:
      /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
         { UChar rm = toUChar(mod_reg_rm & 7);
           DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,rm));
           *len = 1;
           return disAMode_copy2tmp(
                  handleAddrOverrides(vbi, pfx, getIRegRexB(8,pfx,rm)));
         }

      /* REX.B==0: d8(%rax) ... d8(%rdi), not including d8(%rsp)
         REX.B==1: d8(%r8)  ... d8(%r15), not including d8(%r12)
      */
      case 0x08: case 0x09: case 0x0A: case 0x0B:
      /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
         { UChar rm = toUChar(mod_reg_rm & 7);
           Long d   = getSDisp8(delta);
           if (d == 0) {
              DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,rm));
           } else {
              DIS(buf, "%s%lld(%s)", segRegTxt(pfx), d, nameIRegRexB(8,pfx,rm));
           }
           *len = 2;
           return disAMode_copy2tmp(
                  handleAddrOverrides(vbi, pfx,
                     binop(Iop_Add64,getIRegRexB(8,pfx,rm),mkU64(d))));
         }

      /* REX.B==0: d32(%rax) ... d32(%rdi), not including d32(%rsp)
         REX.B==1: d32(%r8)  ... d32(%r15), not including d32(%r12)
      */
      case 0x10: case 0x11: case 0x12: case 0x13:
      /* ! 14 */ case 0x15: case 0x16: case 0x17:
         { UChar rm = toUChar(mod_reg_rm & 7);
           Long  d  = getSDisp32(delta);
           DIS(buf, "%s%lld(%s)", segRegTxt(pfx), d, nameIRegRexB(8,pfx,rm));
           *len = 5;
           return disAMode_copy2tmp(
                  handleAddrOverrides(vbi, pfx,
                     binop(Iop_Add64,getIRegRexB(8,pfx,rm),mkU64(d))));
         }

      /* REX.B==0: a register, %rax .. %rdi.  This shouldn't happen. */
      /* REX.B==1: a register, %r8  .. %r15.  This shouldn't happen. */
      case 0x18: case 0x19: case 0x1A: case 0x1B:
      case 0x1C: case 0x1D: case 0x1E: case 0x1F:
         vpanic("disAMode(amd64): not an addr!");

      /* RIP + disp32.  This assumes that guest_RIP_curr_instr is set
         correctly at the start of handling each instruction. */
      case 0x05:
         { Long d = getSDisp32(delta);
           *len = 5;
           DIS(buf, "%s%lld(%%rip)", segRegTxt(pfx), d);
           /* We need to know the next instruction's start address.
              Try and figure out what it is, record the guess, and ask
              the top-level driver logic (bbToIR_AMD64) to check we
              guessed right, after the instruction is completely
              decoded. */
           guest_RIP_next_mustcheck = True;
           /* delta currently indexes the d32 itself; +4 steps past
              it, and extra_bytes covers any immediate that follows
              the amode in the instruction. */
           guest_RIP_next_assumed = guest_RIP_bbstart
                                    + delta+4 + extra_bytes;
           return disAMode_copy2tmp(
                     handleAddrOverrides(vbi, pfx,
                        binop(Iop_Add64, mkU64(guest_RIP_next_assumed),
                                         mkU64(d))));
         }

      case 0x04: {
         /* SIB, with no displacement.  Special cases:
            -- %rsp cannot act as an index value.
               If index_r indicates %rsp, zero is used for the index.
            -- when mod is zero and base indicates RBP or R13, base is
               instead a 32-bit sign-extended literal.
            It's all madness, I tell you.  Extract %index, %base and
            scale from the SIB byte.  The value denoted is then:
               | %index == %RSP && (%base == %RBP || %base == %R13)
               = d32 following SIB byte
               | %index == %RSP && !(%base == %RBP || %base == %R13)
               = %base
               | %index != %RSP && (%base == %RBP || %base == %R13)
               = d32 following SIB byte + (%index << scale)
               | %index != %RSP && !(%base == %RBP || %base == %R13)
               = %base + (%index << scale)
         */
         UChar sib     = getUChar(delta);
         UChar scale   = toUChar((sib >> 6) & 3);
         UChar index_r = toUChar((sib >> 3) & 7);
         UChar base_r  = toUChar(sib & 7);
         /* correct since #(R13) == 8 + #(RBP) */
         Bool  base_is_BPor13 = toBool(base_r == R_RBP);
         /* index is "none" only if it is %rsp AND REX.X is clear;
            with REX.X set, index 4 denotes %r12, which is usable. */
         Bool  index_is_SP    = toBool(index_r == R_RSP && 0==getRexX(pfx));
         delta++;

         if ((!index_is_SP) && (!base_is_BPor13)) {
            if (scale == 0) {
               DIS(buf, "%s(%s,%s)", segRegTxt(pfx),
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r));
            } else {
               DIS(buf, "%s(%s,%s,%d)", segRegTxt(pfx),
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r), 1<<scale);
            }
            *len = 2;
            return
               disAMode_copy2tmp(
               handleAddrOverrides(vbi, pfx,
                  binop(Iop_Add64,
                        getIRegRexB(8,pfx,base_r),
                        binop(Iop_Shl64, getIReg64rexX(pfx,index_r),
                              mkU8(scale)))));
         }

         if ((!index_is_SP) && base_is_BPor13) {
            Long d = getSDisp32(delta);
            DIS(buf, "%s%lld(,%s,%d)", segRegTxt(pfx), d,
                      nameIReg64rexX(pfx,index_r), 1<<scale);
            *len = 6;
            return
               disAMode_copy2tmp(
               handleAddrOverrides(vbi, pfx,
                  binop(Iop_Add64,
                        binop(Iop_Shl64, getIReg64rexX(pfx,index_r),
                                         mkU8(scale)),
                        mkU64(d))));
         }

         if (index_is_SP && (!base_is_BPor13)) {
            DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,base_r));
            *len = 2;
            return disAMode_copy2tmp(
                   handleAddrOverrides(vbi, pfx, getIRegRexB(8,pfx,base_r)));
         }

         if (index_is_SP && base_is_BPor13) {
            Long d = getSDisp32(delta);
            DIS(buf, "%s%lld", segRegTxt(pfx), d);
            *len = 6;
            return disAMode_copy2tmp(
                   handleAddrOverrides(vbi, pfx, mkU64(d)));
         }

         /* The four cases above are exhaustive. */
         vassert(0);
      }

      /* SIB, with 8-bit displacement.  Special cases:
         -- %esp cannot act as an index value.
            If index_r indicates %esp, zero is used for the index.
         Denoted value is:
            | %index == %ESP
            = d8 + %base
            | %index != %ESP
            = d8 + %base + (%index << scale)
      */
      case 0x0C: {
         UChar sib     = getUChar(delta);
         UChar scale   = toUChar((sib >> 6) & 3);
         UChar index_r = toUChar((sib >> 3) & 7);
         UChar base_r  = toUChar(sib & 7);
         Long d        = getSDisp8(delta+1);

         if (index_r == R_RSP && 0==getRexX(pfx)) {
            DIS(buf, "%s%lld(%s)", segRegTxt(pfx),
                                   d, nameIRegRexB(8,pfx,base_r));
            *len = 3;
            return disAMode_copy2tmp(
                   handleAddrOverrides(vbi, pfx,
                      binop(Iop_Add64, getIRegRexB(8,pfx,base_r), mkU64(d)) ));
         } else {
            if (scale == 0) {
               DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r));
            } else {
               DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r), 1<<scale);
            }
            *len = 3;
            return
                disAMode_copy2tmp(
                handleAddrOverrides(vbi, pfx,
                  binop(Iop_Add64,
                        binop(Iop_Add64,
                              getIRegRexB(8,pfx,base_r),
                              binop(Iop_Shl64,
                                    getIReg64rexX(pfx,index_r), mkU8(scale))),
                        mkU64(d))));
         }
         vassert(0); /*NOTREACHED*/
      }

      /* SIB, with 32-bit displacement.  Special cases:
         -- %rsp cannot act as an index value.
            If index_r indicates %rsp, zero is used for the index.
         Denoted value is:
            | %index == %RSP
            = d32 + %base
            | %index != %RSP
            = d32 + %base + (%index << scale)
      */
      case 0x14: {
         UChar sib     = getUChar(delta);
         UChar scale   = toUChar((sib >> 6) & 3);
         UChar index_r = toUChar((sib >> 3) & 7);
         UChar base_r  = toUChar(sib & 7);
         Long d        = getSDisp32(delta+1);

         if (index_r == R_RSP && 0==getRexX(pfx)) {
            DIS(buf, "%s%lld(%s)", segRegTxt(pfx),
                                   d, nameIRegRexB(8,pfx,base_r));
            *len = 6;
            return disAMode_copy2tmp(
                   handleAddrOverrides(vbi, pfx,
                      binop(Iop_Add64, getIRegRexB(8,pfx,base_r), mkU64(d)) ));
         } else {
            if (scale == 0) {
               DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r));
            } else {
               DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r), 1<<scale);
            }
            *len = 6;
            return
                disAMode_copy2tmp(
                handleAddrOverrides(vbi, pfx,
                  binop(Iop_Add64,
                        binop(Iop_Add64,
                              getIRegRexB(8,pfx,base_r),
                              binop(Iop_Shl64,
                                    getIReg64rexX(pfx,index_r), mkU8(scale))),
                        mkU64(d))));
         }
         vassert(0); /*NOTREACHED*/
      }

      default:
         vpanic("disAMode(amd64)");
         return 0; /*notreached*/
   }
}
   2675 
   2676 
   2677 /* Figure out the number of (insn-stream) bytes constituting the amode
   2678    beginning at delta.  Is useful for getting hold of literals beyond
   2679    the end of the amode before it has been disassembled.  */
   2680 
   2681 static UInt lengthAMode ( Prefix pfx, Long delta )
   2682 {
   2683    UChar mod_reg_rm = getUChar(delta);
   2684    delta++;
   2685 
   2686    /* squeeze out the reg field from mod_reg_rm, since a 256-entry
   2687       jump table seems a bit excessive.
   2688    */
   2689    mod_reg_rm &= 0xC7;                         /* is now XX000YYY */
   2690    mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
   2691                                                /* is now XX0XXYYY */
   2692    mod_reg_rm &= 0x1F;                         /* is now 000XXYYY */
   2693    switch (mod_reg_rm) {
   2694 
   2695       /* REX.B==0: (%rax) .. (%rdi), not including (%rsp) or (%rbp).
   2696          REX.B==1: (%r8)  .. (%r15), not including (%r12) or (%r13).
   2697       */
   2698       case 0x00: case 0x01: case 0x02: case 0x03:
   2699       /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
   2700          return 1;
   2701 
   2702       /* REX.B==0: d8(%rax) ... d8(%rdi), not including d8(%rsp)
   2703          REX.B==1: d8(%r8)  ... d8(%r15), not including d8(%r12)
   2704       */
   2705       case 0x08: case 0x09: case 0x0A: case 0x0B:
   2706       /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
   2707          return 2;
   2708 
   2709       /* REX.B==0: d32(%rax) ... d32(%rdi), not including d32(%rsp)
   2710          REX.B==1: d32(%r8)  ... d32(%r15), not including d32(%r12)
   2711       */
   2712       case 0x10: case 0x11: case 0x12: case 0x13:
   2713       /* ! 14 */ case 0x15: case 0x16: case 0x17:
   2714          return 5;
   2715 
   2716       /* REX.B==0: a register, %rax .. %rdi.  This shouldn't happen. */
   2717       /* REX.B==1: a register, %r8  .. %r16.  This shouldn't happen. */
   2718       /* Not an address, but still handled. */
   2719       case 0x18: case 0x19: case 0x1A: case 0x1B:
   2720       case 0x1C: case 0x1D: case 0x1E: case 0x1F:
   2721          return 1;
   2722 
   2723       /* RIP + disp32. */
   2724       case 0x05:
   2725          return 5;
   2726 
   2727       case 0x04: {
   2728          /* SIB, with no displacement. */
   2729          UChar sib     = getUChar(delta);
   2730          UChar base_r  = toUChar(sib & 7);
   2731          /* correct since #(R13) == 8 + #(RBP) */
   2732          Bool  base_is_BPor13 = toBool(base_r == R_RBP);
   2733 
   2734          if (base_is_BPor13) {
   2735             return 6;
   2736          } else {
   2737             return 2;
   2738          }
   2739       }
   2740 
   2741       /* SIB, with 8-bit displacement. */
   2742       case 0x0C:
   2743          return 3;
   2744 
   2745       /* SIB, with 32-bit displacement. */
   2746       case 0x14:
   2747          return 6;
   2748 
   2749       default:
   2750          vpanic("lengthAMode(amd64)");
   2751          return 0; /*notreached*/
   2752    }
   2753 }
   2754 
   2755 
   2756 /*------------------------------------------------------------*/
   2757 /*--- Disassembling common idioms                          ---*/
   2758 /*------------------------------------------------------------*/
   2759 
   2760 /* Handle binary integer instructions of the form
   2761       op E, G  meaning
   2762       op reg-or-mem, reg
   2763    Is passed the a ptr to the modRM byte, the actual operation, and the
   2764    data size.  Returns the address advanced completely over this
   2765    instruction.
   2766 
   2767    E(src) is reg-or-mem
   2768    G(dst) is reg.
   2769 
   2770    If E is reg, -->    GET %G,  tmp
   2771                        OP %E,   tmp
   2772                        PUT tmp, %G
   2773 
   2774    If E is mem and OP is not reversible,
   2775                 -->    (getAddr E) -> tmpa
   2776                        LD (tmpa), tmpa
   2777                        GET %G, tmp2
   2778                        OP tmpa, tmp2
   2779                        PUT tmp2, %G
   2780 
   2781    If E is mem and OP is reversible
   2782                 -->    (getAddr E) -> tmpa
   2783                        LD (tmpa), tmpa
   2784                        OP %G, tmpa
   2785                        PUT tmpa, %G
   2786 */
static
ULong dis_op2_E_G ( VexAbiInfo* vbi,
                    Prefix      pfx,
                    Bool        addSubCarry,
                    IROp        op8,
                    Bool        keep,
                    Int         size,
                    Long        delta0,
                    HChar*      t_amd64opc )
{
   HChar   dis_buf[50];
   Int     len;
   IRType  ty   = szToITy(size);
   IRTemp  dst1 = newTemp(ty);          /* result of the operation */
   IRTemp  src  = newTemp(ty);          /* the E (reg-or-mem) operand */
   IRTemp  dst0 = newTemp(ty);          /* value of G before the op */
   UChar   rm   = getUChar(delta0);
   IRTemp  addr = IRTemp_INVALID;

   /* addSubCarry == True indicates the intended operation is
      add-with-carry or subtract-with-borrow. */
   if (addSubCarry) {
      vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
      vassert(keep);
   }

   if (epartIsReg(rm)) {
      /* Specially handle XOR reg,reg, because that doesn't really
         depend on reg, and doing the obvious thing potentially
         generates a spurious value check failure due to the bogus
         dependency. */
      if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
          && offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
         /* 'False &&' keeps this debug trace permanently disabled. */
         if (False && op8 == Iop_Sub8)
            vex_printf("vex amd64->IR: sbb %%r,%%r optimisation(1)\n");
         /* Zero G first; the getIRegG/getIRegE reads below then see
            0, breaking the false dependency on the old value. */
         putIRegG(size,pfx,rm, mkU(ty,0));
      }

      assign( dst0, getIRegG(size,pfx,rm) );
      assign( src,  getIRegE(size,pfx,rm) );

      if (addSubCarry && op8 == Iop_Add8) {
         /* ADC: the helper computes dst1 and updates the flag thunk;
            no memory store is needed for a register destination. */
         helper_ADC( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         /* SBB: likewise, via the subtract-with-borrow helper. */
         helper_SBB( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else {
         assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
         /* add/sub flag thunks need both operands; logic ops only
            need the result. */
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
         /* keep==False is the CMP/TEST family: flags only, no
            writeback. */
         if (keep)
            putIRegG(size, pfx, rm, mkexpr(dst1));
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          nameIRegE(size,pfx,rm),
                          nameIRegG(size,pfx,rm));
      return 1+delta0;
   } else {
      /* E refers to memory */
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign( dst0, getIRegG(size,pfx,rm) );
      assign( src,  loadLE(szToITy(size), mkexpr(addr)) );

      if (addSubCarry && op8 == Iop_Add8) {
         helper_ADC( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         helper_SBB( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else {
         assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
         if (keep)
            putIRegG(size, pfx, rm, mkexpr(dst1));
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          dis_buf, nameIRegG(size, pfx, rm));
      return len+delta0;
   }
}
   2881 
   2882 
   2883 
   2884 /* Handle binary integer instructions of the form
   2885       op G, E  meaning
   2886       op reg, reg-or-mem
   2887    Is passed the a ptr to the modRM byte, the actual operation, and the
   2888    data size.  Returns the address advanced completely over this
   2889    instruction.
   2890 
   2891    G(src) is reg.
   2892    E(dst) is reg-or-mem
   2893 
   2894    If E is reg, -->    GET %E,  tmp
   2895                        OP %G,   tmp
   2896                        PUT tmp, %E
   2897 
   2898    If E is mem, -->    (getAddr E) -> tmpa
   2899                        LD (tmpa), tmpv
   2900                        OP %G, tmpv
   2901                        ST tmpv, (tmpa)
   2902 */
static
ULong dis_op2_G_E ( VexAbiInfo* vbi,
                    Prefix      pfx,
                    Bool        addSubCarry,
                    IROp        op8,
                    Bool        keep,
                    Int         size,
                    Long        delta0,
                    HChar*      t_amd64opc )
{
   HChar   dis_buf[50];
   Int     len;
   IRType  ty   = szToITy(size);
   IRTemp  dst1 = newTemp(ty);          /* result of the operation */
   IRTemp  src  = newTemp(ty);          /* the G (register) operand */
   IRTemp  dst0 = newTemp(ty);          /* value of E before the op */
   UChar   rm   = getUChar(delta0);
   IRTemp  addr = IRTemp_INVALID;

   /* addSubCarry == True indicates the intended operation is
      add-with-carry or subtract-with-borrow. */
   if (addSubCarry) {
      vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
      vassert(keep);
   }

   if (epartIsReg(rm)) {
      /* Specially handle XOR reg,reg, because that doesn't really
         depend on reg, and doing the obvious thing potentially
         generates a spurious value check failure due to the bogus
         dependency.  Ditto SBB reg,reg. */
      if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
          && offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
         /* Zero E first; the reads below then see 0, breaking the
            false dependency on the old register value. */
         putIRegE(size,pfx,rm, mkU(ty,0));
      }

      assign(dst0, getIRegE(size,pfx,rm));
      assign(src,  getIRegG(size,pfx,rm));

      if (addSubCarry && op8 == Iop_Add8) {
         /* ADC: helper computes result and updates the flag thunk;
            register destination, so no store. */
         helper_ADC( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegE(size, pfx, rm, mkexpr(dst1));
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         helper_SBB( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegE(size, pfx, rm, mkexpr(dst1));
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         /* add/sub flag thunks need both operands; logic ops only
            need the result. */
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
         /* keep==False is the CMP/TEST family: flags only. */
         if (keep)
            putIRegE(size, pfx, rm, mkexpr(dst1));
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          nameIRegG(size,pfx,rm),
                          nameIRegE(size,pfx,rm));
      return 1+delta0;
   }

   /* E refers to memory */
   {
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign(dst0, loadLE(ty,mkexpr(addr)));
      assign(src,  getIRegG(size,pfx,rm));

      if (addSubCarry && op8 == Iop_Add8) {
         if (pfx & PFX_LOCK) {
            /* cas-style store: the helper stores via compare-and-swap
               against dst0, so a concurrent modification forces a
               restart at guest_RIP_curr_instr. */
            helper_ADC( size, dst1, dst0, src,
                        /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_ADC( size, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         if (pfx & PFX_LOCK) {
            /* cas-style store */
            helper_SBB( size, dst1, dst0, src,
                        /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_SBB( size, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (keep) {
            if (pfx & PFX_LOCK) {
               if (0) vex_printf("locked case\n" );
               /* LOCK prefix: store via CAS so atomicity violations
                  are detected and the insn restarted. */
               casLE( mkexpr(addr),
                      mkexpr(dst0)/*expval*/,
                      mkexpr(dst1)/*newval*/, guest_RIP_curr_instr );
            } else {
               if (0) vex_printf("nonlocked case\n");
               storeLE(mkexpr(addr), mkexpr(dst1));
            }
         }
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          nameIRegG(size,pfx,rm), dis_buf);
      return len+delta0;
   }
}
   3018 
   3019 
   3020 /* Handle move instructions of the form
   3021       mov E, G  meaning
   3022       mov reg-or-mem, reg
   3023    Is passed the a ptr to the modRM byte, and the data size.  Returns
   3024    the address advanced completely over this instruction.
   3025 
   3026    E(src) is reg-or-mem
   3027    G(dst) is reg.
   3028 
   3029    If E is reg, -->    GET %E,  tmpv
   3030                        PUT tmpv, %G
   3031 
   3032    If E is mem  -->    (getAddr E) -> tmpa
   3033                        LD (tmpa), tmpb
   3034                        PUT tmpb, %G
   3035 */
   3036 static
   3037 ULong dis_mov_E_G ( VexAbiInfo* vbi,
   3038                     Prefix      pfx,
   3039                     Int         size,
   3040                     Long        delta0 )
   3041 {
   3042    Int len;
   3043    UChar rm = getUChar(delta0);
   3044    HChar dis_buf[50];
   3045 
   3046    if (epartIsReg(rm)) {
   3047       putIRegG(size, pfx, rm, getIRegE(size, pfx, rm));
   3048       DIP("mov%c %s,%s\n", nameISize(size),
   3049                            nameIRegE(size,pfx,rm),
   3050                            nameIRegG(size,pfx,rm));
   3051       return 1+delta0;
   3052    }
   3053 
   3054    /* E refers to memory */
   3055    {
   3056       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   3057       putIRegG(size, pfx, rm, loadLE(szToITy(size), mkexpr(addr)));
   3058       DIP("mov%c %s,%s\n", nameISize(size),
   3059                            dis_buf,
   3060                            nameIRegG(size,pfx,rm));
   3061       return delta0+len;
   3062    }
   3063 }
   3064 
   3065 
   3066 /* Handle move instructions of the form
   3067       mov G, E  meaning
   3068       mov reg, reg-or-mem
   3069    Is passed the a ptr to the modRM byte, and the data size.  Returns
   3070    the address advanced completely over this instruction.
   3071 
   3072    G(src) is reg.
   3073    E(dst) is reg-or-mem
   3074 
   3075    If E is reg, -->    GET %G,  tmp
   3076                        PUT tmp, %E
   3077 
   3078    If E is mem, -->    (getAddr E) -> tmpa
   3079                        GET %G, tmpv
   3080                        ST tmpv, (tmpa)
   3081 */
   3082 static
   3083 ULong dis_mov_G_E ( VexAbiInfo* vbi,
   3084                     Prefix      pfx,
   3085                     Int         size,
   3086                     Long        delta0 )
   3087 {
   3088    Int len;
   3089    UChar rm = getUChar(delta0);
   3090    HChar dis_buf[50];
   3091 
   3092    if (epartIsReg(rm)) {
   3093       putIRegE(size, pfx, rm, getIRegG(size, pfx, rm));
   3094       DIP("mov%c %s,%s\n", nameISize(size),
   3095                            nameIRegG(size,pfx,rm),
   3096                            nameIRegE(size,pfx,rm));
   3097       return 1+delta0;
   3098    }
   3099 
   3100    /* E refers to memory */
   3101    {
   3102       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   3103       storeLE( mkexpr(addr), getIRegG(size, pfx, rm) );
   3104       DIP("mov%c %s,%s\n", nameISize(size),
   3105                            nameIRegG(size,pfx,rm),
   3106                            dis_buf);
   3107       return len+delta0;
   3108    }
   3109 }
   3110 
   3111 
   3112 /* op $immediate, AL/AX/EAX/RAX. */
   3113 static
   3114 ULong dis_op_imm_A ( Int    size,
   3115                      Bool   carrying,
   3116                      IROp   op8,
   3117                      Bool   keep,
   3118                      Long   delta,
   3119                      HChar* t_amd64opc )
   3120 {
   3121    Int    size4 = imin(size,4);
   3122    IRType ty    = szToITy(size);
   3123    IRTemp dst0  = newTemp(ty);
   3124    IRTemp src   = newTemp(ty);
   3125    IRTemp dst1  = newTemp(ty);
   3126    Long  lit    = getSDisp(size4,delta);
   3127    assign(dst0, getIRegRAX(size));
   3128    assign(src,  mkU(ty,lit & mkSizeMask(size)));
   3129 
   3130    if (isAddSub(op8) && !carrying) {
   3131       assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   3132       setFlags_DEP1_DEP2(op8, dst0, src, ty);
   3133    }
   3134    else
   3135    if (isLogic(op8)) {
   3136       vassert(!carrying);
   3137       assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   3138       setFlags_DEP1(op8, dst1, ty);
   3139    }
   3140    else
   3141    if (op8 == Iop_Add8 && carrying) {
   3142       helper_ADC( size, dst1, dst0, src,
   3143                   /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   3144    }
   3145    else
   3146    if (op8 == Iop_Sub8 && carrying) {
   3147       helper_SBB( size, dst1, dst0, src,
   3148                   /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   3149    }
   3150    else
   3151       vpanic("dis_op_imm_A(amd64,guest)");
   3152 
   3153    if (keep)
   3154       putIRegRAX(size, mkexpr(dst1));
   3155 
   3156    DIP("%s%c $%lld, %s\n", t_amd64opc, nameISize(size),
   3157                            lit, nameIRegRAX(size));
   3158    return delta+size4;
   3159 }
   3160 
   3161 
   3162 /* Sign- and Zero-extending moves. */
   3163 static
   3164 ULong dis_movx_E_G ( VexAbiInfo* vbi,
   3165                      Prefix pfx,
   3166                      Long delta, Int szs, Int szd, Bool sign_extend )
   3167 {
   3168    UChar rm = getUChar(delta);
   3169    if (epartIsReg(rm)) {
   3170       putIRegG(szd, pfx, rm,
   3171                     doScalarWidening(
   3172                        szs,szd,sign_extend,
   3173                        getIRegE(szs,pfx,rm)));
   3174       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
   3175                                nameISize(szs),
   3176                                nameISize(szd),
   3177                                nameIRegE(szs,pfx,rm),
   3178                                nameIRegG(szd,pfx,rm));
   3179       return 1+delta;
   3180    }
   3181 
   3182    /* E refers to memory */
   3183    {
   3184       Int    len;
   3185       HChar  dis_buf[50];
   3186       IRTemp addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
   3187       putIRegG(szd, pfx, rm,
   3188                     doScalarWidening(
   3189                        szs,szd,sign_extend,
   3190                        loadLE(szToITy(szs),mkexpr(addr))));
   3191       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
   3192                                nameISize(szs),
   3193                                nameISize(szd),
   3194                                dis_buf,
   3195                                nameIRegG(szd,pfx,rm));
   3196       return len+delta;
   3197    }
   3198 }
   3199 
   3200 
/* Generate code to divide ArchRegs RDX:RAX / EDX:EAX / DX:AX / AX by
   the 64 / 32 / 16 / 8 bit quantity in the given IRTemp.  The
   quotient goes to RAX (or its sub-register) and the remainder to RDX
   (or, for the 8-bit case, to AH).  Divide-by-zero / overflow
   trapping is not modelled here. */
static
void codegen_div ( Int sz, IRTemp t, Bool signed_divide )
{
   /* special-case the 64-bit case */
   if (sz == 8) {
      /* 128-by-64 divide: RDX:RAX / t -> quotient in RAX,
         remainder in RDX. */
      IROp   op     = signed_divide ? Iop_DivModS128to64
                                    : Iop_DivModU128to64;
      IRTemp src128 = newTemp(Ity_I128);
      IRTemp dst128 = newTemp(Ity_I128);
      assign( src128, binop(Iop_64HLto128,
                            getIReg64(R_RDX),
                            getIReg64(R_RAX)) );
      assign( dst128, binop(op, mkexpr(src128), mkexpr(t)) );
      /* DivMod packs remainder in the high half, quotient in the low. */
      putIReg64( R_RAX, unop(Iop_128to64,mkexpr(dst128)) );
      putIReg64( R_RDX, unop(Iop_128HIto64,mkexpr(dst128)) );
   } else {
      /* All narrower cases are funnelled through the 64-by-32 DivMod,
         widening dividend and divisor as needed. */
      IROp   op    = signed_divide ? Iop_DivModS64to32
                                   : Iop_DivModU64to32;
      IRTemp src64 = newTemp(Ity_I64);
      IRTemp dst64 = newTemp(Ity_I64);
      switch (sz) {
      case 4:
         /* EDX:EAX / t -> quotient EAX, remainder EDX. */
         assign( src64,
                 binop(Iop_32HLto64, getIRegRDX(4), getIRegRAX(4)) );
         assign( dst64,
                 binop(op, mkexpr(src64), mkexpr(t)) );
         putIRegRAX( 4, unop(Iop_64to32,mkexpr(dst64)) );
         putIRegRDX( 4, unop(Iop_64HIto32,mkexpr(dst64)) );
         break;
      case 2: {
         /* DX:AX / t -> quotient AX, remainder DX.  Widen the 32-bit
            dividend and the 16-bit divisor, signedness-appropriately. */
         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
         assign( src64, unop(widen3264,
                             binop(Iop_16HLto32,
                                   getIRegRDX(2),
                                   getIRegRAX(2))) );
         assign( dst64, binop(op, mkexpr(src64), unop(widen1632,mkexpr(t))) );
         putIRegRAX( 2, unop(Iop_32to16,unop(Iop_64to32,mkexpr(dst64))) );
         putIRegRDX( 2, unop(Iop_32to16,unop(Iop_64HIto32,mkexpr(dst64))) );
         break;
      }
      case 1: {
         /* AX / t -> quotient AL, remainder AH.  The 16-bit dividend
            lives entirely in AX; note the remainder goes to AH, not DX. */
         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
         IROp widen816  = signed_divide ? Iop_8Sto16  : Iop_8Uto16;
         assign( src64, unop(widen3264,
                        unop(widen1632, getIRegRAX(2))) );
         assign( dst64,
                 binop(op, mkexpr(src64),
                           unop(widen1632, unop(widen816, mkexpr(t)))) );
         putIRegRAX( 1, unop(Iop_16to8,
                        unop(Iop_32to16,
                        unop(Iop_64to32,mkexpr(dst64)))) );
         putIRegAH( unop(Iop_16to8,
                    unop(Iop_32to16,
                    unop(Iop_64HIto32,mkexpr(dst64)))) );
         break;
      }
      default:
         vpanic("codegen_div(amd64)");
      }
   }
}
   3266 
/* Group 1 extended opcodes: ADD/OR/ADC/SBB/AND/SUB/XOR/CMP of an
   immediate (d64, already fetched) with E (reg or memory).  The
   sub-opcode lives in the reg field of modrm.  am_sz/d_sz are the
   sizes of the addressing-mode and immediate bytes, sz the operand
   size.  delta points at the modrm byte; returns the updated delta. */
static
ULong dis_Grp1 ( VexAbiInfo* vbi,
                 Prefix pfx,
                 Long delta, UChar modrm,
                 Int am_sz, Int d_sz, Int sz, Long d64 )
{
   Int     len;
   HChar   dis_buf[50];
   IRType  ty   = szToITy(sz);
   IRTemp  dst1 = newTemp(ty);   /* result */
   IRTemp  src  = newTemp(ty);   /* the immediate */
   IRTemp  dst0 = newTemp(ty);   /* E before the op */
   IRTemp  addr = IRTemp_INVALID;
   IROp    op8  = Iop_INVALID;
   ULong   mask = mkSizeMask(sz);

   /* Map the sub-opcode to an 8-bit IROp.  ADC/SBB (2/3) are handled
      by dedicated helpers below, so op8 stays INVALID for them.
      Case 7 is CMP: same op as SUB but the result is discarded. */
   switch (gregLO3ofRM(modrm)) {
      case 0: op8 = Iop_Add8; break;  case 1: op8 = Iop_Or8;  break;
      case 2: break;  // ADC
      case 3: break;  // SBB
      case 4: op8 = Iop_And8; break;  case 5: op8 = Iop_Sub8; break;
      case 6: op8 = Iop_Xor8; break;  case 7: op8 = Iop_Sub8; break;
      /*NOTREACHED*/
      default: vpanic("dis_Grp1(amd64): unhandled case");
   }

   if (epartIsReg(modrm)) {
      /* Register destination: no memory access, no LOCK concerns. */
      vassert(am_sz == 1);

      assign(dst0, getIRegE(sz,pfx,modrm));
      assign(src,  mkU(ty,d64 & mask));

      if (gregLO3ofRM(modrm) == 2 /* ADC */) {
         helper_ADC( sz, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
      } else
      if (gregLO3ofRM(modrm) == 3 /* SBB */) {
         helper_SBB( sz, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      /* CMP (7) sets flags only; everything else writes back. */
      if (gregLO3ofRM(modrm) < 7)
         putIRegE(sz, pfx, modrm, mkexpr(dst1));

      delta += (am_sz + d_sz);
      DIP("%s%c $%lld, %s\n",
          nameGrp1(gregLO3ofRM(modrm)), nameISize(sz), d64,
          nameIRegE(sz,pfx,modrm));
   } else {
      /* Memory destination: LOCKed forms use a compare-and-swap so
         that a concurrent modification causes a restart. */
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /*xtra*/d_sz );

      assign(dst0, loadLE(ty,mkexpr(addr)));
      assign(src, mkU(ty,d64 & mask));

      if (gregLO3ofRM(modrm) == 2 /* ADC */) {
         if (pfx & PFX_LOCK) {
            /* cas-style store */
            helper_ADC( sz, dst1, dst0, src,
                       /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_ADC( sz, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else
      if (gregLO3ofRM(modrm) == 3 /* SBB */) {
         if (pfx & PFX_LOCK) {
            /* cas-style store */
            helper_SBB( sz, dst1, dst0, src,
                       /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_SBB( sz, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         /* CMP (7) does not store; others store normally or via CAS. */
         if (gregLO3ofRM(modrm) < 7) {
            if (pfx & PFX_LOCK) {
               casLE( mkexpr(addr), mkexpr(dst0)/*expVal*/,
                                    mkexpr(dst1)/*newVal*/,
                                    guest_RIP_curr_instr );
            } else {
               storeLE(mkexpr(addr), mkexpr(dst1));
            }
         }
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      delta += (len+d_sz);
      DIP("%s%c $%lld, %s\n",
          nameGrp1(gregLO3ofRM(modrm)), nameISize(sz),
          d64, dis_buf);
   }
   return delta;
}
   3372 
   3373 
/* Group 2 extended opcodes (ROL/ROR/RCL/RCR/SHL/SHR/SAL/SAR).
   shift_expr must be an 8-bit typed expression giving the
   shift/rotate amount; shift_expr_txt, if non-NULL, is its printable
   form for tracing.  delta points at the modrm byte; *decode_OK is
   set False on undecodable forms.  Returns the updated delta. */

static
ULong dis_Grp2 ( VexAbiInfo* vbi,
                 Prefix pfx,
                 Long delta, UChar modrm,
                 Int am_sz, Int d_sz, Int sz, IRExpr* shift_expr,
                 HChar* shift_expr_txt, Bool* decode_OK )
{
   /* delta on entry points at the modrm byte. */
   HChar  dis_buf[50];
   Int    len;
   Bool   isShift, isRotate, isRotateC;
   IRType ty    = szToITy(sz);
   IRTemp dst0  = newTemp(ty);   /* value before the operation */
   IRTemp dst1  = newTemp(ty);   /* value after the operation */
   IRTemp addr  = IRTemp_INVALID;

   *decode_OK = True;

   vassert(sz == 1 || sz == 2 || sz == 4 || sz == 8);

   /* Put value to shift/rotate in dst0. */
   if (epartIsReg(modrm)) {
      assign(dst0, getIRegE(sz, pfx, modrm));
      delta += (am_sz + d_sz);
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /*xtra*/d_sz );
      assign(dst0, loadLE(ty,mkexpr(addr)));
      delta += len + d_sz;
   }

   /* Classify the sub-opcode: 4..7 = SHL/SHR/SAL/SAR, 0..1 = ROL/ROR,
      2..3 = RCL/RCR. */
   isShift = False;
   switch (gregLO3ofRM(modrm)) { case 4: case 5: case 6: case 7: isShift = True; }

   isRotate = False;
   switch (gregLO3ofRM(modrm)) { case 0: case 1: isRotate = True; }

   isRotateC = False;
   switch (gregLO3ofRM(modrm)) { case 2: case 3: isRotateC = True; }

   if (!isShift && !isRotate && !isRotateC) {
      /*NOTREACHED*/
      vpanic("dis_Grp2(Reg): unhandled case(amd64)");
   }

   if (isRotateC) {
      /* Call a helper; this insn is so ridiculous it does not deserve
         better.  One problem is, the helper has to calculate both the
         new value and the new flags.  This is more than 64 bits, and
         there is no way to return more than 64 bits from the helper.
         Hence the crude and obvious solution is to call it twice,
         using the sign of the sz field to indicate whether it is the
         value or rflags result we want.
      */
      Bool     left = toBool(gregLO3ofRM(modrm) == 2);
      IRExpr** argsVALUE;
      IRExpr** argsRFLAGS;

      IRTemp new_value  = newTemp(Ity_I64);
      IRTemp new_rflags = newTemp(Ity_I64);
      IRTemp old_rflags = newTemp(Ity_I64);

      assign( old_rflags, widenUto64(mk_amd64g_calculate_rflags_all()) );

      /* First call: positive sz asks the helper for the rotated value. */
      argsVALUE
         = mkIRExprVec_4( widenUto64(mkexpr(dst0)), /* thing to rotate */
                          widenUto64(shift_expr),   /* rotate amount */
                          mkexpr(old_rflags),
                          mkU64(sz) );
      assign( new_value,
                 mkIRExprCCall(
                    Ity_I64,
                    0/*regparm*/,
                    left ? "amd64g_calculate_RCL" : "amd64g_calculate_RCR",
                    left ? &amd64g_calculate_RCL  : &amd64g_calculate_RCR,
                    argsVALUE
                 )
            );

      /* Second call: negated sz asks for the resulting rflags. */
      argsRFLAGS
         = mkIRExprVec_4( widenUto64(mkexpr(dst0)), /* thing to rotate */
                          widenUto64(shift_expr),   /* rotate amount */
                          mkexpr(old_rflags),
                          mkU64(-sz) );
      assign( new_rflags,
                 mkIRExprCCall(
                    Ity_I64,
                    0/*regparm*/,
                    left ? "amd64g_calculate_RCL" : "amd64g_calculate_RCR",
                    left ? &amd64g_calculate_RCL  : &amd64g_calculate_RCR,
                    argsRFLAGS
                 )
            );

      /* Install the new value and copy the helper-computed flags
         verbatim via the COPY thunk. */
      assign( dst1, narrowTo(ty, mkexpr(new_value)) );
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(new_rflags) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   }

   else
   if (isShift) {

      IRTemp pre64     = newTemp(Ity_I64);
      IRTemp res64     = newTemp(Ity_I64);
      IRTemp res64ss   = newTemp(Ity_I64);
      IRTemp shift_amt = newTemp(Ity_I8);
      UChar  mask      = toUChar(sz==8 ? 63 : 31);
      IROp   op64;

      /* Note case 6 (undocumented SAL) behaves identically to SHL. */
      switch (gregLO3ofRM(modrm)) {
         case 4: op64 = Iop_Shl64; break;
         case 5: op64 = Iop_Shr64; break;
         case 6: op64 = Iop_Shl64; break;
         case 7: op64 = Iop_Sar64; break;
         /*NOTREACHED*/
         default: vpanic("dis_Grp2:shift"); break;
      }

      /* Widen the value to be shifted to 64 bits, do the shift, and
         narrow back down.  This seems surprisingly long-winded, but
         unfortunately the AMD semantics requires that 8/16/32-bit
         shifts give defined results for shift values all the way up
         to 32, and this seems the simplest way to do it.  It has the
         advantage that the only IR level shifts generated are of 64
         bit values, and the shift amount is guaranteed to be in the
         range 0 .. 63, thereby observing the IR semantics requiring
         all shift values to be in the range 0 .. 2^word_size-1.

         Therefore the shift amount is masked with 63 for 64-bit shifts
         and 31 for all others.
      */
      /* shift_amt = shift_expr & MASK, regardless of operation size */
      assign( shift_amt, binop(Iop_And8, shift_expr, mkU8(mask)) );

      /* suitably widen the value to be shifted to 64 bits. */
      assign( pre64, op64==Iop_Sar64 ? widenSto64(mkexpr(dst0))
                                     : widenUto64(mkexpr(dst0)) );

      /* res64 = pre64 `shift` shift_amt */
      assign( res64, binop(op64, mkexpr(pre64), mkexpr(shift_amt)) );

      /* res64ss = pre64 `shift` ((shift_amt - 1) & MASK) */
      /* The "shifted one less" value is needed by the flags thunk to
         recover the last bit shifted out (the new C flag). */
      assign( res64ss,
              binop(op64,
                    mkexpr(pre64),
                    binop(Iop_And8,
                          binop(Iop_Sub8,
                                mkexpr(shift_amt), mkU8(1)),
                          mkU8(mask))) );

      /* Build the flags thunk. */
      setFlags_DEP1_DEP2_shift(op64, res64, res64ss, ty, shift_amt);

      /* Narrow the result back down. */
      assign( dst1, narrowTo(ty, mkexpr(res64)) );

   } /* if (isShift) */

   else
   if (isRotate) {
      Int    ccOp      = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1
                                        : (ty==Ity_I32 ? 2 : 3));
      Bool   left      = toBool(gregLO3ofRM(modrm) == 0);
      IRTemp rot_amt   = newTemp(Ity_I8);
      IRTemp rot_amt64 = newTemp(Ity_I8);
      IRTemp oldFlags  = newTemp(Ity_I64);
      UChar  mask      = toUChar(sz==8 ? 63 : 31);

      /* rot_amt = shift_expr & mask */
      /* By masking the rotate amount thusly, the IR-level Shl/Shr
         expressions never shift beyond the word size and thus remain
         well defined. */
      assign(rot_amt64, binop(Iop_And8, shift_expr, mkU8(mask)));

      if (ty == Ity_I64)
         assign(rot_amt, mkexpr(rot_amt64));
      else
         assign(rot_amt, binop(Iop_And8, mkexpr(rot_amt64), mkU8(8*sz-1)));

      if (left) {

         /* dst1 = (dst0 << rot_amt) | (dst0 >>u (wordsize-rot_amt)) */
         assign(dst1,
            binop( mkSizedOp(ty,Iop_Or8),
                   binop( mkSizedOp(ty,Iop_Shl8),
                          mkexpr(dst0),
                          mkexpr(rot_amt)
                   ),
                   binop( mkSizedOp(ty,Iop_Shr8),
                          mkexpr(dst0),
                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
                   )
            )
         );
         ccOp += AMD64G_CC_OP_ROLB;

      } else { /* right */

         /* dst1 = (dst0 >>u rot_amt) | (dst0 << (wordsize-rot_amt)) */
         assign(dst1,
            binop( mkSizedOp(ty,Iop_Or8),
                   binop( mkSizedOp(ty,Iop_Shr8),
                          mkexpr(dst0),
                          mkexpr(rot_amt)
                   ),
                   binop( mkSizedOp(ty,Iop_Shl8),
                          mkexpr(dst0),
                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
                   )
            )
         );
         ccOp += AMD64G_CC_OP_RORB;

      }

      /* dst1 now holds the rotated value.  Build flag thunk.  We
         need the resulting value for this, and the previous flags.
         Except don't set it if the rotate count is zero. */

      assign(oldFlags, mk_amd64g_calculate_rflags_all());

      /* CC_DEP1 is the rotated value.  CC_NDEP is flags before. */
      /* Each Mux0X keeps the existing thunk field when rot_amt64 == 0,
         so a zero-count rotate leaves the flags untouched. */
      stmt( IRStmt_Put( OFFB_CC_OP,
                        IRExpr_Mux0X( mkexpr(rot_amt64),
                                      IRExpr_Get(OFFB_CC_OP,Ity_I64),
                                      mkU64(ccOp))) );
      stmt( IRStmt_Put( OFFB_CC_DEP1,
                        IRExpr_Mux0X( mkexpr(rot_amt64),
                                      IRExpr_Get(OFFB_CC_DEP1,Ity_I64),
                                      widenUto64(mkexpr(dst1)))) );
      stmt( IRStmt_Put( OFFB_CC_DEP2,
                        IRExpr_Mux0X( mkexpr(rot_amt64),
                                      IRExpr_Get(OFFB_CC_DEP2,Ity_I64),
                                      mkU64(0))) );
      stmt( IRStmt_Put( OFFB_CC_NDEP,
                        IRExpr_Mux0X( mkexpr(rot_amt64),
                                      IRExpr_Get(OFFB_CC_NDEP,Ity_I64),
                                      mkexpr(oldFlags))) );
   } /* if (isRotate) */

   /* Save result, and finish up. */
   if (epartIsReg(modrm)) {
      putIRegE(sz, pfx, modrm, mkexpr(dst1));
      if (vex_traceflags & VEX_TRACE_FE) {
         vex_printf("%s%c ",
                    nameGrp2(gregLO3ofRM(modrm)), nameISize(sz) );
         if (shift_expr_txt)
            vex_printf("%s", shift_expr_txt);
         else
            ppIRExpr(shift_expr);
         vex_printf(", %s\n", nameIRegE(sz,pfx,modrm));
      }
   } else {
      storeLE(mkexpr(addr), mkexpr(dst1));
      if (vex_traceflags & VEX_TRACE_FE) {
         vex_printf("%s%c ",
                    nameGrp2(gregLO3ofRM(modrm)), nameISize(sz) );
         if (shift_expr_txt)
            vex_printf("%s", shift_expr_txt);
         else
            ppIRExpr(shift_expr);
         vex_printf(", %s\n", dis_buf);
      }
   }
   return delta;
}
   3644 
   3645 
   3646 /* Group 8 extended opcodes (but BT/BTS/BTC/BTR only). */
   3647 static
   3648 ULong dis_Grp8_Imm ( VexAbiInfo* vbi,
   3649                      Prefix pfx,
   3650                      Long delta, UChar modrm,
   3651                      Int am_sz, Int sz, ULong src_val,
   3652                      Bool* decode_OK )
   3653 {
   3654    /* src_val denotes a d8.
   3655       And delta on entry points at the modrm byte. */
   3656 
   3657    IRType ty     = szToITy(sz);
   3658    IRTemp t2     = newTemp(Ity_I64);
   3659    IRTemp t2m    = newTemp(Ity_I64);
   3660    IRTemp t_addr = IRTemp_INVALID;
   3661    HChar  dis_buf[50];
   3662    ULong  mask;
   3663 
   3664    /* we're optimists :-) */
   3665    *decode_OK = True;
   3666 
   3667    /* Limit src_val -- the bit offset -- to something within a word.
   3668       The Intel docs say that literal offsets larger than a word are
   3669       masked in this way. */
   3670    switch (sz) {
   3671       case 2:  src_val &= 15; break;
   3672       case 4:  src_val &= 31; break;
   3673       case 8:  src_val &= 63; break;
   3674       default: *decode_OK = False; return delta;
   3675    }
   3676 
   3677    /* Invent a mask suitable for the operation. */
   3678    switch (gregLO3ofRM(modrm)) {
   3679       case 4: /* BT */  mask = 0;                  break;
   3680       case 5: /* BTS */ mask = 1ULL << src_val;    break;
   3681       case 6: /* BTR */ mask = ~(1ULL << src_val); break;
   3682       case 7: /* BTC */ mask = 1ULL << src_val;    break;
   3683          /* If this needs to be extended, probably simplest to make a
   3684             new function to handle the other cases (0 .. 3).  The
   3685             Intel docs do however not indicate any use for 0 .. 3, so
   3686             we don't expect this to happen. */
   3687       default: *decode_OK = False; return delta;
   3688    }
   3689 
   3690    /* Fetch the value to be tested and modified into t2, which is
   3691       64-bits wide regardless of sz. */
   3692    if (epartIsReg(modrm)) {
   3693       vassert(am_sz == 1);
   3694       assign( t2, widenUto64(getIRegE(sz, pfx, modrm)) );
   3695       delta += (am_sz + 1);
   3696       DIP("%s%c $0x%llx, %s\n", nameGrp8(gregLO3ofRM(modrm)),
   3697                                 nameISize(sz),
   3698                                 src_val, nameIRegE(sz,pfx,modrm));
   3699    } else {
   3700       Int len;
   3701       t_addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 1 );
   3702       delta  += (len+1);
   3703       assign( t2, widenUto64(loadLE(ty, mkexpr(t_addr))) );
   3704       DIP("%s%c $0x%llx, %s\n", nameGrp8(gregLO3ofRM(modrm)),
   3705                                 nameISize(sz),
   3706                                 src_val, dis_buf);
   3707    }
   3708 
   3709    /* Compute the new value into t2m, if non-BT. */
   3710    switch (gregLO3ofRM(modrm)) {
   3711       case 4: /* BT */
   3712          break;
   3713       case 5: /* BTS */
   3714          assign( t2m, binop(Iop_Or64, mkU64(mask), mkexpr(t2)) );
   3715          break;
   3716       case 6: /* BTR */
   3717          assign( t2m, binop(Iop_And64, mkU64(mask), mkexpr(t2)) );
   3718          break;
   3719       case 7: /* BTC */
   3720          assign( t2m, binop(Iop_Xor64, mkU64(mask), mkexpr(t2)) );
   3721          break;
   3722      default:
   3723          /*NOTREACHED*/ /*the previous switch guards this*/
   3724          vassert(0);
   3725    }
   3726 
   3727    /* Write the result back, if non-BT. */
   3728    if (gregLO3ofRM(modrm) != 4 /* BT */) {
   3729       if (epartIsReg(modrm)) {
   3730 	putIRegE(sz, pfx, modrm, narrowTo(ty, mkexpr(t2m)));
   3731       } else {
   3732          if (pfx & PFX_LOCK) {
   3733             casLE( mkexpr(t_addr),
   3734                    narrowTo(ty, mkexpr(t2))/*expd*/,
   3735                    narrowTo(ty, mkexpr(t2m))/*new*/,
   3736                    guest_RIP_curr_instr );
   3737          } else {
   3738             storeLE(mkexpr(t_addr), narrowTo(ty, mkexpr(t2m)));
   3739          }
   3740       }
   3741    }
   3742 
   3743    /* Copy relevant bit from t2 into the carry flag. */
   3744    /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   3745    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   3746    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   3747    stmt( IRStmt_Put(
   3748             OFFB_CC_DEP1,
   3749             binop(Iop_And64,
   3750                   binop(Iop_Shr64, mkexpr(t2), mkU8(src_val)),
   3751                   mkU64(1))
   3752        ));
   3753    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   3754       elimination of previous stores to this field work better. */
   3755    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   3756 
   3757    return delta;
   3758 }
   3759 
   3760 
   3761 /* Signed/unsigned widening multiply.  Generate IR to multiply the
   3762    value in RAX/EAX/AX/AL by the given IRTemp, and park the result in
   3763    RDX:RAX/EDX:EAX/DX:AX/AX.
   3764 */
   3765 static void codegen_mulL_A_D ( Int sz, Bool syned,
   3766                                IRTemp tmp, HChar* tmp_txt )
   3767 {
   3768    IRType ty = szToITy(sz);
   3769    IRTemp t1 = newTemp(ty);
   3770 
   3771    assign( t1, getIRegRAX(sz) );
   3772 
   3773    switch (ty) {
   3774       case Ity_I64: {
   3775          IRTemp res128  = newTemp(Ity_I128);
   3776          IRTemp resHi   = newTemp(Ity_I64);
   3777          IRTemp resLo   = newTemp(Ity_I64);
   3778          IROp   mulOp   = syned ? Iop_MullS64 : Iop_MullU64;
   3779          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   3780          setFlags_MUL ( Ity_I64, t1, tmp, tBaseOp );
   3781          assign( res128, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   3782          assign( resHi, unop(Iop_128HIto64,mkexpr(res128)));
   3783          assign( resLo, unop(Iop_128to64,mkexpr(res128)));
   3784          putIReg64(R_RDX, mkexpr(resHi));
   3785          putIReg64(R_RAX, mkexpr(resLo));
   3786          break;
   3787       }
   3788       case Ity_I32: {
   3789          IRTemp res64   = newTemp(Ity_I64);
   3790          IRTemp resHi   = newTemp(Ity_I32);
   3791          IRTemp resLo   = newTemp(Ity_I32);
   3792          IROp   mulOp   = syned ? Iop_MullS32 : Iop_MullU32;
   3793          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   3794          setFlags_MUL ( Ity_I32, t1, tmp, tBaseOp );
   3795          assign( res64, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   3796          assign( resHi, unop(Iop_64HIto32,mkexpr(res64)));
   3797          assign( resLo, unop(Iop_64to32,mkexpr(res64)));
   3798          putIRegRDX(4, mkexpr(resHi));
   3799          putIRegRAX(4, mkexpr(resLo));
   3800          break;
   3801       }
   3802       case Ity_I16: {
   3803          IRTemp res32   = newTemp(Ity_I32);
   3804          IRTemp resHi   = newTemp(Ity_I16);
   3805          IRTemp resLo   = newTemp(Ity_I16);
   3806          IROp   mulOp   = syned ? Iop_MullS16 : Iop_MullU16;
   3807          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   3808          setFlags_MUL ( Ity_I16, t1, tmp, tBaseOp );
   3809          assign( res32, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   3810          assign( resHi, unop(Iop_32HIto16,mkexpr(res32)));
   3811          assign( resLo, unop(Iop_32to16,mkexpr(res32)));
   3812          putIRegRDX(2, mkexpr(resHi));
   3813          putIRegRAX(2, mkexpr(resLo));
   3814          break;
   3815       }
   3816       case Ity_I8: {
   3817          IRTemp res16   = newTemp(Ity_I16);
   3818          IRTemp resHi   = newTemp(Ity_I8);
   3819          IRTemp resLo   = newTemp(Ity_I8);
   3820          IROp   mulOp   = syned ? Iop_MullS8 : Iop_MullU8;
   3821          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   3822          setFlags_MUL ( Ity_I8, t1, tmp, tBaseOp );
   3823          assign( res16, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   3824          assign( resHi, unop(Iop_16HIto8,mkexpr(res16)));
   3825          assign( resLo, unop(Iop_16to8,mkexpr(res16)));
   3826          putIRegRAX(2, mkexpr(res16));
   3827          break;
   3828       }
   3829       default:
   3830          ppIRType(ty);
   3831          vpanic("codegen_mulL_A_D(amd64)");
   3832    }
   3833    DIP("%s%c %s\n", syned ? "imul" : "mul", nameISize(sz), tmp_txt);
   3834 }
   3835 
   3836 
   3837 /* Group 3 extended opcodes. */
   3838 static
   3839 ULong dis_Grp3 ( VexAbiInfo* vbi,
   3840                  Prefix pfx, Int sz, Long delta, Bool* decode_OK )
   3841 {
   3842    Long    d64;
   3843    UChar   modrm;
   3844    HChar   dis_buf[50];
   3845    Int     len;
   3846    IRTemp  addr;
   3847    IRType  ty = szToITy(sz);
   3848    IRTemp  t1 = newTemp(ty);
   3849    IRTemp dst1, src, dst0;
   3850    *decode_OK = True;
   3851    modrm = getUChar(delta);
   3852    if (epartIsReg(modrm)) {
   3853       switch (gregLO3ofRM(modrm)) {
   3854          case 0: { /* TEST */
   3855             delta++;
   3856             d64 = getSDisp(imin(4,sz), delta);
   3857             delta += imin(4,sz);
   3858             dst1 = newTemp(ty);
   3859             assign(dst1, binop(mkSizedOp(ty,Iop_And8),
   3860                                getIRegE(sz,pfx,modrm),
   3861                                mkU(ty, d64 & mkSizeMask(sz))));
   3862             setFlags_DEP1( Iop_And8, dst1, ty );
   3863             DIP("test%c $%lld, %s\n",
   3864                 nameISize(sz), d64,
   3865                 nameIRegE(sz, pfx, modrm));
   3866             break;
   3867          }
   3868          case 1:
   3869             *decode_OK = False;
   3870             return delta;
   3871          case 2: /* NOT */
   3872             delta++;
   3873             putIRegE(sz, pfx, modrm,
   3874                               unop(mkSizedOp(ty,Iop_Not8),
   3875                                    getIRegE(sz, pfx, modrm)));
   3876             DIP("not%c %s\n", nameISize(sz),
   3877                               nameIRegE(sz, pfx, modrm));
   3878             break;
   3879          case 3: /* NEG */
   3880             delta++;
   3881             dst0 = newTemp(ty);
   3882             src  = newTemp(ty);
   3883             dst1 = newTemp(ty);
   3884             assign(dst0, mkU(ty,0));
   3885             assign(src,  getIRegE(sz, pfx, modrm));
   3886             assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0),
   3887                                                        mkexpr(src)));
   3888             setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
   3889             putIRegE(sz, pfx, modrm, mkexpr(dst1));
   3890             DIP("neg%c %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm));
   3891             break;
   3892          case 4: /* MUL (unsigned widening) */
   3893             delta++;
   3894             src = newTemp(ty);
   3895             assign(src, getIRegE(sz,pfx,modrm));
   3896             codegen_mulL_A_D ( sz, False, src,
   3897                                nameIRegE(sz,pfx,modrm) );
   3898             break;
   3899          case 5: /* IMUL (signed widening) */
   3900             delta++;
   3901             src = newTemp(ty);
   3902             assign(src, getIRegE(sz,pfx,modrm));
   3903             codegen_mulL_A_D ( sz, True, src,
   3904                                nameIRegE(sz,pfx,modrm) );
   3905             break;
   3906          case 6: /* DIV */
   3907             delta++;
   3908             assign( t1, getIRegE(sz, pfx, modrm) );
   3909             codegen_div ( sz, t1, False );
   3910             DIP("div%c %s\n", nameISize(sz),
   3911                               nameIRegE(sz, pfx, modrm));
   3912             break;
   3913          case 7: /* IDIV */
   3914             delta++;
   3915             assign( t1, getIRegE(sz, pfx, modrm) );
   3916             codegen_div ( sz, t1, True );
   3917             DIP("idiv%c %s\n", nameISize(sz),
   3918                                nameIRegE(sz, pfx, modrm));
   3919             break;
   3920          default:
   3921             /*NOTREACHED*/
   3922             vpanic("Grp3(amd64,R)");
   3923       }
   3924    } else {
   3925       addr = disAMode ( &len, vbi, pfx, delta, dis_buf,
   3926                         /* we have to inform disAMode of any immediate
   3927 			   bytes used */
   3928                         gregLO3ofRM(modrm)==0/*TEST*/
   3929                            ? imin(4,sz)
   3930                            : 0
   3931                       );
   3932       t1   = newTemp(ty);
   3933       delta += len;
   3934       assign(t1, loadLE(ty,mkexpr(addr)));
   3935       switch (gregLO3ofRM(modrm)) {
   3936          case 0: { /* TEST */
   3937             d64 = getSDisp(imin(4,sz), delta);
   3938             delta += imin(4,sz);
   3939             dst1 = newTemp(ty);
   3940             assign(dst1, binop(mkSizedOp(ty,Iop_And8),
   3941                                mkexpr(t1),
   3942                                mkU(ty, d64 & mkSizeMask(sz))));
   3943             setFlags_DEP1( Iop_And8, dst1, ty );
   3944             DIP("test%c $%lld, %s\n", nameISize(sz), d64, dis_buf);
   3945             break;
   3946          }
   3947          case 1:
   3948             *decode_OK = False;
   3949             return delta;
   3950          case 2: /* NOT */
   3951             dst1 = newTemp(ty);
   3952             assign(dst1, unop(mkSizedOp(ty,Iop_Not8), mkexpr(t1)));
   3953             if (pfx & PFX_LOCK) {
   3954                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
   3955                                     guest_RIP_curr_instr );
   3956             } else {
   3957                storeLE( mkexpr(addr), mkexpr(dst1) );
   3958             }
   3959             DIP("not%c %s\n", nameISize(sz), dis_buf);
   3960             break;
   3961          case 3: /* NEG */
   3962             dst0 = newTemp(ty);
   3963             src  = newTemp(ty);
   3964             dst1 = newTemp(ty);
   3965             assign(dst0, mkU(ty,0));
   3966             assign(src,  mkexpr(t1));
   3967             assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0),
   3968                                                        mkexpr(src)));
   3969             if (pfx & PFX_LOCK) {
   3970                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
   3971                                     guest_RIP_curr_instr );
   3972             } else {
   3973                storeLE( mkexpr(addr), mkexpr(dst1) );
   3974             }
   3975             setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
   3976             DIP("neg%c %s\n", nameISize(sz), dis_buf);
   3977             break;
   3978          case 4: /* MUL (unsigned widening) */
   3979             codegen_mulL_A_D ( sz, False, t1, dis_buf );
   3980             break;
   3981          case 5: /* IMUL */
   3982             codegen_mulL_A_D ( sz, True, t1, dis_buf );
   3983             break;
   3984          case 6: /* DIV */
   3985             codegen_div ( sz, t1, False );
   3986             DIP("div%c %s\n", nameISize(sz), dis_buf);
   3987             break;
   3988          case 7: /* IDIV */
   3989             codegen_div ( sz, t1, True );
   3990             DIP("idiv%c %s\n", nameISize(sz), dis_buf);
   3991             break;
   3992          default:
   3993             /*NOTREACHED*/
   3994             vpanic("Grp3(amd64,M)");
   3995       }
   3996    }
   3997    return delta;
   3998 }
   3999 
   4000 
/* Group 4 extended opcodes: INC Eb and DEC Eb, selected by the reg
   field of the modRM byte (/0 = INC, /1 = DEC); both operate on a
   byte.  Returns the updated instruction offset (delta).  For any
   other reg-field value, sets *decode_OK to False and returns delta
   unchanged. */
static
ULong dis_Grp4 ( VexAbiInfo* vbi,
                 Prefix pfx, Long delta, Bool* decode_OK )
{
   Int   alen;
   UChar modrm;
   HChar dis_buf[50];
   IRType ty = Ity_I8;        /* all Grp4 ops are byte-sized */
   IRTemp t1 = newTemp(ty);   /* original operand value */
   IRTemp t2 = newTemp(ty);   /* updated operand value */

   *decode_OK = True;

   modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      /* Operand is a register. */
      assign(t1, getIRegE(1, pfx, modrm));
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
            putIRegE(1, pfx, modrm, mkexpr(t2));
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
            putIRegE(1, pfx, modrm, mkexpr(t2));
            setFlags_INC_DEC( False, t2, ty );
            break;
         default:
            *decode_OK = False;
            return delta;
      }
      delta++;
      DIP("%sb %s\n", nameGrp4(gregLO3ofRM(modrm)),
                      nameIRegE(1, pfx, modrm));
   } else {
      /* Operand is in memory.  When a LOCK prefix is present, express
         the read-modify-write as a compare-and-swap against the
         originally-loaded value, making the update atomic. */
      IRTemp addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( t1, loadLE(ty, mkexpr(addr)) );
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
            if (pfx & PFX_LOCK) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
                      guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(t2) );
            }
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
            if (pfx & PFX_LOCK) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
                      guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(t2) );
            }
            setFlags_INC_DEC( False, t2, ty );
            break;
         default:
            *decode_OK = False;
            return delta;
      }
      delta += alen;
      DIP("%sb %s\n", nameGrp4(gregLO3ofRM(modrm)), dis_buf);
   }
   return delta;
}
   4069 
   4070 
   4071 /* Group 5 extended opcodes. */
   4072 static
   4073 ULong dis_Grp5 ( VexAbiInfo* vbi,
   4074                  Prefix pfx, Int sz, Long delta,
   4075                  /*MOD*/DisResult* dres, /*OUT*/Bool* decode_OK )
   4076 {
   4077    Int     len;
   4078    UChar   modrm;
   4079    HChar   dis_buf[50];
   4080    IRTemp  addr = IRTemp_INVALID;
   4081    IRType  ty = szToITy(sz);
   4082    IRTemp  t1 = newTemp(ty);
   4083    IRTemp  t2 = IRTemp_INVALID;
   4084    IRTemp  t3 = IRTemp_INVALID;
   4085    Bool    showSz = True;
   4086 
   4087    *decode_OK = True;
   4088 
   4089    modrm = getUChar(delta);
   4090    if (epartIsReg(modrm)) {
   4091       assign(t1, getIRegE(sz,pfx,modrm));
   4092       switch (gregLO3ofRM(modrm)) {
   4093          case 0: /* INC */
   4094             t2 = newTemp(ty);
   4095             assign(t2, binop(mkSizedOp(ty,Iop_Add8),
   4096                              mkexpr(t1), mkU(ty,1)));
   4097             setFlags_INC_DEC( True, t2, ty );
   4098             putIRegE(sz,pfx,modrm, mkexpr(t2));
   4099             break;
   4100          case 1: /* DEC */
   4101             t2 = newTemp(ty);
   4102             assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
   4103                              mkexpr(t1), mkU(ty,1)));
   4104             setFlags_INC_DEC( False, t2, ty );
   4105             putIRegE(sz,pfx,modrm, mkexpr(t2));
   4106             break;
   4107          case 2: /* call Ev */
   4108             /* Ignore any sz value and operate as if sz==8. */
   4109             if (!(sz == 4 || sz == 8)) goto unhandled;
   4110             sz = 8;
   4111             t3 = newTemp(Ity_I64);
   4112             assign(t3, getIRegE(sz,pfx,modrm));
   4113             t2 = newTemp(Ity_I64);
   4114             assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
   4115             putIReg64(R_RSP, mkexpr(t2));
   4116             storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+1));
   4117             make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(reg)");
   4118             jmp_treg(dres, Ijk_Call, t3);
   4119             vassert(dres->whatNext == Dis_StopHere);
   4120             showSz = False;
   4121             break;
   4122          case 4: /* jmp Ev */
   4123             /* Ignore any sz value and operate as if sz==8. */
   4124             if (!(sz == 4 || sz == 8)) goto unhandled;
   4125             sz = 8;
   4126             t3 = newTemp(Ity_I64);
   4127             assign(t3, getIRegE(sz,pfx,modrm));
   4128             jmp_treg(dres, Ijk_Boring, t3);
   4129             vassert(dres->whatNext == Dis_StopHere);
   4130             showSz = False;
   4131             break;
   4132          default:
   4133             *decode_OK = False;
   4134             return delta;
   4135       }
   4136       delta++;
   4137       DIP("%s%c %s\n", nameGrp5(gregLO3ofRM(modrm)),
   4138                        showSz ? nameISize(sz) : ' ',
   4139                        nameIRegE(sz, pfx, modrm));
   4140    } else {
   4141       addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
   4142       if (gregLO3ofRM(modrm) != 2 && gregLO3ofRM(modrm) != 4
   4143                                   && gregLO3ofRM(modrm) != 6) {
   4144          assign(t1, loadLE(ty,mkexpr(addr)));
   4145       }
   4146       switch (gregLO3ofRM(modrm)) {
   4147          case 0: /* INC */
   4148             t2 = newTemp(ty);
   4149             assign(t2, binop(mkSizedOp(ty,Iop_Add8),
   4150                              mkexpr(t1), mkU(ty,1)));
   4151             if (pfx & PFX_LOCK) {
   4152                casLE( mkexpr(addr),
   4153                       mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
   4154             } else {
   4155                storeLE(mkexpr(addr),mkexpr(t2));
   4156             }
   4157             setFlags_INC_DEC( True, t2, ty );
   4158             break;
   4159          case 1: /* DEC */
   4160             t2 = newTemp(ty);
   4161             assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
   4162                              mkexpr(t1), mkU(ty,1)));
   4163             if (pfx & PFX_LOCK) {
   4164                casLE( mkexpr(addr),
   4165                       mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
   4166             } else {
   4167                storeLE(mkexpr(addr),mkexpr(t2));
   4168             }
   4169             setFlags_INC_DEC( False, t2, ty );
   4170             break;
   4171          case 2: /* call Ev */
   4172             /* Ignore any sz value and operate as if sz==8. */
   4173             if (!(sz == 4 || sz == 8)) goto unhandled;
   4174             sz = 8;
   4175             t3 = newTemp(Ity_I64);
   4176             assign(t3, loadLE(Ity_I64,mkexpr(addr)));
   4177             t2 = newTemp(Ity_I64);
   4178             assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
   4179             putIReg64(R_RSP, mkexpr(t2));
   4180             storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+len));
   4181             make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(mem)");
   4182             jmp_treg(dres, Ijk_Call, t3);
   4183             vassert(dres->whatNext == Dis_StopHere);
   4184             showSz = False;
   4185             break;
   4186          case 4: /* JMP Ev */
   4187             /* Ignore any sz value and operate as if sz==8. */
   4188             if (!(sz == 4 || sz == 8)) goto unhandled;
   4189             sz = 8;
   4190             t3 = newTemp(Ity_I64);
   4191             assign(t3, loadLE(Ity_I64,mkexpr(addr)));
   4192             jmp_treg(dres, Ijk_Boring, t3);
   4193             vassert(dres->whatNext == Dis_StopHere);
   4194             showSz = False;
   4195             break;
   4196          case 6: /* PUSH Ev */
   4197             /* There is no encoding for 32-bit operand size; hence ... */
   4198             if (sz == 4) sz = 8;
   4199             if (!(sz == 8 || sz == 2)) goto unhandled;
   4200             if (sz == 8) {
   4201                t3 = newTemp(Ity_I64);
   4202                assign(t3, loadLE(Ity_I64,mkexpr(addr)));
   4203                t2 = newTemp(Ity_I64);
   4204                assign( t2, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
   4205                putIReg64(R_RSP, mkexpr(t2) );
   4206                storeLE( mkexpr(t2), mkexpr(t3) );
   4207                break;
   4208 	    } else {
   4209                goto unhandled; /* awaiting test case */
   4210 	    }
   4211          default:
   4212          unhandled:
   4213             *decode_OK = False;
   4214             return delta;
   4215       }
   4216       delta += len;
   4217       DIP("%s%c %s\n", nameGrp5(gregLO3ofRM(modrm)),
   4218                        showSz ? nameISize(sz) : ' ',
   4219                        dis_buf);
   4220    }
   4221    return delta;
   4222 }
   4223 
   4224 
   4225 /*------------------------------------------------------------*/
   4226 /*--- Disassembling string ops (including REP prefixes)    ---*/
   4227 /*------------------------------------------------------------*/
   4228 
   4229 /* Code shared by all the string ops */
   4230 static
   4231 void dis_string_op_increment ( Int sz, IRTemp t_inc )
   4232 {
   4233    UChar logSz;
   4234    if (sz == 8 || sz == 4 || sz == 2) {
   4235       logSz = 1;
   4236       if (sz == 4) logSz = 2;
   4237       if (sz == 8) logSz = 3;
   4238       assign( t_inc,
   4239               binop(Iop_Shl64, IRExpr_Get( OFFB_DFLAG, Ity_I64 ),
   4240                                mkU8(logSz) ) );
   4241    } else {
   4242       assign( t_inc,
   4243               IRExpr_Get( OFFB_DFLAG, Ity_I64 ) );
   4244    }
   4245 }
   4246 
   4247 static
   4248 void dis_string_op( void (*dis_OP)( Int, IRTemp, Prefix pfx ),
   4249                     Int sz, HChar* name, Prefix pfx )
   4250 {
   4251    IRTemp t_inc = newTemp(Ity_I64);
   4252    /* Really we ought to inspect the override prefixes, but we don't.
   4253       The following assertion catches any resulting sillyness. */
   4254    vassert(pfx == clearSegBits(pfx));
   4255    dis_string_op_increment(sz, t_inc);
   4256    dis_OP( sz, t_inc, pfx );
   4257    DIP("%s%c\n", name, nameISize(sz));
   4258 }
   4259 
/* MOVS: copy one element of size sz from [RSI] to [RDI], then step
   both pointers by t_inc (the direction-scaled increment).  With an
   address-size override, RSI/RDI are read as their 32-bit forms
   (zero-extended) and the updated values are truncated back to 32
   bits before being written. */
static
void dis_MOVS ( Int sz, IRTemp t_inc, Prefix pfx )
{
   IRType ty = szToITy(sz);
   IRTemp td = newTemp(Ity_I64);   /* RDI */
   IRTemp ts = newTemp(Ity_I64);   /* RSI */
   IRExpr *incd, *incs;

   if (haveASO(pfx)) {
      /* 32-bit address size: only the low halves are significant. */
      assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
      assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
   } else {
      assign( td, getIReg64(R_RDI) );
      assign( ts, getIReg64(R_RSI) );
   }

   storeLE( mkexpr(td), loadLE(ty,mkexpr(ts)) );

   incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
   if (haveASO(pfx)) {
      /* Wrap the updated pointers at 32 bits. */
      incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
      incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
   }
   putIReg64( R_RDI, incd );
   putIReg64( R_RSI, incs );
}
   4287 
/* LODS: load one element of size sz from [RSI] into rAX (the
   sz-sized slice of RAX), then step RSI by t_inc.  With an
   address-size override, RSI is used and updated as a 32-bit
   quantity. */
static
void dis_LODS ( Int sz, IRTemp t_inc, Prefix pfx )
{
   IRType ty = szToITy(sz);
   IRTemp ts = newTemp(Ity_I64);   /* RSI */
   IRExpr *incs;

   if (haveASO(pfx))
      assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
   else
      assign( ts, getIReg64(R_RSI) );

   putIRegRAX ( sz, loadLE(ty, mkexpr(ts)) );

   incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
   if (haveASO(pfx))
      /* Wrap the updated pointer at 32 bits. */
      incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
   putIReg64( R_RSI, incs );
}
   4307 
/* STOS: store the sz-sized slice of RAX to [RDI], then step RDI by
   t_inc.  With an address-size override, RDI is used and updated as
   a 32-bit quantity. */
static
void dis_STOS ( Int sz, IRTemp t_inc, Prefix pfx )
{
   IRType ty = szToITy(sz);
   IRTemp ta = newTemp(ty);        /* rAX */
   IRTemp td = newTemp(Ity_I64);   /* RDI */
   IRExpr *incd;

   assign( ta, getIRegRAX(sz) );

   if (haveASO(pfx))
      assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   else
      assign( td, getIReg64(R_RDI) );

   storeLE( mkexpr(td), mkexpr(ta) );

   incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   if (haveASO(pfx))
      /* Wrap the updated pointer at 32 bits. */
      incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   putIReg64( R_RDI, incd );
}
   4330 
/* CMPS: compare the element at [RSI] with the element at [RDI] by
   setting the flag thunk from ([RSI] - [RDI]), then step both
   pointers by t_inc.  Neither memory operand is written.  With an
   address-size override, RSI/RDI are used and updated as 32-bit
   quantities. */
static
void dis_CMPS ( Int sz, IRTemp t_inc, Prefix pfx )
{
   IRType ty  = szToITy(sz);
   IRTemp tdv = newTemp(ty);      /* (RDI) */
   IRTemp tsv = newTemp(ty);      /* (RSI) */
   IRTemp td  = newTemp(Ity_I64); /*  RDI  */
   IRTemp ts  = newTemp(Ity_I64); /*  RSI  */
   IRExpr *incd, *incs;

   if (haveASO(pfx)) {
      assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
      assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
   } else {
      assign( td, getIReg64(R_RDI) );
      assign( ts, getIReg64(R_RSI) );
   }

   assign( tdv, loadLE(ty,mkexpr(td)) );

   assign( tsv, loadLE(ty,mkexpr(ts)) );

   /* Flags as for SUB: first operand is the RSI element. */
   setFlags_DEP1_DEP2 ( Iop_Sub8, tsv, tdv, ty );

   incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
   if (haveASO(pfx)) {
      /* Wrap the updated pointers at 32 bits. */
      incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
      incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
   }
   putIReg64( R_RDI, incd );
   putIReg64( R_RSI, incs );
}
   4364 
/* SCAS: compare rAX with the element at [RDI] by setting the flag
   thunk from (rAX - [RDI]), then step RDI by t_inc.  Memory is not
   written.  With an address-size override, RDI is used and updated
   as a 32-bit quantity. */
static
void dis_SCAS ( Int sz, IRTemp t_inc, Prefix pfx )
{
   IRType ty  = szToITy(sz);
   IRTemp ta  = newTemp(ty);       /*  rAX  */
   IRTemp td  = newTemp(Ity_I64);  /*  RDI  */
   IRTemp tdv = newTemp(ty);       /* (RDI) */
   IRExpr *incd;

   assign( ta, getIRegRAX(sz) );

   if (haveASO(pfx))
      assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   else
      assign( td, getIReg64(R_RDI) );

   assign( tdv, loadLE(ty,mkexpr(td)) );

   /* Flags as for SUB: first operand is rAX. */
   setFlags_DEP1_DEP2 ( Iop_Sub8, ta, tdv, ty );

   incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   if (haveASO(pfx))
      /* Wrap the updated pointer at 32 bits. */
      incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   putIReg64( R_RDI, incd );
}
   4390 
   4391 
   4392 /* Wrap the appropriate string op inside a REP/REPE/REPNE.  We assume
   4393    the insn is the last one in the basic block, and so emit a jump to
   4394    the next insn, rather than just falling through. */
   4395 static
   4396 void dis_REP_op ( /*MOD*/DisResult* dres,
   4397                   AMD64Condcode cond,
   4398                   void (*dis_OP)(Int, IRTemp, Prefix),
   4399                   Int sz, Addr64 rip, Addr64 rip_next, HChar* name,
   4400                   Prefix pfx )
   4401 {
   4402    IRTemp t_inc = newTemp(Ity_I64);
   4403    IRTemp tc;
   4404    IRExpr* cmp;
   4405 
   4406    /* Really we ought to inspect the override prefixes, but we don't.
   4407       The following assertion catches any resulting sillyness. */
   4408    vassert(pfx == clearSegBits(pfx));
   4409 
   4410    if (haveASO(pfx)) {
   4411       tc = newTemp(Ity_I32);  /*  ECX  */
   4412       assign( tc, getIReg32(R_RCX) );
   4413       cmp = binop(Iop_CmpEQ32, mkexpr(tc), mkU32(0));
   4414    } else {
   4415       tc = newTemp(Ity_I64);  /*  RCX  */
   4416       assign( tc, getIReg64(R_RCX) );
   4417       cmp = binop(Iop_CmpEQ64, mkexpr(tc), mkU64(0));
   4418    }
   4419 
   4420    stmt( IRStmt_Exit( cmp, Ijk_Boring,
   4421                       IRConst_U64(rip_next), OFFB_RIP ) );
   4422 
   4423    if (haveASO(pfx))
   4424       putIReg32(R_RCX, binop(Iop_Sub32, mkexpr(tc), mkU32(1)) );
   4425   else
   4426       putIReg64(R_RCX, binop(Iop_Sub64, mkexpr(tc), mkU64(1)) );
   4427 
   4428    dis_string_op_increment(sz, t_inc);
   4429    dis_OP (sz, t_inc, pfx);
   4430 
   4431    if (cond == AMD64CondAlways) {
   4432       jmp_lit(dres, Ijk_Boring, rip);
   4433       vassert(dres->whatNext == Dis_StopHere);
   4434    } else {
   4435       stmt( IRStmt_Exit( mk_amd64g_calculate_condition(cond),
   4436                          Ijk_Boring,
   4437                          IRConst_U64(rip),
   4438                          OFFB_RIP ) );
   4439       jmp_lit(dres, Ijk_Boring, rip_next);
   4440       vassert(dres->whatNext == Dis_StopHere);
   4441    }
   4442    DIP("%s%c\n", name, nameISize(sz));
   4443 }
   4444 
   4445 
   4446 /*------------------------------------------------------------*/
   4447 /*--- Arithmetic, etc.                                     ---*/
   4448 /*------------------------------------------------------------*/
   4449 
/* IMUL E, G (two-operand signed multiply: G := G * E).  'delta0'
   points at the modR/M byte.  Returns the offset just past the
   decoded operands. */
static
ULong dis_mul_E_G ( VexAbiInfo* vbi,
                    Prefix      pfx,
                    Int         size,
                    Long        delta0 )
{
   Int    alen;
   HChar  dis_buf[50];
   UChar  rm = getUChar(delta0);
   IRType ty = szToITy(size);
   IRTemp te = newTemp(ty);      /* E operand (reg or mem) */
   IRTemp tg = newTemp(ty);      /* G operand */
   IRTemp resLo = newTemp(ty);   /* low 'size' bytes of the product */

   assign( tg, getIRegG(size, pfx, rm) );
   if (epartIsReg(rm)) {
      assign( te, getIRegE(size, pfx, rm) );
   } else {
      IRTemp addr = disAMode( &alen, vbi, pfx, delta0, dis_buf, 0 );
      assign( te, loadLE(ty,mkexpr(addr)) );
   }

   /* AMD64G_CC_OP_SMULB is the byte-sized base op; presumably
      setFlags_MUL adjusts it for 'ty' -- TODO confirm against the
      helper's definition. */
   setFlags_MUL ( ty, te, tg, AMD64G_CC_OP_SMULB );

   assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tg) ) );

   putIRegG(size, pfx, rm, mkexpr(resLo) );

   if (epartIsReg(rm)) {
      DIP("imul%c %s, %s\n", nameISize(size),
                             nameIRegE(size,pfx,rm),
                             nameIRegG(size,pfx,rm));
      return 1+delta0;
   } else {
      DIP("imul%c %s, %s\n", nameISize(size),
                             dis_buf,
                             nameIRegG(size,pfx,rm));
      return alen+delta0;
   }
}
   4491 
   4492 
/* IMUL I * E -> G (three-operand signed multiply with an immediate:
   G := E * imm).  'delta' points at the modR/M byte; 'litsize' is
   the encoded immediate width (capped at 4 bytes, as on amd64
   immediates are at most 32 bits, sign-extended).  Returns the
   offset just past the immediate. */
static
ULong dis_imul_I_E_G ( VexAbiInfo* vbi,
                       Prefix      pfx,
                       Int         size,
                       Long        delta,
                       Int         litsize )
{
   Long   d64;                   /* the (sign-extended) immediate */
   Int    alen;
   HChar  dis_buf[50];
   UChar  rm = getUChar(delta);
   IRType ty = szToITy(size);
   IRTemp te = newTemp(ty);      /* E operand (reg or mem) */
   IRTemp tl = newTemp(ty);      /* the immediate, as an IR value */
   IRTemp resLo = newTemp(ty);   /* low 'size' bytes of the product */

   vassert(/*size == 1 ||*/ size == 2 || size == 4 || size == 8);

   if (epartIsReg(rm)) {
      assign(te, getIRegE(size, pfx, rm));
      delta++;
   } else {
      /* disAMode must know how many trailing immediate bytes follow
         the addressing mode, for RIP-relative addressing. */
      IRTemp addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
                                     imin(4,litsize) );
      assign(te, loadLE(ty, mkexpr(addr)));
      delta += alen;
   }
   d64 = getSDisp(imin(4,litsize),delta);
   delta += imin(4,litsize);

   /* Truncate the immediate to the operand size. */
   d64 &= mkSizeMask(size);
   assign(tl, mkU(ty,d64));

   assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tl) ));

   /* AMD64G_CC_OP_SMULB is the byte-sized base op; presumably
      setFlags_MUL adjusts it for 'ty' -- TODO confirm against the
      helper's definition. */
   setFlags_MUL ( ty, te, tl, AMD64G_CC_OP_SMULB );

   putIRegG(size, pfx, rm, mkexpr(resLo));

   DIP("imul%c $%lld, %s, %s\n",
       nameISize(size), d64,
       ( epartIsReg(rm) ? nameIRegE(size,pfx,rm) : dis_buf ),
       nameIRegG(size,pfx,rm) );
   return delta;
}
   4539 
   4540 
   4541 /* Generate an IR sequence to do a popcount operation on the supplied
   4542    IRTemp, and return a new IRTemp holding the result.  'ty' may be
   4543    Ity_I16, Ity_I32 or Ity_I64 only. */
   4544 static IRTemp gen_POPCOUNT ( IRType ty, IRTemp src )
   4545 {
   4546    Int i;
   4547    if (ty == Ity_I16) {
   4548       IRTemp old = IRTemp_INVALID;
   4549       IRTemp nyu = IRTemp_INVALID;
   4550       IRTemp mask[4], shift[4];
   4551       for (i = 0; i < 4; i++) {
   4552          mask[i]  = newTemp(ty);
   4553          shift[i] = 1 << i;
   4554       }
   4555       assign(mask[0], mkU16(0x5555));
   4556       assign(mask[1], mkU16(0x3333));
   4557       assign(mask[2], mkU16(0x0F0F));
   4558       assign(mask[3], mkU16(0x00FF));
   4559       old = src;
   4560       for (i = 0; i < 4; i++) {
   4561          nyu = newTemp(ty);
   4562          assign(nyu,
   4563                 binop(Iop_Add16,
   4564                       binop(Iop_And16,
   4565                             mkexpr(old),
   4566                             mkexpr(mask[i])),
   4567                       binop(Iop_And16,
   4568                             binop(Iop_Shr16, mkexpr(old), mkU8(shift[i])),
   4569                             mkexpr(mask[i]))));
   4570          old = nyu;
   4571       }
   4572       return nyu;
   4573    }
   4574    if (ty == Ity_I32) {
   4575       IRTemp old = IRTemp_INVALID;
   4576       IRTemp nyu = IRTemp_INVALID;
   4577       IRTemp mask[5], shift[5];
   4578       for (i = 0; i < 5; i++) {
   4579          mask[i]  = newTemp(ty);
   4580          shift[i] = 1 << i;
   4581       }
   4582       assign(mask[0], mkU32(0x55555555));
   4583       assign(mask[1], mkU32(0x33333333));
   4584       assign(mask[2], mkU32(0x0F0F0F0F));
   4585       assign(mask[3], mkU32(0x00FF00FF));
   4586       assign(mask[4], mkU32(0x0000FFFF));
   4587       old = src;
   4588       for (i = 0; i < 5; i++) {
   4589          nyu = newTemp(ty);
   4590          assign(nyu,
   4591                 binop(Iop_Add32,
   4592                       binop(Iop_And32,
   4593                             mkexpr(old),
   4594                             mkexpr(mask[i])),
   4595                       binop(Iop_And32,
   4596                             binop(Iop_Shr32, mkexpr(old), mkU8(shift[i])),
   4597                             mkexpr(mask[i]))));
   4598          old = nyu;
   4599       }
   4600       return nyu;
   4601    }
   4602    if (ty == Ity_I64) {
   4603       IRTemp old = IRTemp_INVALID;
   4604       IRTemp nyu = IRTemp_INVALID;
   4605       IRTemp mask[6], shift[6];
   4606       for (i = 0; i < 6; i++) {
   4607          mask[i]  = newTemp(ty);
   4608          shift[i] = 1 << i;
   4609       }
   4610       assign(mask[0], mkU64(0x5555555555555555ULL));
   4611       assign(mask[1], mkU64(0x3333333333333333ULL));
   4612       assign(mask[2], mkU64(0x0F0F0F0F0F0F0F0FULL));
   4613       assign(mask[3], mkU64(0x00FF00FF00FF00FFULL));
   4614       assign(mask[4], mkU64(0x0000FFFF0000FFFFULL));
   4615       assign(mask[5], mkU64(0x00000000FFFFFFFFULL));
   4616       old = src;
   4617       for (i = 0; i < 6; i++) {
   4618          nyu = newTemp(ty);
   4619          assign(nyu,
   4620                 binop(Iop_Add64,
   4621                       binop(Iop_And64,
   4622                             mkexpr(old),
   4623                             mkexpr(mask[i])),
   4624                       binop(Iop_And64,
   4625                             binop(Iop_Shr64, mkexpr(old), mkU8(shift[i])),
   4626                             mkexpr(mask[i]))));
   4627          old = nyu;
   4628       }
   4629       return nyu;
   4630    }
   4631    /*NOTREACHED*/
   4632    vassert(0);
   4633 }
   4634 
   4635 
/* Generate an IR sequence to do a count-leading-zeroes operation on
   the supplied IRTemp, and return a new IRTemp holding the result.
   'ty' may be Ity_I16, Ity_I32 or Ity_I64 only.  In the case where
   the argument is zero, return the number of bits in the word (the
   natural semantics). */
static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
{
   vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16);

   /* Widen to 64 bits ... */
   IRTemp src64 = newTemp(Ity_I64);
   assign(src64, widenUto64( mkexpr(src) ));

   /* ... then shift the payload into the top of the 64-bit word, so
      Clz64 counts only leading zeroes of the original ty-sized
      value. */
   IRTemp src64x = newTemp(Ity_I64);
   assign(src64x,
          binop(Iop_Shl64, mkexpr(src64),
                           mkU8(64 - 8 * sizeofIRType(ty))));

   // Clz64 has undefined semantics when its input is zero, so
   // special-case around that: if src64x == 0, produce the bit
   // width of ty instead of applying Clz64.
   IRTemp res64 = newTemp(Ity_I64);
   assign(res64,
          IRExpr_Mux0X(
             unop(Iop_1Uto8,
                  binop(Iop_CmpEQ64, mkexpr(src64x), mkU64(0))),
             unop(Iop_Clz64, mkexpr(src64x)),
             mkU64(8 * sizeofIRType(ty))
   ));

   /* Narrow the count back to the operand type. */
   IRTemp res = newTemp(ty);
   assign(res, narrowTo(ty, mkexpr(res64)));
   return res;
}
   4668 
   4669 
   4670 /*------------------------------------------------------------*/
   4671 /*---                                                      ---*/
   4672 /*--- x87 FLOATING POINT INSTRUCTIONS                      ---*/
   4673 /*---                                                      ---*/
   4674 /*------------------------------------------------------------*/
   4675 
   4676 /* --- Helper functions for dealing with the register stack. --- */
   4677 
   4678 /* --- Set the emulation-warning pseudo-register. --- */
   4679 
static void put_emwarn ( IRExpr* e /* :: Ity_I32 */ )
{
   /* Write 'e' to the emulation-warning pseudo-register. */
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   stmt( IRStmt_Put( OFFB_EMWARN, e ) );
}
   4685 
   4686 /* --- Produce an IRExpr* denoting a 64-bit QNaN. --- */
   4687 
static IRExpr* mkQNaN64 ( void )
{
   /* The default 64-bit QNaN bit pattern:
        sign     = 0
        exponent = all ones (2047)
        fraction = 1 followed by 51 zeroes
      i.e. 0x7FF8_0000_0000_0000. */
   return IRExpr_Const(IRConst_F64i(0x7FF8000000000000ULL));
}
   4696 
   4697 /* --------- Get/put the top-of-stack pointer :: Ity_I32 --------- */
   4698 
static IRExpr* get_ftop ( void )
{
   /* Read the x87 top-of-stack pointer (:: Ity_I32) from guest state. */
   return IRExpr_Get( OFFB_FTOP, Ity_I32 );
}
   4703 
static void put_ftop ( IRExpr* e )
{
   /* Write the x87 top-of-stack pointer; must be an Ity_I32 value. */
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   stmt( IRStmt_Put( OFFB_FTOP, e ) );
}
   4709 
   4710 /* --------- Get/put the C3210 bits. --------- */
   4711 
static IRExpr*  /* :: Ity_I64 */ get_C3210 ( void )
{
   /* Read the FPU condition-code bits (C0..C3) from guest state. */
   return IRExpr_Get( OFFB_FC3210, Ity_I64 );
}
   4716 
static void put_C3210 ( IRExpr* e  /* :: Ity_I64 */ )
{
   /* Write the FPU condition-code bits (C0..C3); must be Ity_I64. */
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
   stmt( IRStmt_Put( OFFB_FC3210, e ) );
}
   4722 
   4723 /* --------- Get/put the FPU rounding mode. --------- */
static IRExpr* /* :: Ity_I32 */ get_fpround ( void )
{
   /* guest_FPROUND is stored as 64 bits; only the low 32 are used. */
   return unop(Iop_64to32, IRExpr_Get( OFFB_FPROUND, Ity_I64 ));
}
   4728 
static void put_fpround ( IRExpr* /* :: Ity_I32 */ e )
{
   /* Store the rounding mode, widening it to the 64-bit slot in
      which guest_FPROUND is kept. */
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   stmt( IRStmt_Put( OFFB_FPROUND, unop(Iop_32Uto64,e) ) );
}
   4734 
   4735 
   4736 /* --------- Synthesise a 2-bit FPU rounding mode. --------- */
   4737 /* Produces a value in 0 .. 3, which is encoded as per the type
   4738    IRRoundingMode.  Since the guest_FPROUND value is also encoded as
   4739    per IRRoundingMode, we merely need to get it and mask it for
   4740    safety.
   4741 */
static IRExpr* /* :: Ity_I32 */ get_roundingmode ( void )
{
   /* guest_FPROUND is already in IRRoundingMode encoding (see the
      comment above); the mask just defensively forces it into 0..3. */
   return binop( Iop_And32, get_fpround(), mkU32(3) );
}
   4746 
static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
{
   /* Used at call sites which do not (yet) honour the guest rounding
      mode: always claims round-to-nearest, regardless of
      guest_FPROUND.  See the XXXROUNDINGFIXME markers. */
   return mkU32(Irrm_NEAREST);
}
   4751 
   4752 
   4753 /* --------- Get/set FP register tag bytes. --------- */
   4754 
   4755 /* Given i, and some expression e, generate 'ST_TAG(i) = e'. */
   4756 
   4757 static void put_ST_TAG ( Int i, IRExpr* value )
   4758 {
   4759    IRRegArray* descr;
   4760    vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_I8);
   4761    descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   4762    stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
   4763 }
   4764 
   4765 /* Given i, generate an expression yielding 'ST_TAG(i)'.  This will be
   4766    zero to indicate "Empty" and nonzero to indicate "NonEmpty".  */
   4767 
static IRExpr* get_ST_TAG ( Int i )
{
   /* The tag bytes form an 8-entry circular array, indexed relative
      to the current FTOP. */
   IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   return IRExpr_GetI( descr, get_ftop(), i );
}
   4773 
   4774 
   4775 /* --------- Get/set FP registers. --------- */
   4776 
   4777 /* Given i, and some expression e, emit 'ST(i) = e' and set the
   4778    register's tag to indicate the register is full.  The previous
   4779    state of the register is not checked. */
   4780 
   4781 static void put_ST_UNCHECKED ( Int i, IRExpr* value )
   4782 {
   4783    IRRegArray* descr;
   4784    vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_F64);
   4785    descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   4786    stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
   4787    /* Mark the register as in-use. */
   4788    put_ST_TAG(i, mkU8(1));
   4789 }
   4790 
   4791 /* Given i, and some expression e, emit
   4792       ST(i) = is_full(i) ? NaN : e
   4793    and set the tag accordingly.
   4794 */
   4795 
   4796 static void put_ST ( Int i, IRExpr* value )
   4797 {
   4798    put_ST_UNCHECKED( i,
   4799                      IRExpr_Mux0X( get_ST_TAG(i),
   4800                                    /* 0 means empty */
   4801                                    value,
   4802                                    /* non-0 means full */
   4803                                    mkQNaN64()
   4804                    )
   4805    );
   4806 }
   4807 
   4808 
   4809 /* Given i, generate an expression yielding 'ST(i)'. */
   4810 
static IRExpr* get_ST_UNCHECKED ( Int i )
{
   /* Read ST(i) without consulting its tag; an empty slot's stale
      contents are returned as-is. */
   IRRegArray* descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   return IRExpr_GetI( descr, get_ftop(), i );
}
   4816 
   4817 
   4818 /* Given i, generate an expression yielding
   4819   is_full(i) ? ST(i) : NaN
   4820 */
   4821 
   4822 static IRExpr* get_ST ( Int i )
   4823 {
   4824    return
   4825       IRExpr_Mux0X( get_ST_TAG(i),
   4826                     /* 0 means empty */
   4827                     mkQNaN64(),
   4828                     /* non-0 means full */
   4829                     get_ST_UNCHECKED(i));
   4830 }
   4831 
   4832 
   4833 /* Adjust FTOP downwards by one register. */
   4834 
static void fp_push ( void )
{
   /* Registers are addressed relative to FTOP, so decrementing FTOP
      makes room for a new ST(0). */
   put_ftop( binop(Iop_Sub32, get_ftop(), mkU32(1)) );
}
   4839 
   4840 /* Adjust FTOP upwards by one register, and mark the vacated register
   4841    as empty.  */
   4842 
static void fp_pop ( void )
{
   /* Order matters: mark the departing ST(0) as empty first, since
      the tag array is indexed relative to the *current* FTOP, then
      advance FTOP past it. */
   put_ST_TAG(0, mkU8(0));
   put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
}
   4848 
   4849 /* Clear the C2 bit of the FPU status register, for
   4850    sin/cos/tan/sincos. */
   4851 
static void clear_C2 ( void )
{
   /* AND out just the C2 bit, leaving C0/C1/C3 unchanged. */
   put_C3210( binop(Iop_And64, get_C3210(), mkU64(~AMD64G_FC_MASK_C2)) );
}
   4856 
   4857 /* Invent a plausible-looking FPU status word value:
   4858       ((ftop & 7) << 11) | (c3210 & 0x4700)
   4859  */
   4860 static IRExpr* get_FPU_sw ( void )
   4861 {
   4862    return
   4863       unop(Iop_32to16,
   4864            binop(Iop_Or32,
   4865                  binop(Iop_Shl32,
   4866                        binop(Iop_And32, get_ftop(), mkU32(7)),
   4867                              mkU8(11)),
   4868                        binop(Iop_And32, unop(Iop_64to32, get_C3210()),
   4869                                         mkU32(0x4700))
   4870       ));
   4871 }
   4872 
   4873 
   4874 /* ------------------------------------------------------- */
   4875 /* Given all that stack-mangling junk, we can now go ahead
   4876    and describe FP instructions.
   4877 */
   4878 
   4879 /* ST(0) = ST(0) `op` mem64/32(addr)
   4880    Need to check ST(0)'s tag on read, but not on write.
   4881 */
   4882 static
   4883 void fp_do_op_mem_ST_0 ( IRTemp addr, HChar* op_txt, HChar* dis_buf,
   4884                          IROp op, Bool dbl )
   4885 {
   4886    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   4887    if (dbl) {
   4888       put_ST_UNCHECKED(0,
   4889          triop( op,
   4890                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4891                 get_ST(0),
   4892                 loadLE(Ity_F64,mkexpr(addr))
   4893          ));
   4894    } else {
   4895       put_ST_UNCHECKED(0,
   4896          triop( op,
   4897                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4898                 get_ST(0),
   4899                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr)))
   4900          ));
   4901    }
   4902 }
   4903 
   4904 
   4905 /* ST(0) = mem64/32(addr) `op` ST(0)
   4906    Need to check ST(0)'s tag on read, but not on write.
   4907 */
   4908 static
   4909 void fp_do_oprev_mem_ST_0 ( IRTemp addr, HChar* op_txt, HChar* dis_buf,
   4910                             IROp op, Bool dbl )
   4911 {
   4912    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   4913    if (dbl) {
   4914       put_ST_UNCHECKED(0,
   4915          triop( op,
   4916                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4917                 loadLE(Ity_F64,mkexpr(addr)),
   4918                 get_ST(0)
   4919          ));
   4920    } else {
   4921       put_ST_UNCHECKED(0,
   4922          triop( op,
   4923                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4924                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr))),
   4925                 get_ST(0)
   4926          ));
   4927    }
   4928 }
   4929 
   4930 
   4931 /* ST(dst) = ST(dst) `op` ST(src).
   4932    Check dst and src tags when reading but not on write.
   4933 */
   4934 static
   4935 void fp_do_op_ST_ST ( HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
   4936                       Bool pop_after )
   4937 {
   4938    DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"", st_src, st_dst );
   4939    put_ST_UNCHECKED(
   4940       st_dst,
   4941       triop( op,
   4942              get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4943              get_ST(st_dst),
   4944              get_ST(st_src) )
   4945    );
   4946    if (pop_after)
   4947       fp_pop();
   4948 }
   4949 
   4950 /* ST(dst) = ST(src) `op` ST(dst).
   4951    Check dst and src tags when reading but not on write.
   4952 */
   4953 static
   4954 void fp_do_oprev_ST_ST ( HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
   4955                          Bool pop_after )
   4956 {
   4957    DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"", st_src, st_dst );
   4958    put_ST_UNCHECKED(
   4959       st_dst,
   4960       triop( op,
   4961              get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   4962              get_ST(st_src),
   4963              get_ST(st_dst) )
   4964    );
   4965    if (pop_after)
   4966       fp_pop();
   4967 }
   4968 
   4969 /* %rflags(Z,P,C) = UCOMI( st(0), st(i) ) */
static void fp_do_ucomi_ST0_STi ( UInt i, Bool pop_after )
{
   DIP("fucomi%s %%st(0),%%st(%u)\n", pop_after ? "p" : "", i);
   /* This is a bit of a hack (and isn't really right).  It sets
      Z,P,C,O correctly, but forces A and S to zero, whereas the Intel
      documentation implies A and S are unchanged.
   */
   /* It's also fishy in that it is used both for COMIP and
      UCOMIP, and they aren't the same (although similar). */
   /* CC_OP_COPY means CC_DEP1 holds the rflags value verbatim;
      CC_DEP2 is unused and zeroed.  NOTE(review): CC_NDEP is not
      written here -- confirm that's intended (cf. the file-top TODO
      about CC_* puts). */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   /* Masking the Iop_CmpF64 result with 0x45 keeps only bits 0, 2
      and 6 -- the C, P and Z positions of rflags -- forcing all
      other flag bits to zero, per the comment above. */
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop( Iop_And64,
                   unop( Iop_32Uto64,
                         binop(Iop_CmpF64, get_ST(0), get_ST(i))),
                   mkU64(0x45)
        )));
   if (pop_after)
      fp_pop();
}
   4991 
   4992 
   4993 /* returns
   4994    32to16( if e32 <s -32768 || e32 >s 32767 then -32768 else e32 )
   4995 */
   4996 static IRExpr* x87ishly_qnarrow_32_to_16 ( IRExpr* e32 )
   4997 {
   4998    IRTemp t32 = newTemp(Ity_I32);
   4999    assign( t32, e32 );
   5000    return
   5001       IRExpr_Mux0X(
   5002          unop(Iop_1Uto8,
   5003               binop(Iop_CmpLT64U,
   5004                     unop(Iop_32Uto64,
   5005                          binop(Iop_Add32, mkexpr(t32), mkU32(32768))),
   5006                     mkU64(65536))),
   5007          mkU16( 0x8000 ),
   5008          unop(Iop_32to16, mkexpr(t32)));
   5009 }
   5010 
   5011 
   5012 static
   5013 ULong dis_FPU ( /*OUT*/Bool* decode_ok,
   5014                 VexAbiInfo* vbi, Prefix pfx, Long delta )
   5015 {
   5016    Int    len;
   5017    UInt   r_src, r_dst;
   5018    HChar  dis_buf[50];
   5019    IRTemp t1, t2;
   5020 
   5021    /* On entry, delta points at the second byte of the insn (the modrm
   5022       byte).*/
   5023    UChar first_opcode = getUChar(delta-1);
   5024    UChar modrm        = getUChar(delta+0);
   5025 
   5026    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD8 opcodes +-+-+-+-+-+-+-+ */
   5027 
   5028    if (first_opcode == 0xD8) {
   5029       if (modrm < 0xC0) {
   5030 
   5031          /* bits 5,4,3 are an opcode extension, and the modRM also
   5032            specifies an address. */
   5033          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   5034          delta += len;
   5035 
   5036          switch (gregLO3ofRM(modrm)) {
   5037 
   5038             case 0: /* FADD single-real */
   5039                fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, False );
   5040                break;
   5041 
   5042             case 1: /* FMUL single-real */
   5043                fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, False );
   5044                break;
   5045 
   5046             case 2: /* FCOM single-real */
   5047                DIP("fcoms %s\n", dis_buf);
   5048                /* This forces C1 to zero, which isn't right. */
   5049                /* The AMD documentation suggests that forcing C1 to
   5050                   zero is correct (Eliot Moss) */
   5051                put_C3210(
   5052                    unop( Iop_32Uto64,
   5053                        binop( Iop_And32,
   5054                               binop(Iop_Shl32,
   5055                                     binop(Iop_CmpF64,
   5056                                           get_ST(0),
   5057                                           unop(Iop_F32toF64,
   5058                                                loadLE(Ity_F32,mkexpr(addr)))),
   5059                                     mkU8(8)),
   5060                               mkU32(0x4500)
   5061                    )));
   5062                break;
   5063 
   5064             case 3: /* FCOMP single-real */
   5065                /* The AMD documentation suggests that forcing C1 to
   5066                   zero is correct (Eliot Moss) */
   5067                DIP("fcomps %s\n", dis_buf);
   5068                /* This forces C1 to zero, which isn't right. */
   5069                put_C3210(
   5070                    unop( Iop_32Uto64,
   5071                        binop( Iop_And32,
   5072                               binop(Iop_Shl32,
   5073                                     binop(Iop_CmpF64,
   5074                                           get_ST(0),
   5075                                           unop(Iop_F32toF64,
   5076                                                loadLE(Ity_F32,mkexpr(addr)))),
   5077                                     mkU8(8)),
   5078                               mkU32(0x4500)
   5079                    )));
   5080                fp_pop();
   5081                break;
   5082 
   5083             case 4: /* FSUB single-real */
   5084                fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, False );
   5085                break;
   5086 
   5087             case 5: /* FSUBR single-real */
   5088                fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, False );
   5089                break;
   5090 
   5091             case 6: /* FDIV single-real */
   5092                fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, False );
   5093                break;
   5094 
   5095             case 7: /* FDIVR single-real */
   5096                fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, False );
   5097                break;
   5098 
   5099             default:
   5100                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   5101                vex_printf("first_opcode == 0xD8\n");
   5102                goto decode_fail;
   5103          }
   5104       } else {
   5105          delta++;
   5106          switch (modrm) {
   5107 
   5108             case 0xC0 ... 0xC7: /* FADD %st(?),%st(0) */
   5109                fp_do_op_ST_ST ( "add", Iop_AddF64, modrm - 0xC0, 0, False );
   5110                break;
   5111 
   5112             case 0xC8 ... 0xCF: /* FMUL %st(?),%st(0) */
   5113                fp_do_op_ST_ST ( "mul", Iop_MulF64, modrm - 0xC8, 0, False );
   5114                break;
   5115 
   5116             /* Dunno if this is right */
   5117             case 0xD0 ... 0xD7: /* FCOM %st(?),%st(0) */
   5118                r_dst = (UInt)modrm - 0xD0;
   5119                DIP("fcom %%st(0),%%st(%d)\n", r_dst);
   5120                /* This forces C1 to zero, which isn't right. */
   5121                put_C3210(
   5122                    unop(Iop_32Uto64,
   5123                    binop( Iop_And32,
   5124                           binop(Iop_Shl32,
   5125                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   5126                                 mkU8(8)),
   5127                           mkU32(0x4500)
   5128                    )));
   5129                break;
   5130 
   5131             /* Dunno if this is right */
   5132             case 0xD8 ... 0xDF: /* FCOMP %st(?),%st(0) */
   5133                r_dst = (UInt)modrm - 0xD8;
   5134                DIP("fcomp %%st(0),%%st(%d)\n", r_dst);
   5135                /* This forces C1 to zero, which isn't right. */
   5136                put_C3210(
   5137                    unop(Iop_32Uto64,
   5138                    binop( Iop_And32,
   5139                           binop(Iop_Shl32,
   5140                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   5141                                 mkU8(8)),
   5142                           mkU32(0x4500)
   5143                    )));
   5144                fp_pop();
   5145                break;
   5146 
   5147             case 0xE0 ... 0xE7: /* FSUB %st(?),%st(0) */
   5148                fp_do_op_ST_ST ( "sub", Iop_SubF64, modrm - 0xE0, 0, False );
   5149                break;
   5150 
   5151             case 0xE8 ... 0xEF: /* FSUBR %st(?),%st(0) */
   5152                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, modrm - 0xE8, 0, False );
   5153                break;
   5154 
   5155             case 0xF0 ... 0xF7: /* FDIV %st(?),%st(0) */
   5156                fp_do_op_ST_ST ( "div", Iop_DivF64, modrm - 0xF0, 0, False );
   5157                break;
   5158 
   5159             case 0xF8 ... 0xFF: /* FDIVR %st(?),%st(0) */
   5160                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, modrm - 0xF8, 0, False );
   5161                break;
   5162 
   5163             default:
   5164                goto decode_fail;
   5165          }
   5166       }
   5167    }
   5168 
   5169    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD9 opcodes +-+-+-+-+-+-+-+ */
   5170    else
   5171    if (first_opcode == 0xD9) {
   5172       if (modrm < 0xC0) {
   5173 
   5174          /* bits 5,4,3 are an opcode extension, and the modRM also
   5175             specifies an address. */
   5176          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   5177          delta += len;
   5178 
   5179          switch (gregLO3ofRM(modrm)) {
   5180 
   5181             case 0: /* FLD single-real */
   5182                DIP("flds %s\n", dis_buf);
   5183                fp_push();
   5184                put_ST(0, unop(Iop_F32toF64,
   5185                               loadLE(Ity_F32, mkexpr(addr))));
   5186                break;
   5187 
   5188             case 2: /* FST single-real */
   5189                DIP("fsts %s\n", dis_buf);
   5190                storeLE(mkexpr(addr),
   5191                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
   5192                break;
   5193 
   5194             case 3: /* FSTP single-real */
   5195                DIP("fstps %s\n", dis_buf);
   5196                storeLE(mkexpr(addr),
   5197                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
   5198                fp_pop();
   5199                break;
   5200 
   5201             case 4: { /* FLDENV m28 */
   5202                /* Uses dirty helper:
   5203                      VexEmWarn amd64g_do_FLDENV ( VexGuestX86State*, HWord ) */
   5204                IRTemp    ew = newTemp(Ity_I32);
   5205                IRTemp   w64 = newTemp(Ity_I64);
   5206                IRDirty*   d = unsafeIRDirty_0_N (
   5207                                  0/*regparms*/,
   5208                                  "amd64g_dirtyhelper_FLDENV",
   5209                                  &amd64g_dirtyhelper_FLDENV,
   5210                                  mkIRExprVec_1( mkexpr(addr) )
   5211                               );
   5212                d->needsBBP = True;
   5213                d->tmp      = w64;
   5214                /* declare we're reading memory */
   5215                d->mFx   = Ifx_Read;
   5216                d->mAddr = mkexpr(addr);
   5217                d->mSize = 28;
   5218 
   5219                /* declare we're writing guest state */
   5220                d->nFxState = 4;
   5221                vex_bzero(&d->fxState, sizeof(d->fxState));
   5222 
   5223                d->fxState[0].fx     = Ifx_Write;
   5224                d->fxState[0].offset = OFFB_FTOP;
   5225                d->fxState[0].size   = sizeof(UInt);
   5226 
   5227                d->fxState[1].fx     = Ifx_Write;
   5228                d->fxState[1].offset = OFFB_FPTAGS;
   5229                d->fxState[1].size   = 8 * sizeof(UChar);
   5230 
   5231                d->fxState[2].fx     = Ifx_Write;
   5232                d->fxState[2].offset = OFFB_FPROUND;
   5233                d->fxState[2].size   = sizeof(ULong);
   5234 
   5235                d->fxState[3].fx     = Ifx_Write;
   5236                d->fxState[3].offset = OFFB_FC3210;
   5237                d->fxState[3].size   = sizeof(ULong);
   5238 
   5239                stmt( IRStmt_Dirty(d) );
   5240 
   5241                /* ew contains any emulation warning we may need to
   5242                   issue.  If needed, side-exit to the next insn,
   5243                   reporting the warning, so that Valgrind's dispatcher
   5244                   sees the warning. */
   5245 	       assign(ew, unop(Iop_64to32,mkexpr(w64)) );
   5246                put_emwarn( mkexpr(ew) );
   5247                stmt(
   5248                   IRStmt_Exit(
   5249                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   5250                      Ijk_EmWarn,
   5251                      IRConst_U64( guest_RIP_bbstart+delta ),
   5252                      OFFB_RIP
   5253                   )
   5254                );
   5255 
   5256                DIP("fldenv %s\n", dis_buf);
   5257                break;
   5258             }
   5259 
   5260             case 5: {/* FLDCW */
   5261                /* The only thing we observe in the control word is the
   5262                   rounding mode.  Therefore, pass the 16-bit value
   5263                   (x87 native-format control word) to a clean helper,
   5264                   getting back a 64-bit value, the lower half of which
   5265                   is the FPROUND value to store, and the upper half of
   5266                   which is the emulation-warning token which may be
   5267                   generated.
   5268                */
   5269                /* ULong amd64h_check_fldcw ( ULong ); */
   5270                IRTemp t64 = newTemp(Ity_I64);
   5271                IRTemp ew = newTemp(Ity_I32);
   5272                DIP("fldcw %s\n", dis_buf);
   5273                assign( t64, mkIRExprCCall(
   5274                                Ity_I64, 0/*regparms*/,
   5275                                "amd64g_check_fldcw",
   5276                                &amd64g_check_fldcw,
   5277                                mkIRExprVec_1(
   5278                                   unop( Iop_16Uto64,
   5279                                         loadLE(Ity_I16, mkexpr(addr)))
   5280                                )
   5281                             )
   5282                      );
   5283 
   5284                put_fpround( unop(Iop_64to32, mkexpr(t64)) );
   5285                assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
   5286                put_emwarn( mkexpr(ew) );
   5287                /* Finally, if an emulation warning was reported,
   5288                   side-exit to the next insn, reporting the warning,
   5289                   so that Valgrind's dispatcher sees the warning. */
   5290                stmt(
   5291                   IRStmt_Exit(
   5292                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   5293                      Ijk_EmWarn,
   5294                      IRConst_U64( guest_RIP_bbstart+delta ),
   5295                      OFFB_RIP
   5296                   )
   5297                );
   5298                break;
   5299             }
   5300 
   5301             case 6: { /* FNSTENV m28 */
   5302                /* Uses dirty helper:
   5303                      void amd64g_do_FSTENV ( VexGuestAMD64State*, HWord ) */
   5304                IRDirty* d = unsafeIRDirty_0_N (
   5305                                0/*regparms*/,
   5306                                "amd64g_dirtyhelper_FSTENV",
   5307                                &amd64g_dirtyhelper_FSTENV,
   5308                                mkIRExprVec_1( mkexpr(addr) )
   5309                             );
   5310                d->needsBBP = True;
   5311                /* declare we're writing memory */
   5312                d->mFx   = Ifx_Write;
   5313                d->mAddr = mkexpr(addr);
   5314                d->mSize = 28;
   5315 
   5316                /* declare we're reading guest state */
   5317                d->nFxState = 4;
   5318                vex_bzero(&d->fxState, sizeof(d->fxState));
   5319 
   5320                d->fxState[0].fx     = Ifx_Read;
   5321                d->fxState[0].offset = OFFB_FTOP;
   5322                d->fxState[0].size   = sizeof(UInt);
   5323 
   5324                d->fxState[1].fx     = Ifx_Read;
   5325                d->fxState[1].offset = OFFB_FPTAGS;
   5326                d->fxState[1].size   = 8 * sizeof(UChar);
   5327 
   5328                d->fxState[2].fx     = Ifx_Read;
   5329                d->fxState[2].offset = OFFB_FPROUND;
   5330                d->fxState[2].size   = sizeof(ULong);
   5331 
   5332                d->fxState[3].fx     = Ifx_Read;
   5333                d->fxState[3].offset = OFFB_FC3210;
   5334                d->fxState[3].size   = sizeof(ULong);
   5335 
   5336                stmt( IRStmt_Dirty(d) );
   5337 
   5338                DIP("fnstenv %s\n", dis_buf);
   5339                break;
   5340             }
   5341 
   5342             case 7: /* FNSTCW */
   5343                /* Fake up a native x87 FPU control word.  The only
   5344                   thing it depends on is FPROUND[1:0], so call a clean
   5345                   helper to cook it up. */
   5346                /* ULong amd64g_create_fpucw ( ULong fpround ) */
   5347                DIP("fnstcw %s\n", dis_buf);
   5348                storeLE(
   5349                   mkexpr(addr),
   5350                   unop( Iop_64to16,
   5351                         mkIRExprCCall(
   5352                            Ity_I64, 0/*regp*/,
   5353                            "amd64g_create_fpucw", &amd64g_create_fpucw,
   5354                            mkIRExprVec_1( unop(Iop_32Uto64, get_fpround()) )
   5355                         )
   5356                   )
   5357                );
   5358                break;
   5359 
   5360             default:
   5361                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   5362                vex_printf("first_opcode == 0xD9\n");
   5363                goto decode_fail;
   5364          }
   5365 
   5366       } else {
   5367          delta++;
   5368          switch (modrm) {
   5369 
   5370             case 0xC0 ... 0xC7: /* FLD %st(?) */
   5371                r_src = (UInt)modrm - 0xC0;
   5372                DIP("fld %%st(%u)\n", r_src);
   5373                t1 = newTemp(Ity_F64);
   5374                assign(t1, get_ST(r_src));
   5375                fp_push();
   5376                put_ST(0, mkexpr(t1));
   5377                break;
   5378 
   5379             case 0xC8 ... 0xCF: /* FXCH %st(?) */
   5380                r_src = (UInt)modrm - 0xC8;
   5381                DIP("fxch %%st(%u)\n", r_src);
   5382                t1 = newTemp(Ity_F64);
   5383                t2 = newTemp(Ity_F64);
   5384                assign(t1, get_ST(0));
   5385                assign(t2, get_ST(r_src));
   5386                put_ST_UNCHECKED(0, mkexpr(t2));
   5387                put_ST_UNCHECKED(r_src, mkexpr(t1));
   5388                break;
   5389 
   5390             case 0xE0: /* FCHS */
   5391                DIP("fchs\n");
   5392                put_ST_UNCHECKED(0, unop(Iop_NegF64, get_ST(0)));
   5393                break;
   5394 
   5395             case 0xE1: /* FABS */
   5396                DIP("fabs\n");
   5397                put_ST_UNCHECKED(0, unop(Iop_AbsF64, get_ST(0)));
   5398                break;
   5399 
   5400             case 0xE5: { /* FXAM */
   5401                /* This is an interesting one.  It examines %st(0),
   5402                   regardless of whether the tag says it's empty or not.
   5403                   Here, just pass both the tag (in our format) and the
   5404                   value (as a double, actually a ULong) to a helper
   5405                   function. */
   5406                IRExpr** args
   5407                   = mkIRExprVec_2( unop(Iop_8Uto64, get_ST_TAG(0)),
   5408                                    unop(Iop_ReinterpF64asI64,
   5409                                         get_ST_UNCHECKED(0)) );
   5410                put_C3210(mkIRExprCCall(
   5411                             Ity_I64,
   5412                             0/*regparm*/,
   5413                             "amd64g_calculate_FXAM", &amd64g_calculate_FXAM,
   5414                             args
   5415                         ));
   5416                DIP("fxam\n");
   5417                break;
   5418             }
   5419 
            /* D9 E8..EE: load x87 constants.  Each constant is pushed
               using its exact IEEE754 double bit pattern (IRConst_F64i)
               rather than a decimal literal, so the loaded value is
               bit-precise and independent of the host compiler's
               float parsing.  The commented-out IRConst_F64 lines show
               the approximate decimal value for reference.  Note the
               real hardware loads these at 80-bit precision; here they
               are rounded to 64 bits. */
            case 0xE8: /* FLD1 */
               DIP("fld1\n");
               fp_push();
               /* put_ST(0, IRExpr_Const(IRConst_F64(1.0))); */
               put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff0000000000000ULL)));
               break;

            case 0xE9: /* FLDL2T */
               /* log2(10) */
               DIP("fldl2t\n");
               fp_push();
               /* put_ST(0, IRExpr_Const(IRConst_F64(3.32192809488736234781))); */
               put_ST(0, IRExpr_Const(IRConst_F64i(0x400a934f0979a371ULL)));
               break;

            case 0xEA: /* FLDL2E */
               /* log2(e) */
               DIP("fldl2e\n");
               fp_push();
               /* put_ST(0, IRExpr_Const(IRConst_F64(1.44269504088896340739))); */
               put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff71547652b82feULL)));
               break;

            case 0xEB: /* FLDPI */
               /* pi */
               DIP("fldpi\n");
               fp_push();
               /* put_ST(0, IRExpr_Const(IRConst_F64(3.14159265358979323851))); */
               put_ST(0, IRExpr_Const(IRConst_F64i(0x400921fb54442d18ULL)));
               break;

            case 0xEC: /* FLDLG2 */
               /* log10(2) */
               DIP("fldlg2\n");
               fp_push();
               /* put_ST(0, IRExpr_Const(IRConst_F64(0.301029995663981143))); */
               put_ST(0, IRExpr_Const(IRConst_F64i(0x3fd34413509f79ffULL)));
               break;

            case 0xED: /* FLDLN2 */
               /* ln(2) */
               DIP("fldln2\n");
               fp_push();
               /* put_ST(0, IRExpr_Const(IRConst_F64(0.69314718055994530942))); */
               put_ST(0, IRExpr_Const(IRConst_F64i(0x3fe62e42fefa39efULL)));
               break;

            case 0xEE: /* FLDZ */
               /* +0.0 */
               DIP("fldz\n");
               fp_push();
               /* put_ST(0, IRExpr_Const(IRConst_F64(0.0))); */
               put_ST(0, IRExpr_Const(IRConst_F64i(0x0000000000000000ULL)));
               break;
   5468 
   5469             case 0xF0: /* F2XM1 */
   5470                DIP("f2xm1\n");
   5471                put_ST_UNCHECKED(0,
   5472                   binop(Iop_2xm1F64,
   5473                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5474                         get_ST(0)));
   5475                break;
   5476 
   5477             case 0xF1: /* FYL2X */
   5478                DIP("fyl2x\n");
   5479                put_ST_UNCHECKED(1,
   5480                   triop(Iop_Yl2xF64,
   5481                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5482                         get_ST(1),
   5483                         get_ST(0)));
   5484                fp_pop();
   5485                break;
   5486 
   5487             case 0xF2: /* FPTAN */
   5488                DIP("ftan\n");
   5489                put_ST_UNCHECKED(0,
   5490                   binop(Iop_TanF64,
   5491                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5492                         get_ST(0)));
   5493                fp_push();
   5494                put_ST(0, IRExpr_Const(IRConst_F64(1.0)));
   5495                clear_C2(); /* HACK */
   5496                break;
   5497 
   5498             case 0xF3: /* FPATAN */
   5499                DIP("fpatan\n");
   5500                put_ST_UNCHECKED(1,
   5501                   triop(Iop_AtanF64,
   5502                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5503                         get_ST(1),
   5504                         get_ST(0)));
   5505                fp_pop();
   5506                break;
   5507 
            case 0xF4: { /* FXTRACT */
               /* Split ST(0) into exponent and significand.  The
                  computation is done by calling the clean helper
                  x86amd64g_calculate_FXTRACT twice on the raw 64-bit
                  image of the operand: once with selector 0 to get the
                  significand, once with selector 1 to get the
                  exponent.  ST(0) is overwritten with the exponent,
                  then the significand is pushed on top, matching the
                  hardware's resulting stack layout. */
               IRTemp argF = newTemp(Ity_F64);
               IRTemp sigF = newTemp(Ity_F64);
               IRTemp expF = newTemp(Ity_F64);
               IRTemp argI = newTemp(Ity_I64);
               IRTemp sigI = newTemp(Ity_I64);
               IRTemp expI = newTemp(Ity_I64);
               DIP("fxtract\n");
               assign( argF, get_ST(0) );
               /* Reinterpret (not convert) the operand as I64 so the
                  helper can inspect its raw bit pattern. */
               assign( argI, unop(Iop_ReinterpF64asI64, mkexpr(argF)));
               assign( sigI,
                       mkIRExprCCall(
                          Ity_I64, 0/*regparms*/,
                          "x86amd64g_calculate_FXTRACT",
                          &x86amd64g_calculate_FXTRACT,
                          mkIRExprVec_2( mkexpr(argI),
                                         mkIRExpr_HWord(0)/*sig*/ ))
               );
               assign( expI,
                       mkIRExprCCall(
                          Ity_I64, 0/*regparms*/,
                          "x86amd64g_calculate_FXTRACT",
                          &x86amd64g_calculate_FXTRACT,
                          mkIRExprVec_2( mkexpr(argI),
                                         mkIRExpr_HWord(1)/*exp*/ ))
               );
               assign( sigF, unop(Iop_ReinterpI64asF64, mkexpr(sigI)) );
               assign( expF, unop(Iop_ReinterpI64asF64, mkexpr(expI)) );
               /* exponent */
               put_ST_UNCHECKED(0, mkexpr(expF) );
               fp_push();
               /* significand */
               put_ST(0, mkexpr(sigF) );
               break;
            }
   5543 
   5544             case 0xF5: { /* FPREM1 -- IEEE compliant */
   5545                IRTemp a1 = newTemp(Ity_F64);
   5546                IRTemp a2 = newTemp(Ity_F64);
   5547                DIP("fprem1\n");
   5548                /* Do FPREM1 twice, once to get the remainder, and once
   5549                   to get the C3210 flag values. */
   5550                assign( a1, get_ST(0) );
   5551                assign( a2, get_ST(1) );
   5552                put_ST_UNCHECKED(0,
   5553                   triop(Iop_PRem1F64,
   5554                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5555                         mkexpr(a1),
   5556                         mkexpr(a2)));
   5557                put_C3210(
   5558                   unop(Iop_32Uto64,
   5559                   triop(Iop_PRem1C3210F64,
   5560                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5561                         mkexpr(a1),
   5562                         mkexpr(a2)) ));
   5563                break;
   5564             }
   5565 
   5566             case 0xF7: /* FINCSTP */
   5567                DIP("fincstp\n");
   5568                put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
   5569                break;
   5570 
   5571             case 0xF8: { /* FPREM -- not IEEE compliant */
   5572                IRTemp a1 = newTemp(Ity_F64);
   5573                IRTemp a2 = newTemp(Ity_F64);
   5574                DIP("fprem\n");
   5575                /* Do FPREM twice, once to get the remainder, and once
   5576                   to get the C3210 flag values. */
   5577                assign( a1, get_ST(0) );
   5578                assign( a2, get_ST(1) );
   5579                put_ST_UNCHECKED(0,
   5580                   triop(Iop_PRemF64,
   5581                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5582                         mkexpr(a1),
   5583                         mkexpr(a2)));
   5584                put_C3210(
   5585                   unop(Iop_32Uto64,
   5586                   triop(Iop_PRemC3210F64,
   5587                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5588                         mkexpr(a1),
   5589                         mkexpr(a2)) ));
   5590                break;
   5591             }
   5592 
   5593             case 0xF9: /* FYL2XP1 */
   5594                DIP("fyl2xp1\n");
   5595                put_ST_UNCHECKED(1,
   5596                   triop(Iop_Yl2xp1F64,
   5597                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5598                         get_ST(1),
   5599                         get_ST(0)));
   5600                fp_pop();
   5601                break;
   5602 
   5603             case 0xFA: /* FSQRT */
   5604                DIP("fsqrt\n");
   5605                put_ST_UNCHECKED(0,
   5606                   binop(Iop_SqrtF64,
   5607                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5608                         get_ST(0)));
   5609                break;
   5610 
   5611             case 0xFB: { /* FSINCOS */
   5612                IRTemp a1 = newTemp(Ity_F64);
   5613                assign( a1, get_ST(0) );
   5614                DIP("fsincos\n");
   5615                put_ST_UNCHECKED(0,
   5616                   binop(Iop_SinF64,
   5617                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5618                         mkexpr(a1)));
   5619                fp_push();
   5620                put_ST(0,
   5621                   binop(Iop_CosF64,
   5622                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5623                         mkexpr(a1)));
   5624                clear_C2(); /* HACK */
   5625                break;
   5626             }
   5627 
   5628             case 0xFC: /* FRNDINT */
   5629                DIP("frndint\n");
   5630                put_ST_UNCHECKED(0,
   5631                   binop(Iop_RoundF64toInt, get_roundingmode(), get_ST(0)) );
   5632                break;
   5633 
   5634             case 0xFD: /* FSCALE */
   5635                DIP("fscale\n");
   5636                put_ST_UNCHECKED(0,
   5637                   triop(Iop_ScaleF64,
   5638                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5639                         get_ST(0),
   5640                         get_ST(1)));
   5641                break;
   5642 
   5643             case 0xFE: /* FSIN */
   5644                DIP("fsin\n");
   5645                put_ST_UNCHECKED(0,
   5646                   binop(Iop_SinF64,
   5647                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5648                         get_ST(0)));
   5649                clear_C2(); /* HACK */
   5650                break;
   5651 
   5652             case 0xFF: /* FCOS */
   5653                DIP("fcos\n");
   5654                put_ST_UNCHECKED(0,
   5655                   binop(Iop_CosF64,
   5656                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5657                         get_ST(0)));
   5658                clear_C2(); /* HACK */
   5659                break;
   5660 
   5661             default:
   5662                goto decode_fail;
   5663          }
   5664       }
   5665    }
   5666 
   5667    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDA opcodes +-+-+-+-+-+-+-+ */
   5668    else
   5669    if (first_opcode == 0xDA) {
   5670 
   5671       if (modrm < 0xC0) {
   5672 
   5673          /* bits 5,4,3 are an opcode extension, and the modRM also
   5674             specifies an address. */
   5675          IROp   fop;
   5676          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   5677          delta += len;
   5678          switch (gregLO3ofRM(modrm)) {
   5679 
   5680             case 0: /* FIADD m32int */ /* ST(0) += m32int */
   5681                DIP("fiaddl %s\n", dis_buf);
   5682                fop = Iop_AddF64;
   5683                goto do_fop_m32;
   5684 
   5685             case 1: /* FIMUL m32int */ /* ST(0) *= m32int */
   5686                DIP("fimull %s\n", dis_buf);
   5687                fop = Iop_MulF64;
   5688                goto do_fop_m32;
   5689 
   5690             case 4: /* FISUB m32int */ /* ST(0) -= m32int */
   5691                DIP("fisubl %s\n", dis_buf);
   5692                fop = Iop_SubF64;
   5693                goto do_fop_m32;
   5694 
   5695             case 5: /* FISUBR m32int */ /* ST(0) = m32int - ST(0) */
   5696                DIP("fisubrl %s\n", dis_buf);
   5697                fop = Iop_SubF64;
   5698                goto do_foprev_m32;
   5699 
   5700             case 6: /* FIDIV m32int */ /* ST(0) /= m32int */
   5701                DIP("fisubl %s\n", dis_buf);
   5702                fop = Iop_DivF64;
   5703                goto do_fop_m32;
   5704 
   5705             case 7: /* FIDIVR m32int */ /* ST(0) = m32int / ST(0) */
   5706                DIP("fidivrl %s\n", dis_buf);
   5707                fop = Iop_DivF64;
   5708                goto do_foprev_m32;
   5709 
   5710             do_fop_m32:
   5711                put_ST_UNCHECKED(0,
   5712                   triop(fop,
   5713                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5714                         get_ST(0),
   5715                         unop(Iop_I32StoF64,
   5716                              loadLE(Ity_I32, mkexpr(addr)))));
   5717                break;
   5718 
   5719             do_foprev_m32:
   5720                put_ST_UNCHECKED(0,
   5721                   triop(fop,
   5722                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5723                         unop(Iop_I32StoF64,
   5724                              loadLE(Ity_I32, mkexpr(addr))),
   5725                         get_ST(0)));
   5726                break;
   5727 
   5728             default:
   5729                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   5730                vex_printf("first_opcode == 0xDA\n");
   5731                goto decode_fail;
   5732          }
   5733 
   5734       } else {
   5735 
   5736          delta++;
   5737          switch (modrm) {
   5738 
   5739             case 0xC0 ... 0xC7: /* FCMOVB ST(i), ST(0) */
   5740                r_src = (UInt)modrm - 0xC0;
   5741                DIP("fcmovb %%st(%u), %%st(0)\n", r_src);
   5742                put_ST_UNCHECKED(0,
   5743                                 IRExpr_Mux0X(
   5744                                     unop(Iop_1Uto8,
   5745                                          mk_amd64g_calculate_condition(AMD64CondB)),
   5746                                     get_ST(0), get_ST(r_src)) );
   5747                break;
   5748 
   5749             case 0xC8 ... 0xCF: /* FCMOVE(Z) ST(i), ST(0) */
   5750                r_src = (UInt)modrm - 0xC8;
   5751                DIP("fcmovz %%st(%u), %%st(0)\n", r_src);
   5752                put_ST_UNCHECKED(0,
   5753                                 IRExpr_Mux0X(
   5754                                     unop(Iop_1Uto8,
   5755                                          mk_amd64g_calculate_condition(AMD64CondZ)),
   5756                                     get_ST(0), get_ST(r_src)) );
   5757                break;
   5758 
   5759             case 0xD0 ... 0xD7: /* FCMOVBE ST(i), ST(0) */
   5760                r_src = (UInt)modrm - 0xD0;
   5761                DIP("fcmovbe %%st(%u), %%st(0)\n", r_src);
   5762                put_ST_UNCHECKED(0,
   5763                                 IRExpr_Mux0X(
   5764                                     unop(Iop_1Uto8,
   5765                                          mk_amd64g_calculate_condition(AMD64CondBE)),
   5766                                     get_ST(0), get_ST(r_src)) );
   5767                break;
   5768 
   5769             case 0xD8 ... 0xDF: /* FCMOVU ST(i), ST(0) */
   5770                r_src = (UInt)modrm - 0xD8;
   5771                DIP("fcmovu %%st(%u), %%st(0)\n", r_src);
   5772                put_ST_UNCHECKED(0,
   5773                                 IRExpr_Mux0X(
   5774                                     unop(Iop_1Uto8,
   5775                                          mk_amd64g_calculate_condition(AMD64CondP)),
   5776                                     get_ST(0), get_ST(r_src)) );
   5777                break;
   5778 
   5779             case 0xE9: /* FUCOMPP %st(0),%st(1) */
   5780                DIP("fucompp %%st(0),%%st(1)\n");
   5781                /* This forces C1 to zero, which isn't right. */
   5782                put_C3210(
   5783                    unop(Iop_32Uto64,
   5784                    binop( Iop_And32,
   5785                           binop(Iop_Shl32,
   5786                                 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
   5787                                 mkU8(8)),
   5788                           mkU32(0x4500)
   5789                    )));
   5790                fp_pop();
   5791                fp_pop();
   5792                break;
   5793 
   5794             default:
   5795                goto decode_fail;
   5796          }
   5797 
   5798       }
   5799    }
   5800 
   5801    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDB opcodes +-+-+-+-+-+-+-+ */
   5802    else
   5803    if (first_opcode == 0xDB) {
   5804       if (modrm < 0xC0) {
   5805 
   5806          /* bits 5,4,3 are an opcode extension, and the modRM also
   5807             specifies an address. */
   5808          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   5809          delta += len;
   5810 
   5811          switch (gregLO3ofRM(modrm)) {
   5812 
   5813             case 0: /* FILD m32int */
   5814                DIP("fildl %s\n", dis_buf);
   5815                fp_push();
   5816                put_ST(0, unop(Iop_I32StoF64,
   5817                               loadLE(Ity_I32, mkexpr(addr))));
   5818                break;
   5819 
   5820             case 1: /* FISTTPL m32 (SSE3) */
   5821                DIP("fisttpl %s\n", dis_buf);
   5822                storeLE( mkexpr(addr),
   5823                         binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) );
   5824                fp_pop();
   5825                break;
   5826 
   5827             case 2: /* FIST m32 */
   5828                DIP("fistl %s\n", dis_buf);
   5829                storeLE( mkexpr(addr),
   5830                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
   5831                break;
   5832 
   5833             case 3: /* FISTP m32 */
   5834                DIP("fistpl %s\n", dis_buf);
   5835                storeLE( mkexpr(addr),
   5836                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
   5837                fp_pop();
   5838                break;
   5839 
   5840             case 5: { /* FLD extended-real */
   5841                /* Uses dirty helper:
   5842                      ULong amd64g_loadF80le ( ULong )
   5843                   addr holds the address.  First, do a dirty call to
   5844                   get hold of the data. */
   5845                IRTemp   val  = newTemp(Ity_I64);
   5846                IRExpr** args = mkIRExprVec_1 ( mkexpr(addr) );
   5847 
   5848                IRDirty* d = unsafeIRDirty_1_N (
   5849                                val,
   5850                                0/*regparms*/,
   5851                                "amd64g_dirtyhelper_loadF80le",
   5852                                &amd64g_dirtyhelper_loadF80le,
   5853                                args
   5854                             );
   5855                /* declare that we're reading memory */
   5856                d->mFx   = Ifx_Read;
   5857                d->mAddr = mkexpr(addr);
   5858                d->mSize = 10;
   5859 
   5860                /* execute the dirty call, dumping the result in val. */
   5861                stmt( IRStmt_Dirty(d) );
   5862                fp_push();
   5863                put_ST(0, unop(Iop_ReinterpI64asF64, mkexpr(val)));
   5864 
   5865                DIP("fldt %s\n", dis_buf);
   5866                break;
   5867             }
   5868 
   5869             case 7: { /* FSTP extended-real */
   5870                /* Uses dirty helper:
   5871                      void amd64g_storeF80le ( ULong addr, ULong data )
   5872                */
   5873                IRExpr** args
   5874                   = mkIRExprVec_2( mkexpr(addr),
   5875                                    unop(Iop_ReinterpF64asI64, get_ST(0)) );
   5876 
   5877                IRDirty* d = unsafeIRDirty_0_N (
   5878                                0/*regparms*/,
   5879                                "amd64g_dirtyhelper_storeF80le",
   5880                                &amd64g_dirtyhelper_storeF80le,
   5881                                args
   5882                             );
   5883                /* declare we're writing memory */
   5884                d->mFx   = Ifx_Write;
   5885                d->mAddr = mkexpr(addr);
   5886                d->mSize = 10;
   5887 
   5888                /* execute the dirty call. */
   5889                stmt( IRStmt_Dirty(d) );
   5890                fp_pop();
   5891 
   5892                DIP("fstpt\n %s", dis_buf);
   5893                break;
   5894             }
   5895 
   5896             default:
   5897                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   5898                vex_printf("first_opcode == 0xDB\n");
   5899                goto decode_fail;
   5900          }
   5901 
   5902       } else {
   5903 
   5904          delta++;
   5905          switch (modrm) {
   5906 
   5907             case 0xC0 ... 0xC7: /* FCMOVNB ST(i), ST(0) */
   5908                r_src = (UInt)modrm - 0xC0;
   5909                DIP("fcmovnb %%st(%u), %%st(0)\n", r_src);
   5910                put_ST_UNCHECKED(0,
   5911                                 IRExpr_Mux0X(
   5912                                     unop(Iop_1Uto8,
   5913                                          mk_amd64g_calculate_condition(AMD64CondNB)),
   5914                                     get_ST(0), get_ST(r_src)) );
   5915                break;
   5916 
   5917             case 0xC8 ... 0xCF: /* FCMOVNE(NZ) ST(i), ST(0) */
   5918                r_src = (UInt)modrm - 0xC8;
   5919                DIP("fcmovnz %%st(%u), %%st(0)\n", r_src);
   5920                put_ST_UNCHECKED(
   5921                   0,
   5922                   IRExpr_Mux0X(
   5923                      unop(Iop_1Uto8,
   5924                           mk_amd64g_calculate_condition(AMD64CondNZ)),
   5925                      get_ST(0),
   5926                      get_ST(r_src)
   5927                   )
   5928                );
   5929                break;
   5930 
   5931             case 0xD0 ... 0xD7: /* FCMOVNBE ST(i), ST(0) */
   5932                r_src = (UInt)modrm - 0xD0;
   5933                DIP("fcmovnbe %%st(%u), %%st(0)\n", r_src);
   5934                put_ST_UNCHECKED(
   5935                   0,
   5936                   IRExpr_Mux0X(
   5937                      unop(Iop_1Uto8,
   5938                           mk_amd64g_calculate_condition(AMD64CondNBE)),
   5939                      get_ST(0),
   5940                      get_ST(r_src)
   5941                   )
   5942                );
   5943                break;
   5944 
   5945             case 0xD8 ... 0xDF: /* FCMOVNU ST(i), ST(0) */
   5946                r_src = (UInt)modrm - 0xD8;
   5947                DIP("fcmovnu %%st(%u), %%st(0)\n", r_src);
   5948                put_ST_UNCHECKED(
   5949                   0,
   5950                   IRExpr_Mux0X(
   5951                      unop(Iop_1Uto8,
   5952                           mk_amd64g_calculate_condition(AMD64CondNP)),
   5953                      get_ST(0),
   5954                      get_ST(r_src)
   5955                   )
   5956                );
   5957                break;
   5958 
   5959             case 0xE2:
   5960                DIP("fnclex\n");
   5961                break;
   5962 
   5963             case 0xE3: {
   5964                /* Uses dirty helper:
   5965                      void amd64g_do_FINIT ( VexGuestAMD64State* ) */
   5966                IRDirty* d  = unsafeIRDirty_0_N (
   5967                                 0/*regparms*/,
   5968                                 "amd64g_dirtyhelper_FINIT",
   5969                                 &amd64g_dirtyhelper_FINIT,
   5970                                 mkIRExprVec_0()
   5971                              );
   5972                d->needsBBP = True;
   5973 
   5974                /* declare we're writing guest state */
   5975                d->nFxState = 5;
   5976                vex_bzero(&d->fxState, sizeof(d->fxState));
   5977 
   5978                d->fxState[0].fx     = Ifx_Write;
   5979                d->fxState[0].offset = OFFB_FTOP;
   5980                d->fxState[0].size   = sizeof(UInt);
   5981 
   5982                d->fxState[1].fx     = Ifx_Write;
   5983                d->fxState[1].offset = OFFB_FPREGS;
   5984                d->fxState[1].size   = 8 * sizeof(ULong);
   5985 
   5986                d->fxState[2].fx     = Ifx_Write;
   5987                d->fxState[2].offset = OFFB_FPTAGS;
   5988                d->fxState[2].size   = 8 * sizeof(UChar);
   5989 
   5990                d->fxState[3].fx     = Ifx_Write;
   5991                d->fxState[3].offset = OFFB_FPROUND;
   5992                d->fxState[3].size   = sizeof(ULong);
   5993 
   5994                d->fxState[4].fx     = Ifx_Write;
   5995                d->fxState[4].offset = OFFB_FC3210;
   5996                d->fxState[4].size   = sizeof(ULong);
   5997 
   5998                stmt( IRStmt_Dirty(d) );
   5999 
   6000                DIP("fninit\n");
   6001                break;
   6002             }
   6003 
   6004             case 0xE8 ... 0xEF: /* FUCOMI %st(0),%st(?) */
   6005                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, False );
   6006                break;
   6007 
   6008             case 0xF0 ... 0xF7: /* FCOMI %st(0),%st(?) */
   6009                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, False );
   6010                break;
   6011 
   6012             default:
   6013                goto decode_fail;
   6014          }
   6015       }
   6016    }
   6017 
   6018    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDC opcodes +-+-+-+-+-+-+-+ */
   6019    else
   6020    if (first_opcode == 0xDC) {
   6021       if (modrm < 0xC0) {
   6022 
   6023          /* bits 5,4,3 are an opcode extension, and the modRM also
   6024             specifies an address. */
   6025          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6026          delta += len;
   6027 
   6028          switch (gregLO3ofRM(modrm)) {
   6029 
   6030             case 0: /* FADD double-real */
   6031                fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, True );
   6032                break;
   6033 
   6034             case 1: /* FMUL double-real */
   6035                fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, True );
   6036                break;
   6037 
   6038 //..             case 2: /* FCOM double-real */
   6039 //..                DIP("fcoml %s\n", dis_buf);
   6040 //..                /* This forces C1 to zero, which isn't right. */
   6041 //..                put_C3210(
   6042 //..                    binop( Iop_And32,
   6043 //..                           binop(Iop_Shl32,
   6044 //..                                 binop(Iop_CmpF64,
   6045 //..                                       get_ST(0),
   6046 //..                                       loadLE(Ity_F64,mkexpr(addr))),
   6047 //..                                 mkU8(8)),
   6048 //..                           mkU32(0x4500)
   6049 //..                    ));
   6050 //..                break;
   6051 
   6052             case 3: /* FCOMP double-real */
   6053                DIP("fcompl %s\n", dis_buf);
   6054                /* This forces C1 to zero, which isn't right. */
   6055                put_C3210(
   6056                    unop(Iop_32Uto64,
   6057                    binop( Iop_And32,
   6058                           binop(Iop_Shl32,
   6059                                 binop(Iop_CmpF64,
   6060                                       get_ST(0),
   6061                                       loadLE(Ity_F64,mkexpr(addr))),
   6062                                 mkU8(8)),
   6063                           mkU32(0x4500)
   6064                    )));
   6065                fp_pop();
   6066                break;
   6067 
   6068             case 4: /* FSUB double-real */
   6069                fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, True );
   6070                break;
   6071 
   6072             case 5: /* FSUBR double-real */
   6073                fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, True );
   6074                break;
   6075 
   6076             case 6: /* FDIV double-real */
   6077                fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, True );
   6078                break;
   6079 
   6080             case 7: /* FDIVR double-real */
   6081                fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, True );
   6082                break;
   6083 
   6084             default:
   6085                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   6086                vex_printf("first_opcode == 0xDC\n");
   6087                goto decode_fail;
   6088          }
   6089 
   6090       } else {
   6091 
   6092          delta++;
   6093          switch (modrm) {
   6094 
   6095             case 0xC0 ... 0xC7: /* FADD %st(0),%st(?) */
   6096                fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, False );
   6097                break;
   6098 
   6099             case 0xC8 ... 0xCF: /* FMUL %st(0),%st(?) */
   6100                fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, False );
   6101                break;
   6102 
   6103             case 0xE0 ... 0xE7: /* FSUBR %st(0),%st(?) */
   6104                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0, modrm - 0xE0, False );
   6105                break;
   6106 
   6107             case 0xE8 ... 0xEF: /* FSUB %st(0),%st(?) */
   6108                fp_do_op_ST_ST ( "sub", Iop_SubF64, 0, modrm - 0xE8, False );
   6109                break;
   6110 
   6111             case 0xF0 ... 0xF7: /* FDIVR %st(0),%st(?) */
   6112                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, False );
   6113                break;
   6114 
   6115             case 0xF8 ... 0xFF: /* FDIV %st(0),%st(?) */
   6116                fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, False );
   6117                break;
   6118 
   6119             default:
   6120                goto decode_fail;
   6121          }
   6122 
   6123       }
   6124    }
   6125 
   6126    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDD opcodes +-+-+-+-+-+-+-+ */
   6127    else
   6128    if (first_opcode == 0xDD) {
   6129 
   6130       if (modrm < 0xC0) {
   6131 
   6132          /* bits 5,4,3 are an opcode extension, and the modRM also
   6133             specifies an address. */
   6134          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6135          delta += len;
   6136 
   6137          switch (gregLO3ofRM(modrm)) {
   6138 
   6139             case 0: /* FLD double-real */
   6140                DIP("fldl %s\n", dis_buf);
   6141                fp_push();
   6142                put_ST(0, loadLE(Ity_F64, mkexpr(addr)));
   6143                break;
   6144 
   6145             case 1: /* FISTTPQ m64 (SSE3) */
   6146                DIP("fistppll %s\n", dis_buf);
   6147                storeLE( mkexpr(addr),
   6148                         binop(Iop_F64toI64S, mkU32(Irrm_ZERO), get_ST(0)) );
   6149                fp_pop();
   6150                break;
   6151 
   6152             case 2: /* FST double-real */
   6153                DIP("fstl %s\n", dis_buf);
   6154                storeLE(mkexpr(addr), get_ST(0));
   6155                break;
   6156 
   6157             case 3: /* FSTP double-real */
   6158                DIP("fstpl %s\n", dis_buf);
   6159                storeLE(mkexpr(addr), get_ST(0));
   6160                fp_pop();
   6161                break;
   6162 
   6163             case 4: { /* FRSTOR m94/m108 */
   6164                IRTemp   ew = newTemp(Ity_I32);
   6165                IRTemp  w64 = newTemp(Ity_I64);
   6166                IRDirty*  d;
   6167                if ( have66(pfx) ) {
   6168                   /* Uses dirty helper:
   6169                      VexEmWarn amd64g_dirtyhelper_FRSTORS
   6170                                   ( VexGuestAMD64State*, HWord ) */
   6171                   d = unsafeIRDirty_0_N (
   6172                          0/*regparms*/,
   6173                          "amd64g_dirtyhelper_FRSTORS",
   6174                          &amd64g_dirtyhelper_FRSTORS,
   6175                          mkIRExprVec_1( mkexpr(addr) )
   6176                       );
   6177                   d->mSize = 94;
   6178                } else {
   6179                   /* Uses dirty helper:
   6180                      VexEmWarn amd64g_dirtyhelper_FRSTOR
   6181                                   ( VexGuestAMD64State*, HWord ) */
   6182                   d = unsafeIRDirty_0_N (
   6183                          0/*regparms*/,
   6184                          "amd64g_dirtyhelper_FRSTOR",
   6185                          &amd64g_dirtyhelper_FRSTOR,
   6186                          mkIRExprVec_1( mkexpr(addr) )
   6187                       );
   6188                   d->mSize = 108;
   6189                }
   6190 
   6191                d->needsBBP = True;
   6192                d->tmp      = w64;
   6193                /* declare we're reading memory */
   6194                d->mFx   = Ifx_Read;
   6195                d->mAddr = mkexpr(addr);
   6196                /* d->mSize set above */
   6197 
   6198                /* declare we're writing guest state */
   6199                d->nFxState = 5;
   6200                vex_bzero(&d->fxState, sizeof(d->fxState));
   6201 
   6202                d->fxState[0].fx     = Ifx_Write;
   6203                d->fxState[0].offset = OFFB_FTOP;
   6204                d->fxState[0].size   = sizeof(UInt);
   6205 
   6206                d->fxState[1].fx     = Ifx_Write;
   6207                d->fxState[1].offset = OFFB_FPREGS;
   6208                d->fxState[1].size   = 8 * sizeof(ULong);
   6209 
   6210                d->fxState[2].fx     = Ifx_Write;
   6211                d->fxState[2].offset = OFFB_FPTAGS;
   6212                d->fxState[2].size   = 8 * sizeof(UChar);
   6213 
   6214                d->fxState[3].fx     = Ifx_Write;
   6215                d->fxState[3].offset = OFFB_FPROUND;
   6216                d->fxState[3].size   = sizeof(ULong);
   6217 
   6218                d->fxState[4].fx     = Ifx_Write;
   6219                d->fxState[4].offset = OFFB_FC3210;
   6220                d->fxState[4].size   = sizeof(ULong);
   6221 
   6222                stmt( IRStmt_Dirty(d) );
   6223 
   6224                /* ew contains any emulation warning we may need to
   6225                   issue.  If needed, side-exit to the next insn,
   6226                   reporting the warning, so that Valgrind's dispatcher
   6227                   sees the warning. */
   6228                assign(ew, unop(Iop_64to32,mkexpr(w64)) );
   6229                put_emwarn( mkexpr(ew) );
   6230                stmt(
   6231                   IRStmt_Exit(
   6232                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   6233                      Ijk_EmWarn,
   6234                      IRConst_U64( guest_RIP_bbstart+delta ),
   6235                      OFFB_RIP
   6236                   )
   6237                );
   6238 
   6239                if ( have66(pfx) ) {
   6240                   DIP("frstors %s\n", dis_buf);
   6241                } else {
   6242                   DIP("frstor %s\n", dis_buf);
   6243                }
   6244                break;
   6245             }
   6246 
   6247             case 6: { /* FNSAVE m94/m108 */
   6248                IRDirty *d;
   6249                if ( have66(pfx) ) {
   6250                  /* Uses dirty helper:
   6251                     void amd64g_dirtyhelper_FNSAVES ( VexGuestX86State*, HWord ) */
   6252                   d = unsafeIRDirty_0_N (
   6253                          0/*regparms*/,
   6254                          "amd64g_dirtyhelper_FNSAVES",
   6255                          &amd64g_dirtyhelper_FNSAVES,
   6256                          mkIRExprVec_1( mkexpr(addr) )
   6257                          );
   6258                   d->mSize = 94;
   6259                } else {
   6260                  /* Uses dirty helper:
   6261                     void amd64g_dirtyhelper_FNSAVE ( VexGuestX86State*, HWord ) */
   6262                   d = unsafeIRDirty_0_N (
   6263                          0/*regparms*/,
   6264                          "amd64g_dirtyhelper_FNSAVE",
   6265                          &amd64g_dirtyhelper_FNSAVE,
   6266                          mkIRExprVec_1( mkexpr(addr) )
   6267                          );
   6268                   d->mSize = 108;
   6269                }
   6270                d->needsBBP = True;
   6271                /* declare we're writing memory */
   6272                d->mFx   = Ifx_Write;
   6273                d->mAddr = mkexpr(addr);
   6274                /* d->mSize set above */
   6275 
   6276                /* declare we're reading guest state */
   6277                d->nFxState = 5;
   6278                vex_bzero(&d->fxState, sizeof(d->fxState));
   6279 
   6280                d->fxState[0].fx     = Ifx_Read;
   6281                d->fxState[0].offset = OFFB_FTOP;
   6282                d->fxState[0].size   = sizeof(UInt);
   6283 
   6284                d->fxState[1].fx     = Ifx_Read;
   6285                d->fxState[1].offset = OFFB_FPREGS;
   6286                d->fxState[1].size   = 8 * sizeof(ULong);
   6287 
   6288                d->fxState[2].fx     = Ifx_Read;
   6289                d->fxState[2].offset = OFFB_FPTAGS;
   6290                d->fxState[2].size   = 8 * sizeof(UChar);
   6291 
   6292                d->fxState[3].fx     = Ifx_Read;
   6293                d->fxState[3].offset = OFFB_FPROUND;
   6294                d->fxState[3].size   = sizeof(ULong);
   6295 
   6296                d->fxState[4].fx     = Ifx_Read;
   6297                d->fxState[4].offset = OFFB_FC3210;
   6298                d->fxState[4].size   = sizeof(ULong);
   6299 
   6300                stmt( IRStmt_Dirty(d) );
   6301 
   6302                if ( have66(pfx) ) {
   6303                  DIP("fnsaves %s\n", dis_buf);
   6304                } else {
   6305                  DIP("fnsave %s\n", dis_buf);
   6306                }
   6307                break;
   6308             }
   6309 
   6310             case 7: { /* FNSTSW m16 */
   6311                IRExpr* sw = get_FPU_sw();
   6312                vassert(typeOfIRExpr(irsb->tyenv, sw) == Ity_I16);
   6313                storeLE( mkexpr(addr), sw );
   6314                DIP("fnstsw %s\n", dis_buf);
   6315                break;
   6316             }
   6317 
   6318             default:
   6319                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   6320                vex_printf("first_opcode == 0xDD\n");
   6321                goto decode_fail;
   6322          }
   6323       } else {
   6324          delta++;
   6325          switch (modrm) {
   6326 
   6327             case 0xC0 ... 0xC7: /* FFREE %st(?) */
   6328                r_dst = (UInt)modrm - 0xC0;
   6329                DIP("ffree %%st(%u)\n", r_dst);
   6330                put_ST_TAG ( r_dst, mkU8(0) );
   6331                break;
   6332 
   6333             case 0xD0 ... 0xD7: /* FST %st(0),%st(?) */
   6334                r_dst = (UInt)modrm - 0xD0;
   6335                DIP("fst %%st(0),%%st(%u)\n", r_dst);
   6336                /* P4 manual says: "If the destination operand is a
   6337                   non-empty register, the invalid-operation exception
   6338                   is not generated.  Hence put_ST_UNCHECKED. */
   6339                put_ST_UNCHECKED(r_dst, get_ST(0));
   6340                break;
   6341 
   6342             case 0xD8 ... 0xDF: /* FSTP %st(0),%st(?) */
   6343                r_dst = (UInt)modrm - 0xD8;
   6344                DIP("fstp %%st(0),%%st(%u)\n", r_dst);
   6345                /* P4 manual says: "If the destination operand is a
   6346                   non-empty register, the invalid-operation exception
   6347                   is not generated.  Hence put_ST_UNCHECKED. */
   6348                put_ST_UNCHECKED(r_dst, get_ST(0));
   6349                fp_pop();
   6350                break;
   6351 
   6352             case 0xE0 ... 0xE7: /* FUCOM %st(0),%st(?) */
   6353                r_dst = (UInt)modrm - 0xE0;
   6354                DIP("fucom %%st(0),%%st(%u)\n", r_dst);
   6355                /* This forces C1 to zero, which isn't right. */
   6356                put_C3210(
   6357                    unop(Iop_32Uto64,
   6358                    binop( Iop_And32,
   6359                           binop(Iop_Shl32,
   6360                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   6361                                 mkU8(8)),
   6362                           mkU32(0x4500)
   6363                    )));
   6364                break;
   6365 
   6366             case 0xE8 ... 0xEF: /* FUCOMP %st(0),%st(?) */
   6367                r_dst = (UInt)modrm - 0xE8;
   6368                DIP("fucomp %%st(0),%%st(%u)\n", r_dst);
   6369                /* This forces C1 to zero, which isn't right. */
   6370                put_C3210(
   6371                    unop(Iop_32Uto64,
   6372                    binop( Iop_And32,
   6373                           binop(Iop_Shl32,
   6374                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   6375                                 mkU8(8)),
   6376                           mkU32(0x4500)
   6377                    )));
   6378                fp_pop();
   6379                break;
   6380 
   6381             default:
   6382                goto decode_fail;
   6383          }
   6384       }
   6385    }
   6386 
   6387    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDE opcodes +-+-+-+-+-+-+-+ */
   6388    else
   6389    if (first_opcode == 0xDE) {
   6390 
   6391       if (modrm < 0xC0) {
   6392 
   6393          /* bits 5,4,3 are an opcode extension, and the modRM also
   6394             specifies an address. */
   6395          IROp   fop;
   6396          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6397          delta += len;
   6398 
   6399          switch (gregLO3ofRM(modrm)) {
   6400 
   6401             case 0: /* FIADD m16int */ /* ST(0) += m16int */
   6402                DIP("fiaddw %s\n", dis_buf);
   6403                fop = Iop_AddF64;
   6404                goto do_fop_m16;
   6405 
   6406             case 1: /* FIMUL m16int */ /* ST(0) *= m16int */
   6407                DIP("fimulw %s\n", dis_buf);
   6408                fop = Iop_MulF64;
   6409                goto do_fop_m16;
   6410 
   6411             case 4: /* FISUB m16int */ /* ST(0) -= m16int */
   6412                DIP("fisubw %s\n", dis_buf);
   6413                fop = Iop_SubF64;
   6414                goto do_fop_m16;
   6415 
   6416             case 5: /* FISUBR m16int */ /* ST(0) = m16int - ST(0) */
   6417                DIP("fisubrw %s\n", dis_buf);
   6418                fop = Iop_SubF64;
   6419                goto do_foprev_m16;
   6420 
   6421             case 6: /* FIDIV m16int */ /* ST(0) /= m16int */
   6422                DIP("fisubw %s\n", dis_buf);
   6423                fop = Iop_DivF64;
   6424                goto do_fop_m16;
   6425 
   6426             case 7: /* FIDIVR m16int */ /* ST(0) = m16int / ST(0) */
   6427                DIP("fidivrw %s\n", dis_buf);
   6428                fop = Iop_DivF64;
   6429                goto do_foprev_m16;
   6430 
   6431             do_fop_m16:
   6432                put_ST_UNCHECKED(0,
   6433                   triop(fop,
   6434                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6435                         get_ST(0),
   6436                         unop(Iop_I32StoF64,
   6437                              unop(Iop_16Sto32,
   6438                                   loadLE(Ity_I16, mkexpr(addr))))));
   6439                break;
   6440 
   6441             do_foprev_m16:
   6442                put_ST_UNCHECKED(0,
   6443                   triop(fop,
   6444                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6445                         unop(Iop_I32StoF64,
   6446                              unop(Iop_16Sto32,
   6447                                   loadLE(Ity_I16, mkexpr(addr)))),
   6448                         get_ST(0)));
   6449                break;
   6450 
   6451             default:
   6452                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   6453                vex_printf("first_opcode == 0xDE\n");
   6454                goto decode_fail;
   6455          }
   6456 
   6457       } else {
   6458 
   6459          delta++;
   6460          switch (modrm) {
   6461 
   6462             case 0xC0 ... 0xC7: /* FADDP %st(0),%st(?) */
   6463                fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, True );
   6464                break;
   6465 
   6466             case 0xC8 ... 0xCF: /* FMULP %st(0),%st(?) */
   6467                fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, True );
   6468                break;
   6469 
   6470             case 0xD9: /* FCOMPP %st(0),%st(1) */
   6471                DIP("fcompp %%st(0),%%st(1)\n");
   6472                /* This forces C1 to zero, which isn't right. */
   6473                put_C3210(
   6474                    unop(Iop_32Uto64,
   6475                    binop( Iop_And32,
   6476                           binop(Iop_Shl32,
   6477                                 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
   6478                                 mkU8(8)),
   6479                           mkU32(0x4500)
   6480                    )));
   6481                fp_pop();
   6482                fp_pop();
   6483                break;
   6484 
   6485             case 0xE0 ... 0xE7: /* FSUBRP %st(0),%st(?) */
   6486                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0,  modrm - 0xE0, True );
   6487                break;
   6488 
   6489             case 0xE8 ... 0xEF: /* FSUBP %st(0),%st(?) */
   6490                fp_do_op_ST_ST ( "sub", Iop_SubF64, 0,  modrm - 0xE8, True );
   6491                break;
   6492 
   6493             case 0xF0 ... 0xF7: /* FDIVRP %st(0),%st(?) */
   6494                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, True );
   6495                break;
   6496 
   6497             case 0xF8 ... 0xFF: /* FDIVP %st(0),%st(?) */
   6498                fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, True );
   6499                break;
   6500 
   6501             default:
   6502                goto decode_fail;
   6503          }
   6504 
   6505       }
   6506    }
   6507 
   6508    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDF opcodes +-+-+-+-+-+-+-+ */
   6509    else
   6510    if (first_opcode == 0xDF) {
   6511 
   6512       if (modrm < 0xC0) {
   6513 
   6514          /* bits 5,4,3 are an opcode extension, and the modRM also
   6515             specifies an address. */
   6516          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6517          delta += len;
   6518 
   6519          switch (gregLO3ofRM(modrm)) {
   6520 
   6521             case 0: /* FILD m16int */
   6522                DIP("fildw %s\n", dis_buf);
   6523                fp_push();
   6524                put_ST(0, unop(Iop_I32StoF64,
   6525                               unop(Iop_16Sto32,
   6526                                    loadLE(Ity_I16, mkexpr(addr)))));
   6527                break;
   6528 
   6529             case 1: /* FISTTPS m16 (SSE3) */
   6530                DIP("fisttps %s\n", dis_buf);
   6531                storeLE( mkexpr(addr),
   6532                         x87ishly_qnarrow_32_to_16(
   6533                         binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) ));
   6534                fp_pop();
   6535                break;
   6536 
   6537             case 2: /* FIST m16 */
   6538                DIP("fists %s\n", dis_buf);
   6539                storeLE( mkexpr(addr),
   6540                         x87ishly_qnarrow_32_to_16(
   6541                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) ));
   6542                break;
   6543 
   6544             case 3: /* FISTP m16 */
   6545                DIP("fistps %s\n", dis_buf);
   6546                storeLE( mkexpr(addr),
   6547                         x87ishly_qnarrow_32_to_16(
   6548                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) ));
   6549                fp_pop();
   6550                break;
   6551 
   6552             case 5: /* FILD m64 */
   6553                DIP("fildll %s\n", dis_buf);
   6554                fp_push();
   6555                put_ST(0, binop(Iop_I64StoF64,
   6556                                get_roundingmode(),
   6557                                loadLE(Ity_I64, mkexpr(addr))));
   6558                break;
   6559 
   6560             case 7: /* FISTP m64 */
   6561                DIP("fistpll %s\n", dis_buf);
   6562                storeLE( mkexpr(addr),
   6563                         binop(Iop_F64toI64S, get_roundingmode(), get_ST(0)) );
   6564                fp_pop();
   6565                break;
   6566 
   6567             default:
   6568                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   6569                vex_printf("first_opcode == 0xDF\n");
   6570                goto decode_fail;
   6571          }
   6572 
   6573       } else {
   6574 
   6575          delta++;
   6576          switch (modrm) {
   6577 
   6578             case 0xC0: /* FFREEP %st(0) */
   6579                DIP("ffreep %%st(%d)\n", 0);
   6580                put_ST_TAG ( 0, mkU8(0) );
   6581                fp_pop();
   6582                break;
   6583 
   6584             case 0xE0: /* FNSTSW %ax */
   6585                DIP("fnstsw %%ax\n");
   6586                /* Invent a plausible-looking FPU status word value and
   6587                   dump it in %AX:
   6588                      ((ftop & 7) << 11) | (c3210 & 0x4700)
   6589                */
   6590                putIRegRAX(
   6591                   2,
   6592                   unop(Iop_32to16,
   6593                        binop(Iop_Or32,
   6594                              binop(Iop_Shl32,
   6595                                    binop(Iop_And32, get_ftop(), mkU32(7)),
   6596                                    mkU8(11)),
   6597                              binop(Iop_And32,
   6598                                    unop(Iop_64to32, get_C3210()),
   6599                                    mkU32(0x4700))
   6600                )));
   6601                break;
   6602 
   6603             case 0xE8 ... 0xEF: /* FUCOMIP %st(0),%st(?) */
   6604                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, True );
   6605                break;
   6606 
   6607             case 0xF0 ... 0xF7: /* FCOMIP %st(0),%st(?) */
   6608                /* not really right since COMIP != UCOMIP */
   6609                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, True );
   6610                break;
   6611 
   6612             default:
   6613                goto decode_fail;
   6614          }
   6615       }
   6616 
   6617    }
   6618 
   6619    else
   6620       goto decode_fail;
   6621 
   6622    *decode_ok = True;
   6623    return delta;
   6624 
   6625   decode_fail:
   6626    *decode_ok = False;
   6627    return delta;
   6628 }
   6629 
   6630 
   6631 /*------------------------------------------------------------*/
   6632 /*---                                                      ---*/
   6633 /*--- MMX INSTRUCTIONS                                     ---*/
   6634 /*---                                                      ---*/
   6635 /*------------------------------------------------------------*/
   6636 
   6637 /* Effect of MMX insns on x87 FPU state (table 11-2 of
   6638    IA32 arch manual, volume 3):
   6639 
   6640    Read from, or write to MMX register (viz, any insn except EMMS):
   6641    * All tags set to Valid (non-empty) -- FPTAGS[i] := nonzero
   6642    * FP stack pointer set to zero
   6643 
   6644    EMMS:
   6645    * All tags set to Invalid (empty) -- FPTAGS[i] := zero
   6646    * FP stack pointer set to zero
   6647 */
   6648 
   6649 static void do_MMX_preamble ( void )
   6650 {
   6651    Int         i;
   6652    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   6653    IRExpr*     zero  = mkU32(0);
   6654    IRExpr*     tag1  = mkU8(1);
   6655    put_ftop(zero);
   6656    for (i = 0; i < 8; i++)
   6657       stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag1) ) );
   6658 }
   6659 
   6660 static void do_EMMS_preamble ( void )
   6661 {
   6662    Int         i;
   6663    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   6664    IRExpr*     zero  = mkU32(0);
   6665    IRExpr*     tag0  = mkU8(0);
   6666    put_ftop(zero);
   6667    for (i = 0; i < 8; i++)
   6668       stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag0) ) );
   6669 }
   6670 
   6671 
   6672 static IRExpr* getMMXReg ( UInt archreg )
   6673 {
   6674    vassert(archreg < 8);
   6675    return IRExpr_Get( OFFB_FPREGS + 8 * archreg, Ity_I64 );
   6676 }
   6677 
   6678 
   6679 static void putMMXReg ( UInt archreg, IRExpr* e )
   6680 {
   6681    vassert(archreg < 8);
   6682    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   6683    stmt( IRStmt_Put( OFFB_FPREGS + 8 * archreg, e ) );
   6684 }
   6685 
   6686 
/* Helper for non-shift MMX insns.  Note this is incomplete in the
   sense that it does not first call do_MMX_preamble() -- that is the
   responsibility of its caller.

   Decodes the mod/rm byte at |delta|, fetches the E operand (either
   an MMX register or a 64-bit load), combines it with the G register
   using either a native IROp or a clean helper call selected by
   |opc|, and writes the result back to G.  Returns the updated
   |delta|.  |name| and |show_granularity| affect only the
   disassembly printout (DIP). */

static
ULong dis_MMXop_regmem_to_reg ( VexAbiInfo* vbi,
                                Prefix      pfx,
                                Long        delta,
                                UChar       opc,
                                HChar*      name,
                                Bool        show_granularity )
{
   HChar   dis_buf[50];
   UChar   modrm = getUChar(delta);
   Bool    isReg = epartIsReg(modrm);
   IRExpr* argL  = NULL;
   IRExpr* argR  = NULL;
   IRExpr* argG  = NULL;
   IRExpr* argE  = NULL;
   IRTemp  res   = newTemp(Ity_I64);

   Bool    invG  = False;        /* complement G before use (PANDN)? */
   IROp    op    = Iop_INVALID;  /* native IR op, if one exists ... */
   void*   hAddr = NULL;         /* ... else address of helper fn */
   HChar*  hName = NULL;         /* and the helper's name */
   Bool    eLeft = False;        /* if True, E is the left operand */

   /* Route |opc| to a clean helper instead of a native IROp. */
#  define XXX(_name) do { hAddr = &_name; hName = #_name; } while (0)

   switch (opc) {
      /* Original MMX ones */
      case 0xFC: op = Iop_Add8x8; break;
      case 0xFD: op = Iop_Add16x4; break;
      case 0xFE: op = Iop_Add32x2; break;

      case 0xEC: op = Iop_QAdd8Sx8; break;
      case 0xED: op = Iop_QAdd16Sx4; break;

      case 0xDC: op = Iop_QAdd8Ux8; break;
      case 0xDD: op = Iop_QAdd16Ux4; break;

      case 0xF8: op = Iop_Sub8x8;  break;
      case 0xF9: op = Iop_Sub16x4; break;
      case 0xFA: op = Iop_Sub32x2; break;

      case 0xE8: op = Iop_QSub8Sx8; break;
      case 0xE9: op = Iop_QSub16Sx4; break;

      case 0xD8: op = Iop_QSub8Ux8; break;
      case 0xD9: op = Iop_QSub16Ux4; break;

      case 0xE5: op = Iop_MulHi16Sx4; break;
      case 0xD5: op = Iop_Mul16x4; break;
      case 0xF5: XXX(amd64g_calculate_mmx_pmaddwd); break;

      case 0x74: op = Iop_CmpEQ8x8; break;
      case 0x75: op = Iop_CmpEQ16x4; break;
      case 0x76: op = Iop_CmpEQ32x2; break;

      case 0x64: op = Iop_CmpGT8Sx8; break;
      case 0x65: op = Iop_CmpGT16Sx4; break;
      case 0x66: op = Iop_CmpGT32Sx2; break;

      /* Pack/unpack insns take E as the left operand. */
      case 0x6B: op = Iop_QNarrowBin32Sto16Sx4; eLeft = True; break;
      case 0x63: op = Iop_QNarrowBin16Sto8Sx8;  eLeft = True; break;
      case 0x67: op = Iop_QNarrowBin16Sto8Ux8;  eLeft = True; break;

      case 0x68: op = Iop_InterleaveHI8x8;  eLeft = True; break;
      case 0x69: op = Iop_InterleaveHI16x4; eLeft = True; break;
      case 0x6A: op = Iop_InterleaveHI32x2; eLeft = True; break;

      case 0x60: op = Iop_InterleaveLO8x8;  eLeft = True; break;
      case 0x61: op = Iop_InterleaveLO16x4; eLeft = True; break;
      case 0x62: op = Iop_InterleaveLO32x2; eLeft = True; break;

      case 0xDB: op = Iop_And64; break;
      case 0xDF: op = Iop_And64; invG = True; break;  /* PANDN: ~G & E */
      case 0xEB: op = Iop_Or64; break;
      case 0xEF: /* Possibly do better here if argL and argR are the
                    same reg */
                 op = Iop_Xor64; break;

      /* Introduced in SSE1 */
      case 0xE0: op = Iop_Avg8Ux8;    break;
      case 0xE3: op = Iop_Avg16Ux4;   break;
      case 0xEE: op = Iop_Max16Sx4;   break;
      case 0xDE: op = Iop_Max8Ux8;    break;
      case 0xEA: op = Iop_Min16Sx4;   break;
      case 0xDA: op = Iop_Min8Ux8;    break;
      case 0xE4: op = Iop_MulHi16Ux4; break;
      case 0xF6: XXX(amd64g_calculate_mmx_psadbw); break;

      /* Introduced in SSE2 */
      case 0xD4: op = Iop_Add64; break;
      case 0xFB: op = Iop_Sub64; break;

      default:
         vex_printf("\n0x%x\n", (Int)opc);
         vpanic("dis_MMXop_regmem_to_reg");
   }

#  undef XXX

   argG = getMMXReg(gregLO3ofRM(modrm));
   if (invG)
      argG = unop(Iop_Not64, argG);

   /* Fetch E: register form consumes 1 byte, memory form consumes
      the full amode. */
   if (isReg) {
      delta++;
      argE = getMMXReg(eregLO3ofRM(modrm));
   } else {
      Int    len;
      IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
      delta += len;
      argE = loadLE(Ity_I64, mkexpr(addr));
   }

   /* Order the operands as required (matters for non-commutative
      ops such as the pack/unpack family). */
   if (eLeft) {
      argL = argE;
      argR = argG;
   } else {
      argL = argG;
      argR = argE;
   }

   if (op != Iop_INVALID) {
      /* Exactly one of (op, helper) must have been selected. */
      vassert(hName == NULL);
      vassert(hAddr == NULL);
      assign(res, binop(op, argL, argR));
   } else {
      vassert(hName != NULL);
      vassert(hAddr != NULL);
      assign( res,
              mkIRExprCCall(
                 Ity_I64,
                 0/*regparms*/, hName, hAddr,
                 mkIRExprVec_2( argL, argR )
              )
            );
   }

   putMMXReg( gregLO3ofRM(modrm), mkexpr(res) );

   DIP("%s%s %s, %s\n",
       name, show_granularity ? nameMMXGran(opc & 3) : "",
       ( isReg ? nameMMXReg(eregLO3ofRM(modrm)) : dis_buf ),
       nameMMXReg(gregLO3ofRM(modrm)) );

   return delta;
}
   6837 
   6838 
   6839 /* Vector by scalar shift of G by the amount specified at the bottom
   6840    of E.  This is a straight copy of dis_SSE_shiftG_byE. */
   6841 
   6842 static ULong dis_MMX_shiftG_byE ( VexAbiInfo* vbi,
   6843                                   Prefix pfx, Long delta,
   6844                                   HChar* opname, IROp op )
   6845 {
   6846    HChar   dis_buf[50];
   6847    Int     alen, size;
   6848    IRTemp  addr;
   6849    Bool    shl, shr, sar;
   6850    UChar   rm   = getUChar(delta);
   6851    IRTemp  g0   = newTemp(Ity_I64);
   6852    IRTemp  g1   = newTemp(Ity_I64);
   6853    IRTemp  amt  = newTemp(Ity_I64);
   6854    IRTemp  amt8 = newTemp(Ity_I8);
   6855 
   6856    if (epartIsReg(rm)) {
   6857       assign( amt, getMMXReg(eregLO3ofRM(rm)) );
   6858       DIP("%s %s,%s\n", opname,
   6859                         nameMMXReg(eregLO3ofRM(rm)),
   6860                         nameMMXReg(gregLO3ofRM(rm)) );
   6861       delta++;
   6862    } else {
   6863       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   6864       assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
   6865       DIP("%s %s,%s\n", opname,
   6866                         dis_buf,
   6867                         nameMMXReg(gregLO3ofRM(rm)) );
   6868       delta += alen;
   6869    }
   6870    assign( g0,   getMMXReg(gregLO3ofRM(rm)) );
   6871    assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
   6872 
   6873    shl = shr = sar = False;
   6874    size = 0;
   6875    switch (op) {
   6876       case Iop_ShlN16x4: shl = True; size = 32; break;
   6877       case Iop_ShlN32x2: shl = True; size = 32; break;
   6878       case Iop_Shl64:    shl = True; size = 64; break;
   6879       case Iop_ShrN16x4: shr = True; size = 16; break;
   6880       case Iop_ShrN32x2: shr = True; size = 32; break;
   6881       case Iop_Shr64:    shr = True; size = 64; break;
   6882       case Iop_SarN16x4: sar = True; size = 16; break;
   6883       case Iop_SarN32x2: sar = True; size = 32; break;
   6884       default: vassert(0);
   6885    }
   6886 
   6887    if (shl || shr) {
   6888      assign(
   6889         g1,
   6890         IRExpr_Mux0X(
   6891            unop(Iop_1Uto8,binop(Iop_CmpLT64U,mkexpr(amt),mkU64(size))),
   6892            mkU64(0),
   6893            binop(op, mkexpr(g0), mkexpr(amt8))
   6894         )
   6895      );
   6896    } else
   6897    if (sar) {
   6898      assign(
   6899         g1,
   6900         IRExpr_Mux0X(
   6901            unop(Iop_1Uto8,binop(Iop_CmpLT64U,mkexpr(amt),mkU64(size))),
   6902            binop(op, mkexpr(g0), mkU8(size-1)),
   6903            binop(op, mkexpr(g0), mkexpr(amt8))
   6904         )
   6905      );
   6906    } else {
   6907       vassert(0);
   6908    }
   6909 
   6910    putMMXReg( gregLO3ofRM(rm), mkexpr(g1) );
   6911    return delta;
   6912 }
   6913 
   6914 
   6915 /* Vector by scalar shift of E by an immediate byte.  This is a
   6916    straight copy of dis_SSE_shiftE_imm. */
   6917 
   6918 static
   6919 ULong dis_MMX_shiftE_imm ( Long delta, HChar* opname, IROp op )
   6920 {
   6921    Bool    shl, shr, sar;
   6922    UChar   rm   = getUChar(delta);
   6923    IRTemp  e0   = newTemp(Ity_I64);
   6924    IRTemp  e1   = newTemp(Ity_I64);
   6925    UChar   amt, size;
   6926    vassert(epartIsReg(rm));
   6927    vassert(gregLO3ofRM(rm) == 2
   6928            || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
   6929    amt = getUChar(delta+1);
   6930    delta += 2;
   6931    DIP("%s $%d,%s\n", opname,
   6932                       (Int)amt,
   6933                       nameMMXReg(eregLO3ofRM(rm)) );
   6934 
   6935    assign( e0, getMMXReg(eregLO3ofRM(rm)) );
   6936 
   6937    shl = shr = sar = False;
   6938    size = 0;
   6939    switch (op) {
   6940       case Iop_ShlN16x4: shl = True; size = 16; break;
   6941       case Iop_ShlN32x2: shl = True; size = 32; break;
   6942       case Iop_Shl64:    shl = True; size = 64; break;
   6943       case Iop_SarN16x4: sar = True; size = 16; break;
   6944       case Iop_SarN32x2: sar = True; size = 32; break;
   6945       case Iop_ShrN16x4: shr = True; size = 16; break;
   6946       case Iop_ShrN32x2: shr = True; size = 32; break;
   6947       case Iop_Shr64:    shr = True; size = 64; break;
   6948       default: vassert(0);
   6949    }
   6950 
   6951    if (shl || shr) {
   6952      assign( e1, amt >= size
   6953                     ? mkU64(0)
   6954                     : binop(op, mkexpr(e0), mkU8(amt))
   6955      );
   6956    } else
   6957    if (sar) {
   6958      assign( e1, amt >= size
   6959                     ? binop(op, mkexpr(e0), mkU8(size-1))
   6960                     : binop(op, mkexpr(e0), mkU8(amt))
   6961      );
   6962    } else {
   6963       vassert(0);
   6964    }
   6965 
   6966    putMMXReg( eregLO3ofRM(rm), mkexpr(e1) );
   6967    return delta;
   6968 }
   6969 
   6970 
   6971 /* Completely handle all MMX instructions except emms. */
   6972 
/* Disassemble one MMX instruction (every MMX insn except EMMS, which
   is handled elsewhere).  On entry 'delta' points at the opcode byte,
   the 0F escape having already been consumed.  On a successful decode,
   sets *decode_ok to True and returns the delta of the following
   instruction; on failure sets *decode_ok to False and the returned
   delta is ignored by the caller. */
static
ULong dis_MMX ( Bool* decode_ok,
                VexAbiInfo* vbi, Prefix pfx, Int sz, Long delta )
{
   Int   len;
   UChar modrm;
   HChar dis_buf[50];
   UChar opc = getUChar(delta);
   delta++;

   /* dis_MMX handles all insns except emms. */
   do_MMX_preamble();

   switch (opc) {

      case 0x6E:
         if (sz == 4) {
            /* MOVD (src)ireg32-or-mem32 (E), (dst)mmxreg (G)*/
            modrm = getUChar(delta);
            if (epartIsReg(modrm)) {
               delta++;
               /* zero-extend the 32-bit source into the 64-bit MMX reg */
               putMMXReg(
                  gregLO3ofRM(modrm),
                  binop( Iop_32HLto64,
                         mkU32(0),
                         getIReg32(eregOfRexRM(pfx,modrm)) ) );
               DIP("movd %s, %s\n",
                   nameIReg32(eregOfRexRM(pfx,modrm)),
                   nameMMXReg(gregLO3ofRM(modrm)));
            } else {
               IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
               delta += len;
               putMMXReg(
                  gregLO3ofRM(modrm),
                  binop( Iop_32HLto64,
                         mkU32(0),
                         loadLE(Ity_I32, mkexpr(addr)) ) );
               DIP("movd %s, %s\n", dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
            }
         }
         else
         if (sz == 8) {
            /* MOVD (src)ireg64-or-mem64 (E), (dst)mmxreg (G)*/
            modrm = getUChar(delta);
            if (epartIsReg(modrm)) {
               delta++;
               putMMXReg( gregLO3ofRM(modrm),
                          getIReg64(eregOfRexRM(pfx,modrm)) );
               DIP("movd %s, %s\n",
                   nameIReg64(eregOfRexRM(pfx,modrm)),
                   nameMMXReg(gregLO3ofRM(modrm)));
            } else {
               IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
               delta += len;
               putMMXReg( gregLO3ofRM(modrm),
                          loadLE(Ity_I64, mkexpr(addr)) );
               DIP("movd{64} %s, %s\n", dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
            }
         }
         else {
            goto mmx_decode_failure;
         }
         break;

      case 0x7E:
         if (sz == 4) {
            /* MOVD (src)mmxreg (G), (dst)ireg32-or-mem32 (E) */
            modrm = getUChar(delta);
            if (epartIsReg(modrm)) {
               delta++;
               putIReg32( eregOfRexRM(pfx,modrm),
                          unop(Iop_64to32, getMMXReg(gregLO3ofRM(modrm)) ) );
               DIP("movd %s, %s\n",
                   nameMMXReg(gregLO3ofRM(modrm)),
                   nameIReg32(eregOfRexRM(pfx,modrm)));
            } else {
               IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
               delta += len;
               storeLE( mkexpr(addr),
                        unop(Iop_64to32, getMMXReg(gregLO3ofRM(modrm)) ) );
               DIP("movd %s, %s\n", nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
            }
         }
         else
         if (sz == 8) {
            /* MOVD (src)mmxreg (G), (dst)ireg64-or-mem64 (E) */
            modrm = getUChar(delta);
            if (epartIsReg(modrm)) {
               delta++;
               putIReg64( eregOfRexRM(pfx,modrm),
                          getMMXReg(gregLO3ofRM(modrm)) );
               DIP("movd %s, %s\n",
                   nameMMXReg(gregLO3ofRM(modrm)),
                   nameIReg64(eregOfRexRM(pfx,modrm)));
            } else {
               IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
               delta += len;
               storeLE( mkexpr(addr),
                       getMMXReg(gregLO3ofRM(modrm)) );
               DIP("movd{64} %s, %s\n", nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
            }
         } else {
            goto mmx_decode_failure;
         }
         break;

      case 0x6F:
         /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4
             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
            goto mmx_decode_failure;
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta++;
            putMMXReg( gregLO3ofRM(modrm), getMMXReg(eregLO3ofRM(modrm)) );
            DIP("movq %s, %s\n",
                nameMMXReg(eregLO3ofRM(modrm)),
                nameMMXReg(gregLO3ofRM(modrm)));
         } else {
            IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
            delta += len;
            putMMXReg( gregLO3ofRM(modrm), loadLE(Ity_I64, mkexpr(addr)) );
            DIP("movq %s, %s\n",
                dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
         }
         break;

      case 0x7F:
         /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
         if (sz != 4
             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
            goto mmx_decode_failure;
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta++;
            putMMXReg( eregLO3ofRM(modrm), getMMXReg(gregLO3ofRM(modrm)) );
            DIP("movq %s, %s\n",
                nameMMXReg(gregLO3ofRM(modrm)),
                nameMMXReg(eregLO3ofRM(modrm)));
         } else {
            IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
            delta += len;
            storeLE( mkexpr(addr), getMMXReg(gregLO3ofRM(modrm)) );
            DIP("mov(nt)q %s, %s\n",
                nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
         }
         break;

      case 0xFC:
      case 0xFD:
      case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "padd", True );
         break;

      case 0xEC:
      case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4
             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "padds", True );
         break;

      case 0xDC:
      case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "paddus", True );
         break;

      case 0xF8:
      case 0xF9:
      case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psub", True );
         break;

      case 0xE8:
      case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psubs", True );
         break;

      case 0xD8:
      case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psubus", True );
         break;

      case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmulhw", False );
         break;

      case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmullw", False );
         break;

      case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
         /* NOTE(review): this case asserts rather than branching to
            mmx_decode_failure like its siblings — confirm sz==4 is
            guaranteed by the caller here. */
         vassert(sz == 4);
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmaddwd", False );
         break;

      case 0x74:
      case 0x75:
      case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pcmpeq", True );
         break;

      case 0x64:
      case 0x65:
      case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pcmpgt", True );
         break;

      case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packssdw", False );
         break;

      case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packsswb", False );
         break;

      case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packuswb", False );
         break;

      case 0x68:
      case 0x69:
      case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4
             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "punpckh", True );
         break;

      case 0x60:
      case 0x61:
      case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4
             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "punpckl", True );
         break;

      case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pand", False );
         break;

      case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pandn", False );
         break;

      case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "por", False );
         break;

      case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pxor", False );
         break;

      /* Shift of an MMX register by the low byte of another
         mmxreg-or-mem operand.  The macro expands to the delegating
         call plus the case's 'break'. */
#     define SHIFT_BY_REG(_name,_op)                                     \
                delta = dis_MMX_shiftG_byE(vbi, pfx, delta, _name, _op); \
                break;

      /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
      case 0xF1: SHIFT_BY_REG("psllw", Iop_ShlN16x4);
      case 0xF2: SHIFT_BY_REG("pslld", Iop_ShlN32x2);
      case 0xF3: SHIFT_BY_REG("psllq", Iop_Shl64);

      /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
      case 0xD1: SHIFT_BY_REG("psrlw", Iop_ShrN16x4);
      case 0xD2: SHIFT_BY_REG("psrld", Iop_ShrN32x2);
      case 0xD3: SHIFT_BY_REG("psrlq", Iop_Shr64);

      /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
      case 0xE1: SHIFT_BY_REG("psraw", Iop_SarN16x4);
      case 0xE2: SHIFT_BY_REG("psrad", Iop_SarN32x2);

#     undef SHIFT_BY_REG

      case 0x71:
      case 0x72:
      case 0x73: {
         /* (sz==4): PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
         /* The shift kind is selected by the reg field of the modrm
            byte (sub-opcode), the element size by the opcode itself. */
         UChar byte2, subopc;
         if (sz != 4)
            goto mmx_decode_failure;
         byte2  = getUChar(delta);      /* amode / sub-opcode */
         subopc = toUChar( (byte2 >> 3) & 7 );

#        define SHIFT_BY_IMM(_name,_op)                        \
            do { delta = dis_MMX_shiftE_imm(delta,_name,_op);  \
            } while (0)

              if (subopc == 2 /*SRL*/ && opc == 0x71)
                  SHIFT_BY_IMM("psrlw", Iop_ShrN16x4);
         else if (subopc == 2 /*SRL*/ && opc == 0x72)
                 SHIFT_BY_IMM("psrld", Iop_ShrN32x2);
         else if (subopc == 2 /*SRL*/ && opc == 0x73)
                 SHIFT_BY_IMM("psrlq", Iop_Shr64);

         else if (subopc == 4 /*SAR*/ && opc == 0x71)
                 SHIFT_BY_IMM("psraw", Iop_SarN16x4);
         else if (subopc == 4 /*SAR*/ && opc == 0x72)
                 SHIFT_BY_IMM("psrad", Iop_SarN32x2);

         else if (subopc == 6 /*SHL*/ && opc == 0x71)
                 SHIFT_BY_IMM("psllw", Iop_ShlN16x4);
         else if (subopc == 6 /*SHL*/ && opc == 0x72)
                  SHIFT_BY_IMM("pslld", Iop_ShlN32x2);
         else if (subopc == 6 /*SHL*/ && opc == 0x73)
                 SHIFT_BY_IMM("psllq", Iop_Shl64);

         else goto mmx_decode_failure;

#        undef SHIFT_BY_IMM
         break;
      }

      case 0xF7: {
         /* MASKMOVQ: byte-wise conditional store of regD at [RDI],
            selecting regD bytes where the corresponding regM byte has
            its top bit set, otherwise keeping the old memory bytes.
            Expressed here as a read-modify-write of the whole qword. */
         IRTemp addr    = newTemp(Ity_I64);
         IRTemp regD    = newTemp(Ity_I64);
         IRTemp regM    = newTemp(Ity_I64);
         IRTemp mask    = newTemp(Ity_I64);
         IRTemp olddata = newTemp(Ity_I64);
         IRTemp newdata = newTemp(Ity_I64);

         modrm = getUChar(delta);
         if (sz != 4 || (!epartIsReg(modrm)))
            goto mmx_decode_failure;
         delta++;

         assign( addr, handleAddrOverrides( vbi, pfx, getIReg64(R_RDI) ));
         assign( regM, getMMXReg( eregLO3ofRM(modrm) ));
         assign( regD, getMMXReg( gregLO3ofRM(modrm) ));
         /* replicate each regM byte's sign bit across the byte */
         assign( mask, binop(Iop_SarN8x8, mkexpr(regM), mkU8(7)) );
         assign( olddata, loadLE( Ity_I64, mkexpr(addr) ));
         assign( newdata,
                 binop(Iop_Or64,
                       binop(Iop_And64,
                             mkexpr(regD),
                             mkexpr(mask) ),
                       binop(Iop_And64,
                             mkexpr(olddata),
                             unop(Iop_Not64, mkexpr(mask)))) );
         storeLE( mkexpr(addr), mkexpr(newdata) );
         DIP("maskmovq %s,%s\n", nameMMXReg( eregLO3ofRM(modrm) ),
                                 nameMMXReg( gregLO3ofRM(modrm) ) );
         break;
      }

      /* --- MMX decode failure --- */
      default:
      mmx_decode_failure:
         *decode_ok = False;
         return delta; /* ignored */

   }

   *decode_ok = True;
   return delta;
}
   7361 
   7362 
   7363 /*------------------------------------------------------------*/
   7364 /*--- More misc arithmetic and other obscure insns.        ---*/
   7365 /*------------------------------------------------------------*/
   7366 
   7367 /* Generate base << amt with vacated places filled with stuff
   7368    from xtra.  amt guaranteed in 0 .. 63. */
   7369 static
   7370 IRExpr* shiftL64_with_extras ( IRTemp base, IRTemp xtra, IRTemp amt )
   7371 {
   7372    /* if   amt == 0
   7373       then base
   7374       else (base << amt) | (xtra >>u (64-amt))
   7375    */
   7376    return
   7377       IRExpr_Mux0X(
   7378          mkexpr(amt),
   7379          mkexpr(base),
   7380          binop(Iop_Or64,
   7381                binop(Iop_Shl64, mkexpr(base), mkexpr(amt)),
   7382                binop(Iop_Shr64, mkexpr(xtra),
   7383                                 binop(Iop_Sub8, mkU8(64), mkexpr(amt)))
   7384          )
   7385       );
   7386 }
   7387 
   7388 /* Generate base >>u amt with vacated places filled with stuff
   7389    from xtra.  amt guaranteed in 0 .. 63. */
   7390 static
   7391 IRExpr* shiftR64_with_extras ( IRTemp xtra, IRTemp base, IRTemp amt )
   7392 {
   7393    /* if   amt == 0
   7394       then base
   7395       else (base >>u amt) | (xtra << (64-amt))
   7396    */
   7397    return
   7398       IRExpr_Mux0X(
   7399          mkexpr(amt),
   7400          mkexpr(base),
   7401          binop(Iop_Or64,
   7402                binop(Iop_Shr64, mkexpr(base), mkexpr(amt)),
   7403                binop(Iop_Shl64, mkexpr(xtra),
   7404                                 binop(Iop_Sub8, mkU8(64), mkexpr(amt)))
   7405          )
   7406       );
   7407 }
   7408 
   7409 /* Double length left and right shifts.  Apparently only required in
   7410    v-size (no b- variant). */
/* Disassemble SHLD/SHRD Gv,Ev (double-length shifts).  The E operand
   (reg or mem) is the destination; the G register supplies the bits
   shifted in and is left unchanged.  shift_amt :: Ity_I8 is the shift
   count expression; amt_is_literal says whether it came from an imm8
   (in which case one extra byte is consumed at the end).  Returns the
   updated delta. */
static
ULong dis_SHLRD_Gv_Ev ( VexAbiInfo* vbi,
                        Prefix pfx,
                        Long delta, UChar modrm,
                        Int sz,
                        IRExpr* shift_amt,
                        Bool amt_is_literal,
                        HChar* shift_amt_txt,
                        Bool left_shift )
{
   /* shift_amt :: Ity_I8 is the amount to shift.  shift_amt_txt is used
      for printing it.   And eip on entry points at the modrm byte. */
   Int len;
   HChar dis_buf[50];

   IRType ty     = szToITy(sz);
   IRTemp gsrc   = newTemp(ty);
   IRTemp esrc   = newTemp(ty);
   IRTemp addr   = IRTemp_INVALID;
   IRTemp tmpSH  = newTemp(Ity_I8);
   IRTemp tmpSS  = newTemp(Ity_I8);
   IRTemp tmp64  = IRTemp_INVALID;
   IRTemp res64  = IRTemp_INVALID;
   IRTemp rss64  = IRTemp_INVALID;
   IRTemp resTy  = IRTemp_INVALID;
   IRTemp rssTy  = IRTemp_INVALID;
   /* Shift counts are masked to 6 bits for 64-bit operands, 5 bits
      otherwise. */
   Int    mask   = sz==8 ? 63 : 31;

   vassert(sz == 2 || sz == 4 || sz == 8);

   /* The E-part is the destination; this is shifted.  The G-part
      supplies bits to be shifted into the E-part, but is not
      changed.

      If shifting left, form a double-length word with E at the top
      and G at the bottom, and shift this left.  The result is then in
      the high part.

      If shifting right, form a double-length word with G at the top
      and E at the bottom, and shift this right.  The result is then
      at the bottom.  */

   /* Fetch the operands. */

   assign( gsrc, getIRegG(sz, pfx, modrm) );

   if (epartIsReg(modrm)) {
      delta++;
      assign( esrc, getIRegE(sz, pfx, modrm) );
      DIP("sh%cd%c %s, %s, %s\n",
          ( left_shift ? 'l' : 'r' ), nameISize(sz),
          shift_amt_txt,
          nameIRegG(sz, pfx, modrm), nameIRegE(sz, pfx, modrm));
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf,
                        /* # bytes following amode */
                        amt_is_literal ? 1 : 0 );
      delta += len;
      assign( esrc, loadLE(ty, mkexpr(addr)) );
      DIP("sh%cd%c %s, %s, %s\n",
          ( left_shift ? 'l' : 'r' ), nameISize(sz),
          shift_amt_txt,
          nameIRegG(sz, pfx, modrm), dis_buf);
   }

   /* Calculate the masked shift amount (tmpSH), the masked subshift
      amount (tmpSS), the shifted value (res64) and the subshifted
      value (rss64). */
   /* tmpSS = tmpSH - 1, masked: the "one less" shift used to compute
      the carry for the flags thunk. */

   assign( tmpSH, binop(Iop_And8, shift_amt, mkU8(mask)) );
   assign( tmpSS, binop(Iop_And8,
                        binop(Iop_Sub8, mkexpr(tmpSH), mkU8(1) ),
                        mkU8(mask)));

   tmp64 = newTemp(Ity_I64);
   res64 = newTemp(Ity_I64);
   rss64 = newTemp(Ity_I64);

   if (sz == 2 || sz == 4) {

      /* G is xtra; E is data */
      /* what a freaking nightmare: */
      if (sz == 4 && left_shift) {
         assign( tmp64, binop(Iop_32HLto64, mkexpr(esrc), mkexpr(gsrc)) );
         assign( res64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)),
                       mkU8(32)) );
         assign( rss64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSS)),
                       mkU8(32)) );
      }
      else
      if (sz == 4 && !left_shift) {
         assign( tmp64, binop(Iop_32HLto64, mkexpr(gsrc), mkexpr(esrc)) );
         assign( res64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSH)) );
         assign( rss64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSS)) );
      }
      else
      if (sz == 2 && left_shift) {
         assign( tmp64,
                 binop(Iop_32HLto64,
                       binop(Iop_16HLto32, mkexpr(esrc), mkexpr(gsrc)),
                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc))
         ));
         /* result formed by shifting [esrc'gsrc'gsrc'gsrc] */
         assign( res64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)),
                       mkU8(48)) );
         /* subshift formed by shifting [esrc'0000'0000'0000] */
         assign( rss64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64,
                             binop(Iop_Shl64, unop(Iop_16Uto64, mkexpr(esrc)),
                                              mkU8(48)),
                             mkexpr(tmpSS)),
                       mkU8(48)) );
      }
      else
      if (sz == 2 && !left_shift) {
         assign( tmp64,
                 binop(Iop_32HLto64,
                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc)),
                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(esrc))
         ));
         /* result formed by shifting [gsrc'gsrc'gsrc'esrc] */
         assign( res64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSH)) );
         /* subshift formed by shifting [0000'0000'0000'esrc] */
         assign( rss64, binop(Iop_Shr64,
                              unop(Iop_16Uto64, mkexpr(esrc)),
                              mkexpr(tmpSS)) );
      }

   } else {

      vassert(sz == 8);
      if (left_shift) {
         assign( res64, shiftL64_with_extras( esrc, gsrc, tmpSH ));
         assign( rss64, shiftL64_with_extras( esrc, gsrc, tmpSS ));
      } else {
         assign( res64, shiftR64_with_extras( gsrc, esrc, tmpSH ));
         assign( rss64, shiftR64_with_extras( gsrc, esrc, tmpSS ));
      }

   }

   /* Narrow the 64-bit results back to the operand type. */
   resTy = newTemp(ty);
   rssTy = newTemp(ty);
   assign( resTy, narrowTo(ty, mkexpr(res64)) );
   assign( rssTy, narrowTo(ty, mkexpr(rss64)) );

   /* Put result back and write the flags thunk. */
   setFlags_DEP1_DEP2_shift ( left_shift ? Iop_Shl64 : Iop_Sar64,
                              resTy, rssTy, ty, tmpSH );

   if (epartIsReg(modrm)) {
      putIRegE(sz, pfx, modrm, mkexpr(resTy));
   } else {
      storeLE( mkexpr(addr), mkexpr(resTy) );
   }

   /* consume the imm8 shift count, if there was one */
   if (amt_is_literal) delta++;
   return delta;
}
   7577 
   7578 
   7579 /* Handle BT/BTS/BTR/BTC Gv, Ev.  Apparently b-size is not
   7580    required. */
   7581 
/* Which member of the bit-test family: plain BT (test only), BTS
   (test-and-set), BTR (test-and-reset) or BTC (test-and-complement). */
typedef enum { BtOpNone, BtOpSet, BtOpReset, BtOpComp } BtOp;
   7583 
   7584 static HChar* nameBtOp ( BtOp op )
   7585 {
   7586    switch (op) {
   7587       case BtOpNone:  return "";
   7588       case BtOpSet:   return "s";
   7589       case BtOpReset: return "r";
   7590       case BtOpComp:  return "c";
   7591       default: vpanic("nameBtOp(amd64)");
   7592    }
   7593 }
   7594 
   7595 
/* Disassemble BT/BTS/BTR/BTC Gv,Ev.  The G register holds the bit
   number; E (reg or mem) holds the bit string operated on.  All four
   variants copy the selected bit into the carry flag; the S/R/C
   variants additionally set/reset/complement it.  Returns the updated
   delta. */
static
ULong dis_bt_G_E ( VexAbiInfo* vbi,
                   Prefix pfx, Int sz, Long delta, BtOp op )
{
   HChar  dis_buf[50];
   UChar  modrm;
   Int    len;
   IRTemp t_fetched, t_bitno0, t_bitno1, t_bitno2, t_addr0,
     t_addr1, t_rsp, t_mask, t_new;

   vassert(sz == 2 || sz == 4 || sz == 8);

   t_fetched = t_bitno0 = t_bitno1 = t_bitno2
             = t_addr0 = t_addr1 = t_rsp
             = t_mask = t_new = IRTemp_INVALID;

   t_fetched = newTemp(Ity_I8);
   t_new     = newTemp(Ity_I8);
   t_bitno0  = newTemp(Ity_I64);
   t_bitno1  = newTemp(Ity_I64);
   t_bitno2  = newTemp(Ity_I8);
   t_addr1   = newTemp(Ity_I64);
   modrm     = getUChar(delta);

   /* Bit number is signed-widened to 64 bits: for the memory form it
      may be negative, addressing bytes below the base address. */
   assign( t_bitno0, widenSto64(getIRegG(sz, pfx, modrm)) );

   if (epartIsReg(modrm)) {
      delta++;
      /* Get it onto the client's stack.  Oh, this is a horrible
         kludge.  See https://bugs.kde.org/show_bug.cgi?id=245925.
         Because of the ELF ABI stack redzone, there may be live data
         up to 128 bytes below %RSP.  So we can't just push it on the
         stack, else we may wind up trashing live data, and causing
         impossible-to-find simulation errors.  (Yes, this did
         happen.)  So we need to drop RSP before at least 128 before
         pushing it.  That unfortunately means hitting Memcheck's
         fast-case painting code.  Ideally we should drop more than
         128, to reduce the chances of breaking buggy programs that
         have live data below -128(%RSP).  Memcheck fast-cases moves
         of 288 bytes due to the need to handle ppc64-linux quickly,
         so let's use 288.  Of course the real fix is to get rid of
         this kludge entirely.  */
      t_rsp = newTemp(Ity_I64);
      t_addr0 = newTemp(Ity_I64);

      vassert(vbi->guest_stack_redzone_size == 128);
      assign( t_rsp, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(288)) );
      putIReg64(R_RSP, mkexpr(t_rsp));

      storeLE( mkexpr(t_rsp), getIRegE(sz, pfx, modrm) );

      /* Make t_addr0 point at it. */
      assign( t_addr0, mkexpr(t_rsp) );

      /* Mask out upper bits of the shift amount, since we're doing a
         reg. */
      assign( t_bitno1, binop(Iop_And64,
                              mkexpr(t_bitno0),
                              mkU64(sz == 8 ? 63 : sz == 4 ? 31 : 15)) );

   } else {
      /* Memory form: the bit number is NOT masked. */
      t_addr0 = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
      delta += len;
      assign( t_bitno1, mkexpr(t_bitno0) );
   }

   /* At this point: t_addr0 is the address being operated on.  If it
      was a reg, we will have pushed it onto the client's stack.
      t_bitno1 is the bit number, suitably masked in the case of a
      reg.  */

   /* Now the main sequence. */
   /* Arithmetic (signed) shift by 3: byte offset of the bit, which may
      be negative since t_bitno1 was sign-widened above. */
   assign( t_addr1,
           binop(Iop_Add64,
                 mkexpr(t_addr0),
                 binop(Iop_Sar64, mkexpr(t_bitno1), mkU8(3))) );

   /* t_addr1 now holds effective address */

   assign( t_bitno2,
           unop(Iop_64to8,
                binop(Iop_And64, mkexpr(t_bitno1), mkU64(7))) );

   /* t_bitno2 contains offset of bit within byte */

   if (op != BtOpNone) {
      t_mask = newTemp(Ity_I8);
      assign( t_mask, binop(Iop_Shl8, mkU8(1), mkexpr(t_bitno2)) );
   }

   /* t_mask is now a suitable byte mask */

   assign( t_fetched, loadLE(Ity_I8, mkexpr(t_addr1)) );

   if (op != BtOpNone) {
      switch (op) {
         case BtOpSet:
            assign( t_new,
                    binop(Iop_Or8, mkexpr(t_fetched), mkexpr(t_mask)) );
            break;
         case BtOpComp:
            assign( t_new,
                    binop(Iop_Xor8, mkexpr(t_fetched), mkexpr(t_mask)) );
            break;
         case BtOpReset:
            assign( t_new,
                    binop(Iop_And8, mkexpr(t_fetched),
                                    unop(Iop_Not8, mkexpr(t_mask))) );
            break;
         default:
            vpanic("dis_bt_G_E(amd64)");
      }
      /* LOCK-prefixed memory forms must update the byte atomically. */
      if ((pfx & PFX_LOCK) && !epartIsReg(modrm)) {
         casLE( mkexpr(t_addr1), mkexpr(t_fetched)/*expd*/,
                                 mkexpr(t_new)/*new*/,
                                 guest_RIP_curr_instr );
      } else {
         storeLE( mkexpr(t_addr1), mkexpr(t_new) );
      }
   }

   /* Side effect done; now get selected bit into Carry flag */
   /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop(Iop_And64,
                  binop(Iop_Shr64,
                        unop(Iop_8Uto64, mkexpr(t_fetched)),
                        mkexpr(t_bitno2)),
                  mkU64(1)))
       );
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

   /* Move reg operand from stack back to reg */
   if (epartIsReg(modrm)) {
      /* t_rsp still points at it. */
      /* only write the reg if actually modifying it; doing otherwise
         zeroes the top half erroneously when doing btl due to
         standard zero-extend rule */
      if (op != BtOpNone)
         putIRegE(sz, pfx, modrm, loadLE(szToITy(sz), mkexpr(t_rsp)) );
      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t_rsp), mkU64(288)) );
   }

   DIP("bt%s%c %s, %s\n",
       nameBtOp(op), nameISize(sz), nameIRegG(sz, pfx, modrm),
       ( epartIsReg(modrm) ? nameIRegE(sz, pfx, modrm) : dis_buf ) );

   return delta;
}
   7750 
   7751 
   7752 
/* Handle BSF/BSR.  Only v-size seems necessary.

   Decodes "bsf/bsr E, G" at guest offset 'delta'.  E (reg-or-mem,
   operand size 'sz' in {2,4,8}) is scanned for the lowest set bit
   (fwds==True, BSF) or the highest set bit (fwds==False, BSR), and
   the bit index is written to the G register.  When the source is
   zero, G is rewritten with its own old value (dst unchanged).
   Returns 'delta' advanced past the decoded operand.

   Flags: Z is set iff the source is zero; O,S,A,C,P are forced to
   zero here (architecturally they are undefined). */
static
ULong dis_bs_E_G ( VexAbiInfo* vbi,
                   Prefix pfx, Int sz, Long delta, Bool fwds )
{
   Bool   isReg;
   UChar  modrm;
   HChar  dis_buf[50];

   IRType ty    = szToITy(sz);
   IRTemp src   = newTemp(ty);      /* E operand at native width */
   IRTemp dst   = newTemp(ty);      /* result, narrowed back to 'ty' */
   IRTemp src64 = newTemp(Ity_I64); /* E operand zero-widened to 64 */
   IRTemp dst64 = newTemp(Ity_I64); /* result computed at 64 bits */
   IRTemp src8  = newTemp(Ity_I8);  /* nonzero iff src != 0 */

   vassert(sz == 8 || sz == 4 || sz == 2);

   /* Fetch the E operand, from register or memory. */
   modrm = getUChar(delta);
   isReg = epartIsReg(modrm);
   if (isReg) {
      delta++;
      assign( src, getIRegE(sz, pfx, modrm) );
   } else {
      Int    len;
      IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
      delta += len;
      assign( src, loadLE(ty, mkexpr(addr)) );
   }

   DIP("bs%c%c %s, %s\n",
       fwds ? 'f' : 'r', nameISize(sz),
       ( isReg ? nameIRegE(sz, pfx, modrm) : dis_buf ),
       nameIRegG(sz, pfx, modrm));

   /* First, widen src to 64 bits if it is not already. */
   assign( src64, widenUto64(mkexpr(src)) );

   /* Generate an 8-bit expression which is zero iff the
      original is zero, and nonzero otherwise */
   assign( src8,
           unop(Iop_1Uto8,
                binop(Iop_CmpNE64,
                      mkexpr(src64), mkU64(0))) );

   /* Flags: Z is 1 iff source value is zero.  All others
      are undefined -- we force them to zero. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            IRExpr_Mux0X( mkexpr(src8),
                          /* src==0 */
                          mkU64(AMD64G_CC_MASK_Z),
                          /* src!=0 */
                          mkU64(0)
                        )
       ));
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

   /* Result: iff source value is zero, we can't use
      Iop_Clz64/Iop_Ctz64 as they have no defined result in that case.
      But anyway, amd64 semantics say the result is undefined in
      such situations.  Hence handle the zero case specially. */

   /* Bleh.  What we compute:

          bsf64:  if src == 0 then {dst is unchanged}
                              else Ctz64(src)

          bsr64:  if src == 0 then {dst is unchanged}
                              else 63 - Clz64(src)

          bsf32:  if src == 0 then {dst is unchanged}
                              else Ctz64(32Uto64(src))

          bsr32:  if src == 0 then {dst is unchanged}
                              else 63 - Clz64(32Uto64(src))

          bsf16:  if src == 0 then {dst is unchanged}
                              else Ctz64(32Uto64(16Uto32(src)))

          bsr16:  if src == 0 then {dst is unchanged}
                              else 63 - Clz64(32Uto64(16Uto32(src)))
   */

   /* The main computation, guarding against zero. */
   assign( dst64,
           IRExpr_Mux0X(
              mkexpr(src8),
              /* src == 0 -- leave dst unchanged */
              widenUto64( getIRegG( sz, pfx, modrm ) ),
              /* src != 0 */
              fwds ? unop(Iop_Ctz64, mkexpr(src64))
                   : binop(Iop_Sub64,
                           mkU64(63),
                           unop(Iop_Clz64, mkexpr(src64)))
           )
         );

   /* Narrow the 64-bit result back to the operand size. */
   if (sz == 2)
      assign( dst, unop(Iop_64to16, mkexpr(dst64)) );
   else
   if (sz == 4)
      assign( dst, unop(Iop_64to32, mkexpr(dst64)) );
   else
      assign( dst, mkexpr(dst64) );

   /* dump result back */
   putIRegG( sz, pfx, modrm, mkexpr(dst) );

   return delta;
}
   7868 
   7869 
   7870 /* swap rAX with the reg specified by reg and REX.B */
   7871 static
   7872 void codegen_xchg_rAX_Reg ( Prefix pfx, Int sz, UInt regLo3 )
   7873 {
   7874    IRType ty = szToITy(sz);
   7875    IRTemp t1 = newTemp(ty);
   7876    IRTemp t2 = newTemp(ty);
   7877    vassert(sz == 2 || sz == 4 || sz == 8);
   7878    vassert(regLo3 < 8);
   7879    if (sz == 8) {
   7880       assign( t1, getIReg64(R_RAX) );
   7881       assign( t2, getIRegRexB(8, pfx, regLo3) );
   7882       putIReg64( R_RAX, mkexpr(t2) );
   7883       putIRegRexB(8, pfx, regLo3, mkexpr(t1) );
   7884    } else if (sz == 4) {
   7885       assign( t1, getIReg32(R_RAX) );
   7886       assign( t2, getIRegRexB(4, pfx, regLo3) );
   7887       putIReg32( R_RAX, mkexpr(t2) );
   7888       putIRegRexB(4, pfx, regLo3, mkexpr(t1) );
   7889    } else {
   7890       assign( t1, getIReg16(R_RAX) );
   7891       assign( t2, getIRegRexB(2, pfx, regLo3) );
   7892       putIReg16( R_RAX, mkexpr(t2) );
   7893       putIRegRexB(2, pfx, regLo3, mkexpr(t1) );
   7894    }
   7895    DIP("xchg%c %s, %s\n",
   7896        nameISize(sz), nameIRegRAX(sz),
   7897                       nameIRegRexB(sz,pfx, regLo3));
   7898 }
   7899 
   7900 
   7901 static
   7902 void codegen_SAHF ( void )
   7903 {
   7904    /* Set the flags to:
   7905       (amd64g_calculate_flags_all() & AMD64G_CC_MASK_O)
   7906                                     -- retain the old O flag
   7907       | (%AH & (AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
   7908                 |AMD64G_CC_MASK_P|AMD64G_CC_MASK_C)
   7909    */
   7910    ULong  mask_SZACP = AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
   7911                        |AMD64G_CC_MASK_C|AMD64G_CC_MASK_P;
   7912    IRTemp oldflags   = newTemp(Ity_I64);
   7913    assign( oldflags, mk_amd64g_calculate_rflags_all() );
   7914    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   7915    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   7916    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   7917    stmt( IRStmt_Put( OFFB_CC_DEP1,
   7918          binop(Iop_Or64,
   7919                binop(Iop_And64, mkexpr(oldflags), mkU64(AMD64G_CC_MASK_O)),
   7920                binop(Iop_And64,
   7921                      binop(Iop_Shr64, getIReg64(R_RAX), mkU8(8)),
   7922                      mkU64(mask_SZACP))
   7923               )
   7924    ));
   7925 }
   7926 
   7927 
   7928 static
   7929 void codegen_LAHF ( void  )
   7930 {
   7931    /* AH <- EFLAGS(SF:ZF:0:AF:0:PF:1:CF) */
   7932    IRExpr* rax_with_hole;
   7933    IRExpr* new_byte;
   7934    IRExpr* new_rax;
   7935    ULong   mask_SZACP = AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
   7936                         |AMD64G_CC_MASK_C|AMD64G_CC_MASK_P;
   7937 
   7938    IRTemp  flags = newTemp(Ity_I64);
   7939    assign( flags, mk_amd64g_calculate_rflags_all() );
   7940 
   7941    rax_with_hole
   7942       = binop(Iop_And64, getIReg64(R_RAX), mkU64(~0xFF00ULL));
   7943    new_byte
   7944       = binop(Iop_Or64, binop(Iop_And64, mkexpr(flags), mkU64(mask_SZACP)),
   7945                         mkU64(1<<1));
   7946    new_rax
   7947       = binop(Iop_Or64, rax_with_hole,
   7948                         binop(Iop_Shl64, new_byte, mkU8(8)));
   7949    putIReg64(R_RAX, new_rax);
   7950 }
   7951 
   7952 
/* Handle CMPXCHG E,G: compare rAX with E; if equal, ZF=1 and E <- G,
   else ZF=0 and rAX <- E.  Returns the updated delta; *ok is set to
   True on a successful decode (this implementation always succeeds
   for the forms it is handed). */
static
ULong dis_cmpxchg_G_E ( /*OUT*/Bool* ok,
                        VexAbiInfo*  vbi,
                        Prefix       pfx,
                        Int          size,
                        Long         delta0 )
{
   HChar dis_buf[50];
   Int   len;

   IRType ty    = szToITy(size);
   IRTemp acc   = newTemp(ty);      /* rAX (the expected value) */
   IRTemp src   = newTemp(ty);      /* G (the replacement value) */
   IRTemp dest  = newTemp(ty);      /* old value of E */
   IRTemp dest2 = newTemp(ty);      /* new value of E */
   IRTemp acc2  = newTemp(ty);      /* new value of rAX */
   IRTemp cond8 = newTemp(Ity_I8);  /* nonzero iff acc == dest */
   IRTemp addr  = IRTemp_INVALID;
   UChar  rm    = getUChar(delta0);

   /* There are 3 cases to consider:

      reg-reg: ignore any lock prefix, generate sequence based
               on Mux0X

      reg-mem, not locked: ignore any lock prefix, generate sequence
                           based on Mux0X

      reg-mem, locked: use IRCAS
   */

   if (epartIsReg(rm)) {
      /* case 1 */
      assign( dest, getIRegE(size, pfx, rm) );
      delta0++;
      assign( src, getIRegG(size, pfx, rm) );
      assign( acc, getIRegRAX(size) );
      /* Flags come from the comparison acc - dest. */
      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
      assign( cond8, unop(Iop_1Uto8, mk_amd64g_calculate_condition(AMD64CondZ)) );
      /* On match, E gets src and rAX is (re)written unchanged;
         otherwise both get the old E value. */
      assign( dest2, IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(src)) );
      assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
      putIRegRAX(size, mkexpr(acc2));
      putIRegE(size, pfx, rm, mkexpr(dest2));
      DIP("cmpxchg%c %s,%s\n", nameISize(size),
                               nameIRegG(size,pfx,rm),
                               nameIRegE(size,pfx,rm) );
   }
   else if (!epartIsReg(rm) && !(pfx & PFX_LOCK)) {
      /* case 2 */
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign( dest, loadLE(ty, mkexpr(addr)) );
      delta0 += len;
      assign( src, getIRegG(size, pfx, rm) );
      assign( acc, getIRegRAX(size) );
      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
      assign( cond8, unop(Iop_1Uto8, mk_amd64g_calculate_condition(AMD64CondZ)) );
      assign( dest2, IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(src)) );
      assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
      putIRegRAX(size, mkexpr(acc2));
      /* Non-atomic store; LOCK prefix absent, so that is fine. */
      storeLE( mkexpr(addr), mkexpr(dest2) );
      DIP("cmpxchg%c %s,%s\n", nameISize(size),
                               nameIRegG(size,pfx,rm), dis_buf);
   }
   else if (!epartIsReg(rm) && (pfx & PFX_LOCK)) {
      /* case 3 */
      /* src is new value.  acc is expected value.  dest is old value.
         Compute success from the output of the IRCAS, and steer the
         new value for RAX accordingly: in case of success, RAX is
         unchanged. */
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      delta0 += len;
      assign( src, getIRegG(size, pfx, rm) );
      assign( acc, getIRegRAX(size) );
      /* Atomic compare-and-swap: dest receives the old memory value. */
      stmt( IRStmt_CAS(
         mkIRCAS( IRTemp_INVALID, dest, Iend_LE, mkexpr(addr),
                  NULL, mkexpr(acc), NULL, mkexpr(src) )
      ));
      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
      assign( cond8, unop(Iop_1Uto8, mk_amd64g_calculate_condition(AMD64CondZ)) );
      assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
      putIRegRAX(size, mkexpr(acc2));
      DIP("cmpxchg%c %s,%s\n", nameISize(size),
                               nameIRegG(size,pfx,rm), dis_buf);
   }
   else vassert(0);

   *ok = True;
   return delta0;
}
   8042 
   8043 
   8044 /* Handle conditional move instructions of the form
   8045       cmovcc E(reg-or-mem), G(reg)
   8046 
   8047    E(src) is reg-or-mem
   8048    G(dst) is reg.
   8049 
   8050    If E is reg, -->    GET %E, tmps
   8051                        GET %G, tmpd
   8052                        CMOVcc tmps, tmpd
   8053                        PUT tmpd, %G
   8054 
   8055    If E is mem  -->    (getAddr E) -> tmpa
   8056                        LD (tmpa), tmps
   8057                        GET %G, tmpd
   8058                        CMOVcc tmps, tmpd
   8059                        PUT tmpd, %G
   8060 */
   8061 static
   8062 ULong dis_cmov_E_G ( VexAbiInfo* vbi,
   8063                      Prefix        pfx,
   8064                      Int           sz,
   8065                      AMD64Condcode cond,
   8066                      Long          delta0 )
   8067 {
   8068    UChar rm  = getUChar(delta0);
   8069    HChar dis_buf[50];
   8070    Int   len;
   8071 
   8072    IRType ty   = szToITy(sz);
   8073    IRTemp tmps = newTemp(ty);
   8074    IRTemp tmpd = newTemp(ty);
   8075 
   8076    if (epartIsReg(rm)) {
   8077       assign( tmps, getIRegE(sz, pfx, rm) );
   8078       assign( tmpd, getIRegG(sz, pfx, rm) );
   8079 
   8080       putIRegG( sz, pfx, rm,
   8081                 IRExpr_Mux0X( unop(Iop_1Uto8,
   8082                                    mk_amd64g_calculate_condition(cond)),
   8083                               mkexpr(tmpd),
   8084                               mkexpr(tmps) )
   8085               );
   8086       DIP("cmov%s %s,%s\n", name_AMD64Condcode(cond),
   8087                             nameIRegE(sz,pfx,rm),
   8088                             nameIRegG(sz,pfx,rm));
   8089       return 1+delta0;
   8090    }
   8091 
   8092    /* E refers to memory */
   8093    {
   8094       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   8095       assign( tmps, loadLE(ty, mkexpr(addr)) );
   8096       assign( tmpd, getIRegG(sz, pfx, rm) );
   8097 
   8098       putIRegG( sz, pfx, rm,
   8099                 IRExpr_Mux0X( unop(Iop_1Uto8,
   8100                                    mk_amd64g_calculate_condition(cond)),
   8101                               mkexpr(tmpd),
   8102                               mkexpr(tmps) )
   8103               );
   8104 
   8105       DIP("cmov%s %s,%s\n", name_AMD64Condcode(cond),
   8106                             dis_buf,
   8107                             nameIRegG(sz,pfx,rm));
   8108       return len+delta0;
   8109    }
   8110 }
   8111 
   8112 
   8113 static
   8114 ULong dis_xadd_G_E ( /*OUT*/Bool* decode_ok,
   8115                      VexAbiInfo* vbi,
   8116                      Prefix pfx, Int sz, Long delta0 )
   8117 {
   8118    Int   len;
   8119    UChar rm = getUChar(delta0);
   8120    HChar dis_buf[50];
   8121 
   8122    IRType ty    = szToITy(sz);
   8123    IRTemp tmpd  = newTemp(ty);
   8124    IRTemp tmpt0 = newTemp(ty);
   8125    IRTemp tmpt1 = newTemp(ty);
   8126 
   8127    /* There are 3 cases to consider:
   8128 
   8129       reg-reg: ignore any lock prefix,
   8130                generate 'naive' (non-atomic) sequence
   8131 
   8132       reg-mem, not locked: ignore any lock prefix, generate 'naive'
   8133                            (non-atomic) sequence
   8134 
   8135       reg-mem, locked: use IRCAS
   8136    */
   8137 
   8138    if (epartIsReg(rm)) {
   8139       /* case 1 */
   8140       assign( tmpd, getIRegE(sz, pfx, rm) );
   8141       assign( tmpt0, getIRegG(sz, pfx, rm) );
   8142       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
   8143                            mkexpr(tmpd), mkexpr(tmpt0)) );
   8144       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
   8145       putIRegG(sz, pfx, rm, mkexpr(tmpd));
   8146       putIRegE(sz, pfx, rm, mkexpr(tmpt1));
   8147       DIP("xadd%c %s, %s\n",
   8148           nameISize(sz), nameIRegG(sz,pfx,rm),
   8149           				 nameIRegE(sz,pfx,rm));
   8150       *decode_ok = True;
   8151       return 1+delta0;
   8152    }
   8153    else if (!epartIsReg(rm) && !(pfx & PFX_LOCK)) {
   8154       /* case 2 */
   8155       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   8156       assign( tmpd,  loadLE(ty, mkexpr(addr)) );
   8157       assign( tmpt0, getIRegG(sz, pfx, rm) );
   8158       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
   8159                            mkexpr(tmpd), mkexpr(tmpt0)) );
   8160       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
   8161       storeLE( mkexpr(addr), mkexpr(tmpt1) );
   8162       putIRegG(sz, pfx, rm, mkexpr(tmpd));
   8163       DIP("xadd%c %s, %s\n",
   8164           nameISize(sz), nameIRegG(sz,pfx,rm), dis_buf);
   8165       *decode_ok = True;
   8166       return len+delta0;
   8167    }
   8168    else if (!epartIsReg(rm) && (pfx & PFX_LOCK)) {
   8169       /* case 3 */
   8170       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   8171       assign( tmpd,  loadLE(ty, mkexpr(addr)) );
   8172       assign( tmpt0, getIRegG(sz, pfx, rm) );
   8173       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
   8174                            mkexpr(tmpd), mkexpr(tmpt0)) );
   8175       casLE( mkexpr(addr), mkexpr(tmpd)/*expVal*/,
   8176                            mkexpr(tmpt1)/*newVal*/, guest_RIP_curr_instr );
   8177       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
   8178       putIRegG(sz, pfx, rm, mkexpr(tmpd));
   8179       DIP("xadd%c %s, %s\n",
   8180           nameISize(sz), nameIRegG(sz,pfx,rm), dis_buf);
   8181       *decode_ok = True;
   8182       return len+delta0;
   8183    }
   8184    /*UNREACHED*/
   8185    vassert(0);
   8186 }
   8187 
   8188 //.. /* Move 16 bits from Ew (ireg or mem) to G (a segment register). */
   8189 //..
   8190 //.. static
   8191 //.. UInt dis_mov_Ew_Sw ( UChar sorb, Long delta0 )
   8192 //.. {
   8193 //..    Int    len;
   8194 //..    IRTemp addr;
   8195 //..    UChar  rm  = getUChar(delta0);
   8196 //..    HChar  dis_buf[50];
   8197 //..
   8198 //..    if (epartIsReg(rm)) {
   8199 //..       putSReg( gregOfRM(rm), getIReg(2, eregOfRM(rm)) );
   8200 //..       DIP("movw %s,%s\n", nameIReg(2,eregOfRM(rm)), nameSReg(gregOfRM(rm)));
   8201 //..       return 1+delta0;
   8202 //..    } else {
   8203 //..       addr = disAMode ( &len, sorb, delta0, dis_buf );
   8204 //..       putSReg( gregOfRM(rm), loadLE(Ity_I16, mkexpr(addr)) );
   8205 //..       DIP("movw %s,%s\n", dis_buf, nameSReg(gregOfRM(rm)));
   8206 //..       return len+delta0;
   8207 //..    }
   8208 //.. }
   8209 //..
   8210 //.. /* Move 16 bits from G (a segment register) to Ew (ireg or mem).  If
   8211 //..    dst is ireg and sz==4, zero out top half of it.  */
   8212 //..
   8213 //.. static
   8214 //.. UInt dis_mov_Sw_Ew ( UChar sorb,
   8215 //..                      Int   sz,
   8216 //..                      UInt  delta0 )
   8217 //.. {
   8218 //..    Int    len;
   8219 //..    IRTemp addr;
   8220 //..    UChar  rm  = getUChar(delta0);
   8221 //..    HChar  dis_buf[50];
   8222 //..
   8223 //..    vassert(sz == 2 || sz == 4);
   8224 //..
   8225 //..    if (epartIsReg(rm)) {
   8226 //..       if (sz == 4)
   8227 //..          putIReg(4, eregOfRM(rm), unop(Iop_16Uto32, getSReg(gregOfRM(rm))));
   8228 //..       else
   8229 //..          putIReg(2, eregOfRM(rm), getSReg(gregOfRM(rm)));
   8230 //..
   8231 //..       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), nameIReg(sz,eregOfRM(rm)));
   8232 //..       return 1+delta0;
   8233 //..    } else {
   8234 //..       addr = disAMode ( &len, sorb, delta0, dis_buf );
   8235 //..       storeLE( mkexpr(addr), getSReg(gregOfRM(rm)) );
   8236 //..       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), dis_buf);
   8237 //..       return len+delta0;
   8238 //..    }
   8239 //.. }
   8240 //..
   8241 //..
   8242 //.. static
   8243 //.. void dis_push_segreg ( UInt sreg, Int sz )
   8244 //.. {
   8245 //..     IRTemp t1 = newTemp(Ity_I16);
   8246 //..     IRTemp ta = newTemp(Ity_I32);
   8247 //..     vassert(sz == 2 || sz == 4);
   8248 //..
   8249 //..     assign( t1, getSReg(sreg) );
   8250 //..     assign( ta, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)) );
   8251 //..     putIReg(4, R_ESP, mkexpr(ta));
   8252 //..     storeLE( mkexpr(ta), mkexpr(t1) );
   8253 //..
   8254 //..     DIP("pushw %s\n", nameSReg(sreg));
   8255 //.. }
   8256 //..
   8257 //.. static
   8258 //.. void dis_pop_segreg ( UInt sreg, Int sz )
   8259 //.. {
   8260 //..     IRTemp t1 = newTemp(Ity_I16);
   8261 //..     IRTemp ta = newTemp(Ity_I32);
   8262 //..     vassert(sz == 2 || sz == 4);
   8263 //..
   8264 //..     assign( ta, getIReg(4, R_ESP) );
   8265 //..     assign( t1, loadLE(Ity_I16, mkexpr(ta)) );
   8266 //..
   8267 //..     putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(ta), mkU32(sz)) );
   8268 //..     putSReg( sreg, mkexpr(t1) );
   8269 //..     DIP("pop %s\n", nameSReg(sreg));
   8270 //.. }
   8271 
   8272 static
   8273 void dis_ret ( /*MOD*/DisResult* dres, VexAbiInfo* vbi, ULong d64 )
   8274 {
   8275    IRTemp t1 = newTemp(Ity_I64);
   8276    IRTemp t2 = newTemp(Ity_I64);
   8277    IRTemp t3 = newTemp(Ity_I64);
   8278    assign(t1, getIReg64(R_RSP));
   8279    assign(t2, loadLE(Ity_I64,mkexpr(t1)));
   8280    assign(t3, binop(Iop_Add64, mkexpr(t1), mkU64(8+d64)));
   8281    putIReg64(R_RSP, mkexpr(t3));
   8282    make_redzone_AbiHint(vbi, t3, t2/*nia*/, "ret");
   8283    jmp_treg(dres, Ijk_Ret, t2);
   8284    vassert(dres->whatNext == Dis_StopHere);
   8285 }
   8286 
   8287 
   8288 /*------------------------------------------------------------*/
   8289 /*--- SSE/SSE2/SSE3 helpers                                ---*/
   8290 /*------------------------------------------------------------*/
   8291 
   8292 /* Worker function; do not call directly.
   8293    Handles full width G = G `op` E   and   G = (not G) `op` E.
   8294 */
   8295 
   8296 static ULong dis_SSE_E_to_G_all_wrk (
   8297                 VexAbiInfo* vbi,
   8298                 Prefix pfx, Long delta,
   8299                 HChar* opname, IROp op,
   8300                 Bool   invertG
   8301              )
   8302 {
   8303    HChar   dis_buf[50];
   8304    Int     alen;
   8305    IRTemp  addr;
   8306    UChar   rm = getUChar(delta);
   8307    IRExpr* gpart
   8308       = invertG ? unop(Iop_NotV128, getXMMReg(gregOfRexRM(pfx,rm)))
   8309                 : getXMMReg(gregOfRexRM(pfx,rm));
   8310    if (epartIsReg(rm)) {
   8311       putXMMReg( gregOfRexRM(pfx,rm),
   8312                  binop(op, gpart,
   8313                            getXMMReg(eregOfRexRM(pfx,rm))) );
   8314       DIP("%s %s,%s\n", opname,
   8315                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8316                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8317       return delta+1;
   8318    } else {
   8319       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8320       putXMMReg( gregOfRexRM(pfx,rm),
   8321                  binop(op, gpart,
   8322                            loadLE(Ity_V128, mkexpr(addr))) );
   8323       DIP("%s %s,%s\n", opname,
   8324                         dis_buf,
   8325                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8326       return delta+alen;
   8327    }
   8328 }
   8329 
   8330 
/* All lanes SSE binary operation, G = G `op` E. */

static
ULong dis_SSE_E_to_G_all ( VexAbiInfo* vbi,
                           Prefix pfx, Long delta,
                           HChar* opname, IROp op )
{
   /* Thin wrapper: invertG=False gives plain G = G `op` E. */
   return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, False );
}
   8340 
/* All lanes SSE binary operation, G = (not G) `op` E. */

static
ULong dis_SSE_E_to_G_all_invG ( VexAbiInfo* vbi,
                                Prefix pfx, Long delta,
                                HChar* opname, IROp op )
{
   /* Thin wrapper: invertG=True complements G before the op
      (used e.g. for andn-style operations). */
   return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, True );
}
   8350 
   8351 
   8352 /* Lowest 32-bit lane only SSE binary operation, G = G `op` E. */
   8353 
   8354 static ULong dis_SSE_E_to_G_lo32 ( VexAbiInfo* vbi,
   8355                                    Prefix pfx, Long delta,
   8356                                    HChar* opname, IROp op )
   8357 {
   8358    HChar   dis_buf[50];
   8359    Int     alen;
   8360    IRTemp  addr;
   8361    UChar   rm = getUChar(delta);
   8362    IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
   8363    if (epartIsReg(rm)) {
   8364       putXMMReg( gregOfRexRM(pfx,rm),
   8365                  binop(op, gpart,
   8366                            getXMMReg(eregOfRexRM(pfx,rm))) );
   8367       DIP("%s %s,%s\n", opname,
   8368                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8369                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8370       return delta+1;
   8371    } else {
   8372       /* We can only do a 32-bit memory read, so the upper 3/4 of the
   8373          E operand needs to be made simply of zeroes. */
   8374       IRTemp epart = newTemp(Ity_V128);
   8375       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8376       assign( epart, unop( Iop_32UtoV128,
   8377                            loadLE(Ity_I32, mkexpr(addr))) );
   8378       putXMMReg( gregOfRexRM(pfx,rm),
   8379                  binop(op, gpart, mkexpr(epart)) );
   8380       DIP("%s %s,%s\n", opname,
   8381                         dis_buf,
   8382                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8383       return delta+alen;
   8384    }
   8385 }
   8386 
   8387 
   8388 /* Lower 64-bit lane only SSE binary operation, G = G `op` E. */
   8389 
   8390 static ULong dis_SSE_E_to_G_lo64 ( VexAbiInfo* vbi,
   8391                                    Prefix pfx, Long delta,
   8392                                    HChar* opname, IROp op )
   8393 {
   8394    HChar   dis_buf[50];
   8395    Int     alen;
   8396    IRTemp  addr;
   8397    UChar   rm = getUChar(delta);
   8398    IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
   8399    if (epartIsReg(rm)) {
   8400       putXMMReg( gregOfRexRM(pfx,rm),
   8401                  binop(op, gpart,
   8402                            getXMMReg(eregOfRexRM(pfx,rm))) );
   8403       DIP("%s %s,%s\n", opname,
   8404                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8405                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8406       return delta+1;
   8407    } else {
   8408       /* We can only do a 64-bit memory read, so the upper half of the
   8409          E operand needs to be made simply of zeroes. */
   8410       IRTemp epart = newTemp(Ity_V128);
   8411       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8412       assign( epart, unop( Iop_64UtoV128,
   8413                            loadLE(Ity_I64, mkexpr(addr))) );
   8414       putXMMReg( gregOfRexRM(pfx,rm),
   8415                  binop(op, gpart, mkexpr(epart)) );
   8416       DIP("%s %s,%s\n", opname,
   8417                         dis_buf,
   8418                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8419       return delta+alen;
   8420    }
   8421 }
   8422 
   8423 
   8424 /* All lanes unary SSE operation, G = op(E). */
   8425 
   8426 static ULong dis_SSE_E_to_G_unary_all (
   8427                 VexAbiInfo* vbi,
   8428                 Prefix pfx, Long delta,
   8429                 HChar* opname, IROp op
   8430              )
   8431 {
   8432    HChar   dis_buf[50];
   8433    Int     alen;
   8434    IRTemp  addr;
   8435    UChar   rm = getUChar(delta);
   8436    if (epartIsReg(rm)) {
   8437       putXMMReg( gregOfRexRM(pfx,rm),
   8438                  unop(op, getXMMReg(eregOfRexRM(pfx,rm))) );
   8439       DIP("%s %s,%s\n", opname,
   8440                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8441                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8442       return delta+1;
   8443    } else {
   8444       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8445       putXMMReg( gregOfRexRM(pfx,rm),
   8446                  unop(op, loadLE(Ity_V128, mkexpr(addr))) );
   8447       DIP("%s %s,%s\n", opname,
   8448                         dis_buf,
   8449                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8450       return delta+alen;
   8451    }
   8452 }
   8453 
   8454 
   8455 /* Lowest 32-bit lane only unary SSE operation, G = op(E). */
   8456 
   8457 static ULong dis_SSE_E_to_G_unary_lo32 (
   8458                 VexAbiInfo* vbi,
   8459                 Prefix pfx, Long delta,
   8460                 HChar* opname, IROp op
   8461              )
   8462 {
   8463    /* First we need to get the old G value and patch the low 32 bits
   8464       of the E operand into it.  Then apply op and write back to G. */
   8465    HChar   dis_buf[50];
   8466    Int     alen;
   8467    IRTemp  addr;
   8468    UChar   rm = getUChar(delta);
   8469    IRTemp  oldG0 = newTemp(Ity_V128);
   8470    IRTemp  oldG1 = newTemp(Ity_V128);
   8471 
   8472    assign( oldG0, getXMMReg(gregOfRexRM(pfx,rm)) );
   8473 
   8474    if (epartIsReg(rm)) {
   8475       assign( oldG1,
   8476               binop( Iop_SetV128lo32,
   8477                      mkexpr(oldG0),
   8478                      getXMMRegLane32(eregOfRexRM(pfx,rm), 0)) );
   8479       putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
   8480       DIP("%s %s,%s\n", opname,
   8481                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8482                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8483       return delta+1;
   8484    } else {
   8485       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8486       assign( oldG1,
   8487               binop( Iop_SetV128lo32,
   8488                      mkexpr(oldG0),
   8489                      loadLE(Ity_I32, mkexpr(addr)) ));
   8490       putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
   8491       DIP("%s %s,%s\n", opname,
   8492                         dis_buf,
   8493                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8494       return delta+alen;
   8495    }
   8496 }
   8497 
   8498 
   8499 /* Lowest 64-bit lane only unary SSE operation, G = op(E). */
   8500 
   8501 static ULong dis_SSE_E_to_G_unary_lo64 (
   8502                 VexAbiInfo* vbi,
   8503                 Prefix pfx, Long delta,
   8504                 HChar* opname, IROp op
   8505              )
   8506 {
   8507    /* First we need to get the old G value and patch the low 64 bits
   8508       of the E operand into it.  Then apply op and write back to G. */
   8509    HChar   dis_buf[50];
   8510    Int     alen;
   8511    IRTemp  addr;
   8512    UChar   rm = getUChar(delta);
   8513    IRTemp  oldG0 = newTemp(Ity_V128);
   8514    IRTemp  oldG1 = newTemp(Ity_V128);
   8515 
   8516    assign( oldG0, getXMMReg(gregOfRexRM(pfx,rm)) );
   8517 
   8518    if (epartIsReg(rm)) {
   8519       assign( oldG1,
   8520               binop( Iop_SetV128lo64,
   8521                      mkexpr(oldG0),
   8522                      getXMMRegLane64(eregOfRexRM(pfx,rm), 0)) );
   8523       putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
   8524       DIP("%s %s,%s\n", opname,
   8525                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8526                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8527       return delta+1;
   8528    } else {
   8529       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8530       assign( oldG1,
   8531               binop( Iop_SetV128lo64,
   8532                      mkexpr(oldG0),
   8533                      loadLE(Ity_I64, mkexpr(addr)) ));
   8534       putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
   8535       DIP("%s %s,%s\n", opname,
   8536                         dis_buf,
   8537                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8538       return delta+alen;
   8539    }
   8540 }
   8541 
   8542 
   8543 /* SSE integer binary operation:
   8544       G = G `op` E   (eLeft == False)
   8545       G = E `op` G   (eLeft == True)
   8546 */
   8547 static ULong dis_SSEint_E_to_G(
   8548                 VexAbiInfo* vbi,
   8549                 Prefix pfx, Long delta,
   8550                 HChar* opname, IROp op,
   8551                 Bool   eLeft
   8552              )
   8553 {
   8554    HChar   dis_buf[50];
   8555    Int     alen;
   8556    IRTemp  addr;
   8557    UChar   rm = getUChar(delta);
   8558    IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
   8559    IRExpr* epart = NULL;
   8560    if (epartIsReg(rm)) {
   8561       epart = getXMMReg(eregOfRexRM(pfx,rm));
   8562       DIP("%s %s,%s\n", opname,
   8563                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8564                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8565       delta += 1;
   8566    } else {
   8567       addr  = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8568       epart = loadLE(Ity_V128, mkexpr(addr));
   8569       DIP("%s %s,%s\n", opname,
   8570                         dis_buf,
   8571                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8572       delta += alen;
   8573    }
   8574    putXMMReg( gregOfRexRM(pfx,rm),
   8575               eLeft ? binop(op, epart, gpart)
   8576 	            : binop(op, gpart, epart) );
   8577    return delta;
   8578 }
   8579 
   8580 
/* Helper for doing SSE FP comparisons.  False return ==> unhandled.
   This is all a bit of a kludge in that it ignores the subtleties of
   ordered-vs-unordered and signalling-vs-nonsignalling in the Intel
   spec.

   Decodes an SSE/AVX comparison-predicate immediate (imm8) into a
   triple:
     *preSwapP -- swap the two operands before applying *opP
     *opP      -- the IR comparison op to use
     *postNotP -- bitwise-invert the comparison result afterwards
   sz (4 or 8) is the lane width in bytes; all_lanes selects the
   full-vector vs lowest-lane-only form of the op. */
static Bool findSSECmpOp ( /*OUT*/Bool* preSwapP,
                           /*OUT*/IROp* opP,
                           /*OUT*/Bool* postNotP,
                           UInt imm8, Bool all_lanes, Int sz )
{
   if (imm8 >= 32) return False;

   /* First, compute a (preSwap, op, postNot) triple from
      the supplied imm8.  The table is written using the Iop_Cmp*32Fx4
      ops only; the result is converted to the required width/laneage
      afterwards. */
   Bool pre = False;
   IROp op  = Iop_INVALID;
   Bool not = False;

#  define XXX(_pre, _op, _not) { pre = _pre; op = _op; not = _not; }
   // If you add a case here, add a corresponding test for both VCMPSD_128
   // and VCMPSS_128 in avx-1.c.
   switch (imm8) {
      // "O" = ordered, "U" = unordered
      // "Q" = non-signalling (quiet), "S" = signalling
      //
      //             swap operands?
      //             |
      //             |      cmp op          invert after?
      //             |      |               |
      //             v      v               v
      case 0x0:  XXX(False, Iop_CmpEQ32Fx4, False); break; // EQ_OQ
      case 0x1:  XXX(False, Iop_CmpLT32Fx4, False); break; // LT_OS
      case 0x2:  XXX(False, Iop_CmpLE32Fx4, False); break; // LE_OS
      case 0x3:  XXX(False, Iop_CmpUN32Fx4, False); break; // UNORD_Q
      case 0x4:  XXX(False, Iop_CmpEQ32Fx4, True);  break; // NEQ_UQ
      case 0x5:  XXX(False, Iop_CmpLT32Fx4, True);  break; // NLT_US
      case 0x6:  XXX(False, Iop_CmpLE32Fx4, True);  break; // NLE_US
      case 0x7:  XXX(False, Iop_CmpUN32Fx4, True);  break; // ORD_Q
      // 0x8  EQ_UQ
      case 0x9:  XXX(True,  Iop_CmpLE32Fx4, True);  break; // NGE_US
      /* "Enhanced Comparison Predicate[s] for VEX-Encoded [insns] */
      case 0xA:  XXX(True,  Iop_CmpLT32Fx4, True);  break; // NGT_US
      // 0xB  FALSE_OQ
      // 0xC: this isn't really right because it returns all-1s when
      // either operand is a NaN, and it should return all-0s.
      case 0xC:  XXX(False, Iop_CmpEQ32Fx4, True);  break; // NEQ_OQ
      case 0xD:  XXX(True,  Iop_CmpLE32Fx4, False); break; // GE_OS
      case 0xE:  XXX(True,  Iop_CmpLT32Fx4, False); break; // GT_OS
      // 0xF  TRUE_UQ
      // 0x10  EQ_OS
      case 0x11: XXX(False, Iop_CmpLT32Fx4, False); break; // LT_OQ
      case 0x12: XXX(False, Iop_CmpLE32Fx4, False); break; // LE_OQ
      // 0x13  UNORD_S
      // 0x14  NEQ_US
      // 0x15  NLT_UQ
      case 0x16: XXX(False, Iop_CmpLE32Fx4, True);  break; // NLE_UQ
      // 0x17  ORD_S
      // 0x18  EQ_US
      // 0x19  NGE_UQ
      // 0x1A  NGT_UQ
      // 0x1B  FALSE_OS
      // 0x1C  NEQ_OS
      // 0x1D  GE_OQ
      case 0x1E: XXX(True,  Iop_CmpLT32Fx4, False); break; // GT_OQ
      // 0x1F  TRUE_US
      /* Don't forget to add test cases to VCMPSS_128_<imm8> in
         avx-1.c if new cases turn up. */
      default: break;
   }
#  undef XXX
   /* Unhandled predicate: tell the caller to give up. */
   if (op == Iop_INVALID) return False;

   /* Now convert the op into one with the same arithmetic but that is
      correct for the width and laneage requirements. */

   /* 32-bit lanes, all lanes: the table entries are already in this
      form, but the switch still asserts op is one of the expected
      four comparisons. */
   /**/ if (sz == 4 && all_lanes) {
      switch (op) {
         case Iop_CmpEQ32Fx4: op = Iop_CmpEQ32Fx4; break;
         case Iop_CmpLT32Fx4: op = Iop_CmpLT32Fx4; break;
         case Iop_CmpLE32Fx4: op = Iop_CmpLE32Fx4; break;
         case Iop_CmpUN32Fx4: op = Iop_CmpUN32Fx4; break;
         default: vassert(0);
      }
   }
   /* 32-bit lanes, lowest lane only (scalar compare). */
   else if (sz == 4 && !all_lanes) {
      switch (op) {
         case Iop_CmpEQ32Fx4: op = Iop_CmpEQ32F0x4; break;
         case Iop_CmpLT32Fx4: op = Iop_CmpLT32F0x4; break;
         case Iop_CmpLE32Fx4: op = Iop_CmpLE32F0x4; break;
         case Iop_CmpUN32Fx4: op = Iop_CmpUN32F0x4; break;
         default: vassert(0);
      }
   }
   /* 64-bit lanes, all lanes. */
   else if (sz == 8 && all_lanes) {
      switch (op) {
         case Iop_CmpEQ32Fx4: op = Iop_CmpEQ64Fx2; break;
         case Iop_CmpLT32Fx4: op = Iop_CmpLT64Fx2; break;
         case Iop_CmpLE32Fx4: op = Iop_CmpLE64Fx2; break;
         case Iop_CmpUN32Fx4: op = Iop_CmpUN64Fx2; break;
         default: vassert(0);
      }
   }
   /* 64-bit lanes, lowest lane only (scalar compare). */
   else if (sz == 8 && !all_lanes) {
      switch (op) {
         case Iop_CmpEQ32Fx4: op = Iop_CmpEQ64F0x2; break;
         case Iop_CmpLT32Fx4: op = Iop_CmpLT64F0x2; break;
         case Iop_CmpLE32Fx4: op = Iop_CmpLE64F0x2; break;
         case Iop_CmpUN32Fx4: op = Iop_CmpUN64F0x2; break;
         default: vassert(0);
      }
   }
   else {
      vpanic("findSSECmpOp(amd64,guest)");
   }

   *preSwapP = pre; *opP = op; *postNotP = not;
   return True;
}
   8698 
   8699 
/* Handles SSE 32F/64F comparisons.  It can fail, in which case it
   returns the original delta to indicate failure.

   The imm8 predicate byte is decoded by findSSECmpOp; only values
   below 8 (the pre-AVX predicates) are accepted here. */

static Long dis_SSE_cmp_E_to_G ( VexAbiInfo* vbi,
                                 Prefix pfx, Long delta,
                                 HChar* opname, Bool all_lanes, Int sz )
{
   Long    delta0 = delta;
   HChar   dis_buf[50];
   Int     alen;
   UInt    imm8;
   IRTemp  addr;
   Bool    preSwap = False;
   IROp    op      = Iop_INVALID;
   Bool    postNot = False;
   IRTemp  plain   = newTemp(Ity_V128);
   UChar   rm      = getUChar(delta);
   UShort  mask    = 0;
   vassert(sz == 4 || sz == 8);
   if (epartIsReg(rm)) {
      /* Register-register form: imm8 follows the modrm byte. */
      imm8 = getUChar(delta+1);
      if (imm8 >= 8) return delta0; /* FAIL */
      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
      if (!ok) return delta0; /* FAIL */
      vassert(!preSwap); /* never needed for imm8 < 8 */
      assign( plain, binop(op, getXMMReg(gregOfRexRM(pfx,rm)),
                               getXMMReg(eregOfRexRM(pfx,rm))) );
      delta += 2;
      DIP("%s $%d,%s,%s\n", opname,
                            (Int)imm8,
                            nameXMMReg(eregOfRexRM(pfx,rm)),
                            nameXMMReg(gregOfRexRM(pfx,rm)) );
   } else {
      /* Memory form: imm8 follows the address mode bytes. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
      imm8 = getUChar(delta+alen);
      if (imm8 >= 8) return delta0; /* FAIL */
      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
      if (!ok) return delta0; /* FAIL */
      vassert(!preSwap); /* never needed for imm8 < 8 */
      /* For the scalar (!all_lanes) forms only the low 4 or 8 bytes
         of E matter, so load just those and zero-extend to V128. */
      assign( plain,
              binop(
                 op,
                 getXMMReg(gregOfRexRM(pfx,rm)),
                   all_lanes
                      ? loadLE(Ity_V128, mkexpr(addr))
                   : sz == 8
                      ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
                   : /*sz==4*/
                      unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)))
	      )
      );
      delta += alen+1;
      DIP("%s $%d,%s,%s\n", opname,
                            (Int)imm8,
                            dis_buf,
                            nameXMMReg(gregOfRexRM(pfx,rm)) );
   }

   /* Apply the post-inversion: all 128 bits for whole-vector
      compares, just the low lane for scalar compares. */
   if (postNot && all_lanes) {
      putXMMReg( gregOfRexRM(pfx,rm),
                 unop(Iop_NotV128, mkexpr(plain)) );
   }
   else
   if (postNot && !all_lanes) {
      /* mkV128 mask: one mask bit per result byte; select the low
         4 (sz==4) or 8 (sz==8) bytes for inversion via XOR. */
      mask = toUShort(sz==4 ? 0x000F : 0x00FF);
      putXMMReg( gregOfRexRM(pfx,rm),
                 binop(Iop_XorV128, mkexpr(plain), mkV128(mask)) );
   }
   else {
      putXMMReg( gregOfRexRM(pfx,rm), mkexpr(plain) );
   }

   return delta;
}
   8774 
   8775 
   8776 /* Vector by scalar shift of G by the amount specified at the bottom
   8777    of E. */
   8778 
   8779 static ULong dis_SSE_shiftG_byE ( VexAbiInfo* vbi,
   8780                                   Prefix pfx, Long delta,
   8781                                   HChar* opname, IROp op )
   8782 {
   8783    HChar   dis_buf[50];
   8784    Int     alen, size;
   8785    IRTemp  addr;
   8786    Bool    shl, shr, sar;
   8787    UChar   rm   = getUChar(delta);
   8788    IRTemp  g0   = newTemp(Ity_V128);
   8789    IRTemp  g1   = newTemp(Ity_V128);
   8790    IRTemp  amt  = newTemp(Ity_I64);
   8791    IRTemp  amt8 = newTemp(Ity_I8);
   8792    if (epartIsReg(rm)) {
   8793       assign( amt, getXMMRegLane64(eregOfRexRM(pfx,rm), 0) );
   8794       DIP("%s %s,%s\n", opname,
   8795                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8796                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8797       delta++;
   8798    } else {
   8799       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8800       assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
   8801       DIP("%s %s,%s\n", opname,
   8802                         dis_buf,
   8803                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8804       delta += alen;
   8805    }
   8806    assign( g0,   getXMMReg(gregOfRexRM(pfx,rm)) );
   8807    assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
   8808 
   8809    shl = shr = sar = False;
   8810    size = 0;
   8811    switch (op) {
   8812       case Iop_ShlN16x8: shl = True; size = 32; break;
   8813       case Iop_ShlN32x4: shl = True; size = 32; break;
   8814       case Iop_ShlN64x2: shl = True; size = 64; break;
   8815       case Iop_SarN16x8: sar = True; size = 16; break;
   8816       case Iop_SarN32x4: sar = True; size = 32; break;
   8817       case Iop_ShrN16x8: shr = True; size = 16; break;
   8818       case Iop_ShrN32x4: shr = True; size = 32; break;
   8819       case Iop_ShrN64x2: shr = True; size = 64; break;
   8820       default: vassert(0);
   8821    }
   8822 
   8823    if (shl || shr) {
   8824      assign(
   8825         g1,
   8826         IRExpr_Mux0X(
   8827            unop(Iop_1Uto8,
   8828                 binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size))),
   8829            mkV128(0x0000),
   8830            binop(op, mkexpr(g0), mkexpr(amt8))
   8831         )
   8832      );
   8833    } else
   8834    if (sar) {
   8835      assign(
   8836         g1,
   8837         IRExpr_Mux0X(
   8838            unop(Iop_1Uto8,
   8839                 binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size))),
   8840            binop(op, mkexpr(g0), mkU8(size-1)),
   8841            binop(op, mkexpr(g0), mkexpr(amt8))
   8842         )
   8843      );
   8844    } else {
   8845       vassert(0);
   8846    }
   8847 
   8848    putXMMReg( gregOfRexRM(pfx,rm), mkexpr(g1) );
   8849    return delta;
   8850 }
   8851 
   8852 
   8853 /* Vector by scalar shift of E by an immediate byte. */
   8854 
   8855 static
   8856 ULong dis_SSE_shiftE_imm ( Prefix pfx,
   8857                            Long delta, HChar* opname, IROp op )
   8858 {
   8859    Bool    shl, shr, sar;
   8860    UChar   rm   = getUChar(delta);
   8861    IRTemp  e0   = newTemp(Ity_V128);
   8862    IRTemp  e1   = newTemp(Ity_V128);
   8863    UChar   amt, size;
   8864    vassert(epartIsReg(rm));
   8865    vassert(gregLO3ofRM(rm) == 2
   8866            || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
   8867    amt = getUChar(delta+1);
   8868    delta += 2;
   8869    DIP("%s $%d,%s\n", opname,
   8870                       (Int)amt,
   8871                       nameXMMReg(eregOfRexRM(pfx,rm)) );
   8872    assign( e0, getXMMReg(eregOfRexRM(pfx,rm)) );
   8873 
   8874    shl = shr = sar = False;
   8875    size = 0;
   8876    switch (op) {
   8877       case Iop_ShlN16x8: shl = True; size = 16; break;
   8878       case Iop_ShlN32x4: shl = True; size = 32; break;
   8879       case Iop_ShlN64x2: shl = True; size = 64; break;
   8880       case Iop_SarN16x8: sar = True; size = 16; break;
   8881       case Iop_SarN32x4: sar = True; size = 32; break;
   8882       case Iop_ShrN16x8: shr = True; size = 16; break;
   8883       case Iop_ShrN32x4: shr = True; size = 32; break;
   8884       case Iop_ShrN64x2: shr = True; size = 64; break;
   8885       default: vassert(0);
   8886    }
   8887 
   8888    if (shl || shr) {
   8889      assign( e1, amt >= size
   8890                     ? mkV128(0x0000)
   8891                     : binop(op, mkexpr(e0), mkU8(amt))
   8892      );
   8893    } else
   8894    if (sar) {
   8895      assign( e1, amt >= size
   8896                     ? binop(op, mkexpr(e0), mkU8(size-1))
   8897                     : binop(op, mkexpr(e0), mkU8(amt))
   8898      );
   8899    } else {
   8900       vassert(0);
   8901    }
   8902 
   8903    putXMMReg( eregOfRexRM(pfx,rm), mkexpr(e1) );
   8904    return delta;
   8905 }
   8906 
   8907 
   8908 /* Get the current SSE rounding mode. */
   8909 
   8910 static IRExpr* /* :: Ity_I32 */ get_sse_roundingmode ( void )
   8911 {
   8912    return
   8913       unop( Iop_64to32,
   8914             binop( Iop_And64,
   8915                    IRExpr_Get( OFFB_SSEROUND, Ity_I64 ),
   8916                    mkU64(3) ));
   8917 }
   8918 
   8919 static void put_sse_roundingmode ( IRExpr* sseround )
   8920 {
   8921    vassert(typeOfIRExpr(irsb->tyenv, sseround) == Ity_I32);
   8922    stmt( IRStmt_Put( OFFB_SSEROUND,
   8923                      unop(Iop_32Uto64,sseround) ) );
   8924 }
   8925 
   8926 /* Break a V128-bit value up into four 32-bit ints. */
   8927 
   8928 static void breakupV128to32s ( IRTemp t128,
   8929                                /*OUTs*/
   8930                                IRTemp* t3, IRTemp* t2,
   8931                                IRTemp* t1, IRTemp* t0 )
   8932 {
   8933    IRTemp hi64 = newTemp(Ity_I64);
   8934    IRTemp lo64 = newTemp(Ity_I64);
   8935    assign( hi64, unop(Iop_V128HIto64, mkexpr(t128)) );
   8936    assign( lo64, unop(Iop_V128to64,   mkexpr(t128)) );
   8937 
   8938    vassert(t0 && *t0 == IRTemp_INVALID);
   8939    vassert(t1 && *t1 == IRTemp_INVALID);
   8940    vassert(t2 && *t2 == IRTemp_INVALID);
   8941    vassert(t3 && *t3 == IRTemp_INVALID);
   8942 
   8943    *t0 = newTemp(Ity_I32);
   8944    *t1 = newTemp(Ity_I32);
   8945    *t2 = newTemp(Ity_I32);
   8946    *t3 = newTemp(Ity_I32);
   8947    assign( *t0, unop(Iop_64to32,   mkexpr(lo64)) );
   8948    assign( *t1, unop(Iop_64HIto32, mkexpr(lo64)) );
   8949    assign( *t2, unop(Iop_64to32,   mkexpr(hi64)) );
   8950    assign( *t3, unop(Iop_64HIto32, mkexpr(hi64)) );
   8951 }
   8952 
   8953 /* Construct a V128-bit value from four 32-bit ints. */
   8954 
   8955 static IRExpr* mkV128from32s ( IRTemp t3, IRTemp t2,
   8956                                IRTemp t1, IRTemp t0 )
   8957 {
   8958    return
   8959       binop( Iop_64HLtoV128,
   8960              binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
   8961              binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0))
   8962    );
   8963 }
   8964 
   8965 /* Break a 64-bit value up into four 16-bit ints. */
   8966 
   8967 static void breakup64to16s ( IRTemp t64,
   8968                              /*OUTs*/
   8969                              IRTemp* t3, IRTemp* t2,
   8970                              IRTemp* t1, IRTemp* t0 )
   8971 {
   8972    IRTemp hi32 = newTemp(Ity_I32);
   8973    IRTemp lo32 = newTemp(Ity_I32);
   8974    assign( hi32, unop(Iop_64HIto32, mkexpr(t64)) );
   8975    assign( lo32, unop(Iop_64to32,   mkexpr(t64)) );
   8976 
   8977    vassert(t0 && *t0 == IRTemp_INVALID);
   8978    vassert(t1 && *t1 == IRTemp_INVALID);
   8979    vassert(t2 && *t2 == IRTemp_INVALID);
   8980    vassert(t3 && *t3 == IRTemp_INVALID);
   8981 
   8982    *t0 = newTemp(Ity_I16);
   8983    *t1 = newTemp(Ity_I16);
   8984    *t2 = newTemp(Ity_I16);
   8985    *t3 = newTemp(Ity_I16);
   8986    assign( *t0, unop(Iop_32to16,   mkexpr(lo32)) );
   8987    assign( *t1, unop(Iop_32HIto16, mkexpr(lo32)) );
   8988    assign( *t2, unop(Iop_32to16,   mkexpr(hi32)) );
   8989    assign( *t3, unop(Iop_32HIto16, mkexpr(hi32)) );
   8990 }
   8991 
   8992 /* Construct a 64-bit value from four 16-bit ints. */
   8993 
   8994 static IRExpr* mk64from16s ( IRTemp t3, IRTemp t2,
   8995                              IRTemp t1, IRTemp t0 )
   8996 {
   8997    return
   8998       binop( Iop_32HLto64,
   8999              binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)),
   9000              binop(Iop_16HLto32, mkexpr(t1), mkexpr(t0))
   9001    );
   9002 }
   9003 
   9004 /* Break a V256-bit value up into four 64-bit ints. */
   9005 
   9006 static void breakupV256to64s ( IRTemp t256,
   9007                                /*OUTs*/
   9008                                IRTemp* t3, IRTemp* t2,
   9009                                IRTemp* t1, IRTemp* t0 )
   9010 {
   9011    vassert(t0 && *t0 == IRTemp_INVALID);
   9012    vassert(t1 && *t1 == IRTemp_INVALID);
   9013    vassert(t2 && *t2 == IRTemp_INVALID);
   9014    vassert(t3 && *t3 == IRTemp_INVALID);
   9015    *t0 = newTemp(Ity_I64);
   9016    *t1 = newTemp(Ity_I64);
   9017    *t2 = newTemp(Ity_I64);
   9018    *t3 = newTemp(Ity_I64);
   9019    assign( *t0, unop(Iop_V256to64_0, mkexpr(t256)) );
   9020    assign( *t1, unop(Iop_V256to64_1, mkexpr(t256)) );
   9021    assign( *t2, unop(Iop_V256to64_2, mkexpr(t256)) );
   9022    assign( *t3, unop(Iop_V256to64_3, mkexpr(t256)) );
   9023 }
   9024 
   9025 /* Break a V256-bit value up into two V128s. */
   9026 
   9027 static void breakupV256toV128s ( IRTemp t256,
   9028                                  /*OUTs*/
   9029                                  IRTemp* t1, IRTemp* t0 )
   9030 {
   9031    vassert(t0 && *t0 == IRTemp_INVALID);
   9032    vassert(t1 && *t1 == IRTemp_INVALID);
   9033    *t0 = newTemp(Ity_V128);
   9034    *t1 = newTemp(Ity_V128);
   9035    assign(*t1, unop(Iop_V256toV128_1, mkexpr(t256)));
   9036    assign(*t0, unop(Iop_V256toV128_0, mkexpr(t256)));
   9037 }
   9038 
   9039 /* Break a V256-bit value up into eight 32-bit ints.  */
   9040 
   9041 static void breakupV256to32s ( IRTemp t256,
   9042                                /*OUTs*/
   9043                                IRTemp* t7, IRTemp* t6,
   9044                                IRTemp* t5, IRTemp* t4,
   9045                                IRTemp* t3, IRTemp* t2,
   9046                                IRTemp* t1, IRTemp* t0 )
   9047 {
   9048    IRTemp t128_1 = IRTemp_INVALID;
   9049    IRTemp t128_0 = IRTemp_INVALID;
   9050    breakupV256toV128s( t256, &t128_1, &t128_0 );
   9051    breakupV128to32s( t128_1, t7, t6, t5, t4 );
   9052    breakupV128to32s( t128_0, t3, t2, t1, t0 );
   9053 }
   9054 
   9055 /* Break a V128-bit value up into two 64-bit ints. */
   9056 
   9057 static void breakupV128to64s ( IRTemp t128,
   9058                                /*OUTs*/
   9059                                IRTemp* t1, IRTemp* t0 )
   9060 {
   9061    vassert(t0 && *t0 == IRTemp_INVALID);
   9062    vassert(t1 && *t1 == IRTemp_INVALID);
   9063    *t0 = newTemp(Ity_I64);
   9064    *t1 = newTemp(Ity_I64);
   9065    assign( *t0, unop(Iop_V128to64,   mkexpr(t128)) );
   9066    assign( *t1, unop(Iop_V128HIto64, mkexpr(t128)) );
   9067 }
   9068 
   9069 /* Construct a V256-bit value from eight 32-bit ints. */
   9070 
   9071 static IRExpr* mkV256from32s ( IRTemp t7, IRTemp t6,
   9072                                IRTemp t5, IRTemp t4,
   9073                                IRTemp t3, IRTemp t2,
   9074                                IRTemp t1, IRTemp t0 )
   9075 {
   9076    return
   9077       binop( Iop_V128HLtoV256,
   9078              binop( Iop_64HLtoV128,
   9079                     binop(Iop_32HLto64, mkexpr(t7), mkexpr(t6)),
   9080                     binop(Iop_32HLto64, mkexpr(t5), mkexpr(t4)) ),
   9081              binop( Iop_64HLtoV128,
   9082                     binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
   9083                     binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0)) )
   9084    );
   9085 }
   9086 
   9087 /* Construct a V256-bit value from four 64-bit ints. */
   9088 
   9089 static IRExpr* mkV256from64s ( IRTemp t3, IRTemp t2,
   9090                                IRTemp t1, IRTemp t0 )
   9091 {
   9092    return
   9093       binop( Iop_V128HLtoV256,
   9094              binop(Iop_64HLtoV128, mkexpr(t3), mkexpr(t2)),
   9095              binop(Iop_64HLtoV128, mkexpr(t1), mkexpr(t0))
   9096    );
   9097 }
   9098 
/* Helper for the SSSE3 (not SSE3) PMULHRSW insns.  Given two 64-bit
   values (aa,bb), computes, for each of the 4 16-bit lanes:

   (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1
*/
static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx )
{
   IRTemp aa      = newTemp(Ity_I64);
   IRTemp bb      = newTemp(Ity_I64);
   IRTemp aahi32s = newTemp(Ity_I64);
   IRTemp aalo32s = newTemp(Ity_I64);
   IRTemp bbhi32s = newTemp(Ity_I64);
   IRTemp bblo32s = newTemp(Ity_I64);
   IRTemp rHi     = newTemp(Ity_I64);
   IRTemp rLo     = newTemp(Ity_I64);
   IRTemp one32x2 = newTemp(Ity_I64);
   assign(aa, aax);
   assign(bb, bbx);
   /* Sign-extend each 16-bit lane of aa and bb to 32 bits: interleave
      a lane with itself (putting it in the high half of a 32-bit
      pair) and arithmetic-shift right by 16.  hi32s holds the upper
      two lanes, lo32s the lower two. */
   assign( aahi32s,
           binop(Iop_SarN32x2,
                 binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)),
                 mkU8(16) ));
   assign( aalo32s,
           binop(Iop_SarN32x2,
                 binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)),
                 mkU8(16) ));
   assign( bbhi32s,
           binop(Iop_SarN32x2,
                 binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)),
                 mkU8(16) ));
   assign( bblo32s,
           binop(Iop_SarN32x2,
                 binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)),
                 mkU8(16) ));
   /* Constant 1 in each 32-bit lane, for the rounding add. */
   assign(one32x2, mkU64( (1ULL << 32) + 1 ));
   /* Per 32-bit lane: ((aa*bb >>u 14) + 1) >>u 1. */
   assign(
      rHi,
      binop(
         Iop_ShrN32x2,
         binop(
            Iop_Add32x2,
            binop(
               Iop_ShrN32x2,
               binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)),
               mkU8(14)
            ),
            mkexpr(one32x2)
         ),
         mkU8(1)
      )
   );
   assign(
      rLo,
      binop(
         Iop_ShrN32x2,
         binop(
            Iop_Add32x2,
            binop(
               Iop_ShrN32x2,
               binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)),
               mkU8(14)
            ),
            mkexpr(one32x2)
         ),
         mkU8(1)
      )
   );
   /* Take the low 16 bits of each 32-bit result and repack into
      four 16-bit lanes. */
   return
      binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo));
}
   9169 
   9170 /* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns.  Given two 64-bit
   9171    values (aa,bb), computes, for each lane:
   9172 
   9173           if aa_lane < 0 then - bb_lane
   9174      else if aa_lane > 0 then bb_lane
   9175      else 0
   9176 */
   9177 static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB )
   9178 {
   9179    IRTemp aa       = newTemp(Ity_I64);
   9180    IRTemp bb       = newTemp(Ity_I64);
   9181    IRTemp zero     = newTemp(Ity_I64);
   9182    IRTemp bbNeg    = newTemp(Ity_I64);
   9183    IRTemp negMask  = newTemp(Ity_I64);
   9184    IRTemp posMask  = newTemp(Ity_I64);
   9185    IROp   opSub    = Iop_INVALID;
   9186    IROp   opCmpGTS = Iop_INVALID;
   9187 
   9188    switch (laneszB) {
   9189       case 1: opSub = Iop_Sub8x8;  opCmpGTS = Iop_CmpGT8Sx8;  break;
   9190       case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break;
   9191       case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break;
   9192       default: vassert(0);
   9193    }
   9194 
   9195    assign( aa,      aax );
   9196    assign( bb,      bbx );
   9197    assign( zero,    mkU64(0) );
   9198    assign( bbNeg,   binop(opSub,    mkexpr(zero), mkexpr(bb)) );
   9199    assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) );
   9200    assign( posMask, binop(opCmpGTS, mkexpr(aa),   mkexpr(zero)) );
   9201 
   9202    return
   9203       binop(Iop_Or64,
   9204             binop(Iop_And64, mkexpr(bb),    mkexpr(posMask)),
   9205             binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) );
   9206 
   9207 }
   9208 
   9209 
   9210 /* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns.  Given a 64-bit
   9211    value aa, computes, for each lane
   9212 
   9213    if aa < 0 then -aa else aa
   9214 
   9215    Note that the result is interpreted as unsigned, so that the
   9216    absolute value of the most negative signed input can be
   9217    represented.
   9218 */
   9219 static IRTemp math_PABS_MMX ( IRTemp aa, Int laneszB )
   9220 {
   9221    IRTemp res     = newTemp(Ity_I64);
   9222    IRTemp zero    = newTemp(Ity_I64);
   9223    IRTemp aaNeg   = newTemp(Ity_I64);
   9224    IRTemp negMask = newTemp(Ity_I64);
   9225    IRTemp posMask = newTemp(Ity_I64);
   9226    IROp   opSub   = Iop_INVALID;
   9227    IROp   opSarN  = Iop_INVALID;
   9228 
   9229    switch (laneszB) {
   9230       case 1: opSub = Iop_Sub8x8;  opSarN = Iop_SarN8x8;  break;
   9231       case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break;
   9232       case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break;
   9233       default: vassert(0);
   9234    }
   9235 
   9236    assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) );
   9237    assign( posMask, unop(Iop_Not64, mkexpr(negMask)) );
   9238    assign( zero,    mkU64(0) );
   9239    assign( aaNeg,   binop(opSub, mkexpr(zero), mkexpr(aa)) );
   9240    assign( res,
   9241            binop(Iop_Or64,
   9242                  binop(Iop_And64, mkexpr(aa),    mkexpr(posMask)),
   9243                  binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) ));
   9244    return res;
   9245 }
   9246 
   9247 /* XMM version of math_PABS_MMX. */
   9248 static IRTemp math_PABS_XMM ( IRTemp aa, Int laneszB )
   9249 {
   9250    IRTemp res  = newTemp(Ity_V128);
   9251    IRTemp aaHi = newTemp(Ity_I64);
   9252    IRTemp aaLo = newTemp(Ity_I64);
   9253    assign(aaHi, unop(Iop_V128HIto64, mkexpr(aa)));
   9254    assign(aaLo, unop(Iop_V128to64, mkexpr(aa)));
   9255    assign(res, binop(Iop_64HLtoV128,
   9256                      mkexpr(math_PABS_MMX(aaHi, laneszB)),
   9257                      mkexpr(math_PABS_MMX(aaLo, laneszB))));
   9258    return res;
   9259 }
   9260 
/* Specialisations of math_PABS_XMM, since there's no easy way to do
   partial applications in C :-( */
static IRTemp math_PABS_XMM_pap4 ( IRTemp aa ) {
   /* 32-bit lanes. */
   return math_PABS_XMM(aa, 4);
}
   9266 
static IRTemp math_PABS_XMM_pap2 ( IRTemp aa ) {
   /* 16-bit lanes. */
   return math_PABS_XMM(aa, 2);
}
   9270 
static IRTemp math_PABS_XMM_pap1 ( IRTemp aa ) {
   /* 8-bit lanes. */
   return math_PABS_XMM(aa, 1);
}
   9274 
   9275 static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64,
   9276                                         IRTemp lo64, Long byteShift )
   9277 {
   9278    vassert(byteShift >= 1 && byteShift <= 7);
   9279    return
   9280       binop(Iop_Or64,
   9281             binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))),
   9282             binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift))
   9283       );
   9284 }
   9285 
/* PALIGNR for XMM operands: concatenate dV (high) with sV (low) and
   shift the 256-bit intermediate right by imm8 bytes, returning the
   low 128 bits.  Implemented case-by-case on the four 64-bit
   quarters dHi:dLo:sHi:sLo. */
static IRTemp math_PALIGNR_XMM ( IRTemp sV, IRTemp dV, UInt imm8 )
{
   IRTemp res = newTemp(Ity_V128);
   IRTemp sHi = newTemp(Ity_I64);
   IRTemp sLo = newTemp(Ity_I64);
   IRTemp dHi = newTemp(Ity_I64);
   IRTemp dLo = newTemp(Ity_I64);
   IRTemp rHi = newTemp(Ity_I64);
   IRTemp rLo = newTemp(Ity_I64);

   assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

   if (imm8 == 0) {
      /* No shift: result is sV unchanged. */
      assign( rHi, mkexpr(sHi) );
      assign( rLo, mkexpr(sLo) );
   }
   else if (imm8 >= 1 && imm8 <= 7) {
      /* Shift within sV, pulling bytes down from dLo. */
      assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, imm8) );
      assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, imm8) );
   }
   else if (imm8 == 8) {
      /* Exactly one 64-bit quarter of shift. */
      assign( rHi, mkexpr(dLo) );
      assign( rLo, mkexpr(sHi) );
   }
   else if (imm8 >= 9 && imm8 <= 15) {
      assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, imm8-8) );
      assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, imm8-8) );
   }
   else if (imm8 == 16) {
      /* Exactly two quarters: result is dV unchanged. */
      assign( rHi, mkexpr(dHi) );
      assign( rLo, mkexpr(dLo) );
   }
   else if (imm8 >= 17 && imm8 <= 23) {
      /* Only dV bytes remain; top bits start filling with zero. */
      assign( rHi, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(imm8-16))) );
      assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, imm8-16) );
   }
   else if (imm8 == 24) {
      assign( rHi, mkU64(0) );
      assign( rLo, mkexpr(dHi) );
   }
   else if (imm8 >= 25 && imm8 <= 31) {
      assign( rHi, mkU64(0) );
      assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(imm8-24))) );
   }
   else if (imm8 >= 32 && imm8 <= 255) {
      /* Shifted entirely out: result is zero. */
      assign( rHi, mkU64(0) );
      assign( rLo, mkU64(0) );
   }
   else
      vassert(0);

   assign( res, binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo)));
   return res;
}
   9343 
   9344 
   9345 /* Generate a SIGSEGV followed by a restart of the current instruction
   9346    if effective_addr is not 16-aligned.  This is required behaviour
   9347    for some SSE3 instructions and all 128-bit SSSE3 instructions.
   9348    This assumes that guest_RIP_curr_instr is set correctly! */
   9349 static
   9350 void gen_SEGV_if_not_XX_aligned ( IRTemp effective_addr, ULong mask )
   9351 {
   9352    stmt(
   9353       IRStmt_Exit(
   9354          binop(Iop_CmpNE64,
   9355                binop(Iop_And64,mkexpr(effective_addr),mkU64(mask)),
   9356                mkU64(0)),
   9357          Ijk_SigSEGV,
   9358          IRConst_U64(guest_RIP_curr_instr),
   9359          OFFB_RIP
   9360       )
   9361    );
   9362 }
   9363 
   9364 static void gen_SEGV_if_not_16_aligned ( IRTemp effective_addr ) {
   9365    gen_SEGV_if_not_XX_aligned(effective_addr, 16-1);
   9366 }
   9367 
   9368 static void gen_SEGV_if_not_32_aligned ( IRTemp effective_addr ) {
   9369    gen_SEGV_if_not_XX_aligned(effective_addr, 32-1);
   9370 }
   9371 
   9372 /* Helper for deciding whether a given insn (starting at the opcode
   9373    byte) may validly be used with a LOCK prefix.  The following insns
   9374    may be used with LOCK when their destination operand is in memory.
   9375    AFAICS this is exactly the same for both 32-bit and 64-bit mode.
   9376 
   9377    ADD        80 /0,  81 /0,  82 /0,  83 /0,  00,  01
   9378    OR         80 /1,  81 /1,  82 /x,  83 /1,  08,  09
   9379    ADC        80 /2,  81 /2,  82 /2,  83 /2,  10,  11
   SBB        80 /3,  81 /3,  82 /x,  83 /3,  18,  19
   9381    AND        80 /4,  81 /4,  82 /x,  83 /4,  20,  21
   9382    SUB        80 /5,  81 /5,  82 /x,  83 /5,  28,  29
   9383    XOR        80 /6,  81 /6,  82 /x,  83 /6,  30,  31
   9384 
   9385    DEC        FE /1,  FF /1
   9386    INC        FE /0,  FF /0
   9387 
   9388    NEG        F6 /3,  F7 /3
   9389    NOT        F6 /2,  F7 /2
   9390 
   9391    XCHG       86, 87
   9392 
   9393    BTC        0F BB,  0F BA /7
   9394    BTR        0F B3,  0F BA /6
   9395    BTS        0F AB,  0F BA /5
   9396 
   9397    CMPXCHG    0F B0,  0F B1
   9398    CMPXCHG8B  0F C7 /1
   9399 
   9400    XADD       0F C0,  0F C1
   9401 
   9402    ------------------------------
   9403 
   9404    80 /0  =  addb $imm8,  rm8
   9405    81 /0  =  addl $imm32, rm32  and  addw $imm16, rm16
   9406    82 /0  =  addb $imm8,  rm8
   9407    83 /0  =  addl $simm8, rm32  and  addw $simm8, rm16
   9408 
   9409    00     =  addb r8,  rm8
   9410    01     =  addl r32, rm32  and  addw r16, rm16
   9411 
   9412    Same for ADD OR ADC SBB AND SUB XOR
   9413 
   9414    FE /1  = dec rm8
   9415    FF /1  = dec rm32  and  dec rm16
   9416 
   9417    FE /0  = inc rm8
   9418    FF /0  = inc rm32  and  inc rm16
   9419 
   9420    F6 /3  = neg rm8
   9421    F7 /3  = neg rm32  and  neg rm16
   9422 
   9423    F6 /2  = not rm8
   9424    F7 /2  = not rm32  and  not rm16
   9425 
   9426    0F BB     = btcw r16, rm16    and  btcl r32, rm32
   0F BA /7  = btcw $imm8, rm16  and  btcl $imm8, rm32
   9428 
   9429    Same for BTS, BTR
   9430 */
   9431 static Bool can_be_used_with_LOCK_prefix ( UChar* opc )
   9432 {
   9433    switch (opc[0]) {
   9434       case 0x00: case 0x01: case 0x08: case 0x09:
   9435       case 0x10: case 0x11: case 0x18: case 0x19:
   9436       case 0x20: case 0x21: case 0x28: case 0x29:
   9437       case 0x30: case 0x31:
   9438          if (!epartIsReg(opc[1]))
   9439             return True;
   9440          break;
   9441 
   9442       case 0x80: case 0x81: case 0x82: case 0x83:
   9443          if (gregLO3ofRM(opc[1]) >= 0 && gregLO3ofRM(opc[1]) <= 6
   9444              && !epartIsReg(opc[1]))
   9445             return True;
   9446          break;
   9447 
   9448       case 0xFE: case 0xFF:
   9449          if (gregLO3ofRM(opc[1]) >= 0 && gregLO3ofRM(opc[1]) <= 1
   9450              && !epartIsReg(opc[1]))
   9451             return True;
   9452          break;
   9453 
   9454       case 0xF6: case 0xF7:
   9455          if (gregLO3ofRM(opc[1]) >= 2 && gregLO3ofRM(opc[1]) <= 3
   9456              && !epartIsReg(opc[1]))
   9457             return True;
   9458          break;
   9459 
   9460       case 0x86: case 0x87:
   9461          if (!epartIsReg(opc[1]))
   9462             return True;
   9463          break;
   9464 
   9465       case 0x0F: {
   9466          switch (opc[1]) {
   9467             case 0xBB: case 0xB3: case 0xAB:
   9468                if (!epartIsReg(opc[2]))
   9469                   return True;
   9470                break;
   9471             case 0xBA:
   9472                if (gregLO3ofRM(opc[2]) >= 5 && gregLO3ofRM(opc[2]) <= 7
   9473                    && !epartIsReg(opc[2]))
   9474                   return True;
   9475                break;
   9476             case 0xB0: case 0xB1:
   9477                if (!epartIsReg(opc[2]))
   9478                   return True;
   9479                break;
   9480             case 0xC7:
   9481                if (gregLO3ofRM(opc[2]) == 1 && !epartIsReg(opc[2]) )
   9482                   return True;
   9483                break;
   9484             case 0xC0: case 0xC1:
   9485                if (!epartIsReg(opc[2]))
   9486                   return True;
   9487                break;
   9488             default:
   9489                break;
   9490          } /* switch (opc[1]) */
   9491          break;
   9492       }
   9493 
   9494       default:
   9495          break;
   9496    } /* switch (opc[0]) */
   9497 
   9498    return False;
   9499 }
   9500 
   9501 
   9502 /*------------------------------------------------------------*/
   9503 /*---                                                      ---*/
   9504 /*--- Top-level SSE/SSE2: dis_ESC_0F__SSE2                 ---*/
   9505 /*---                                                      ---*/
   9506 /*------------------------------------------------------------*/
   9507 
/* Disassemble (U)COMISD G,E: compare the low F64 lanes of the G xmm
   register and the E operand (xmm register or 64-bit memory) and set
   the guest condition codes from the result.  Returns the updated
   instruction offset 'delta'. */
static Long dis_COMISD ( VexAbiInfo* vbi, Prefix pfx,
                         Long delta, Bool isAvx, UChar opc )
{
   vassert(opc == 0x2F/*COMISD*/ || opc == 0x2E/*UCOMISD*/);
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp argL  = newTemp(Ity_F64);
   IRTemp argR  = newTemp(Ity_F64);
   UChar  modrm = getUChar(delta);
   IRTemp addr  = IRTemp_INVALID;
   if (epartIsReg(modrm)) {
      /* Register form: argR is the low lane of the E register. */
      assign( argR, getXMMRegLane64F( eregOfRexRM(pfx,modrm),
                                      0/*lowest lane*/ ) );
      delta += 1;
      DIP("%s%scomisd %s,%s\n", isAvx ? "v" : "",
                                opc==0x2E ? "u" : "",
                                nameXMMReg(eregOfRexRM(pfx,modrm)),
                                nameXMMReg(gregOfRexRM(pfx,modrm)) );
   } else {
      /* Memory form: argR is a 64-bit load from the E address. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argR, loadLE(Ity_F64, mkexpr(addr)) );
      delta += alen;
      DIP("%s%scomisd %s,%s\n", isAvx ? "v" : "",
                                opc==0x2E ? "u" : "",
                                dis_buf,
                                nameXMMReg(gregOfRexRM(pfx,modrm)) );
   }
   assign( argL, getXMMRegLane64F( gregOfRexRM(pfx,modrm),
                                   0/*lowest lane*/ ) );

   /* Write the flags "directly": with CC_OP_COPY, CC_DEP1 holds the
      flag bits themselves.  0x45 keeps only bits 0, 2 and 6 of the
      CmpF64 result (the C, P and Z flag positions). */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop( Iop_And64,
                   unop( Iop_32Uto64,
                         binop(Iop_CmpF64, mkexpr(argL), mkexpr(argR)) ),
                   mkU64(0x45)
       )));
   return delta;
}
   9549 
   9550 
/* Disassemble (U)COMISS G,E: single-precision analogue of
   dis_COMISD.  Both operands are widened to F64 before comparison.
   Returns the updated instruction offset 'delta'. */
static Long dis_COMISS ( VexAbiInfo* vbi, Prefix pfx,
                         Long delta, Bool isAvx, UChar opc )
{
   vassert(opc == 0x2F/*COMISS*/ || opc == 0x2E/*UCOMISS*/);
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp argL  = newTemp(Ity_F32);
   IRTemp argR  = newTemp(Ity_F32);
   UChar  modrm = getUChar(delta);
   IRTemp addr  = IRTemp_INVALID;
   if (epartIsReg(modrm)) {
      /* Register form: argR is the low lane of the E register. */
      assign( argR, getXMMRegLane32F( eregOfRexRM(pfx,modrm),
                                      0/*lowest lane*/ ) );
      delta += 1;
      DIP("%s%scomiss %s,%s\n", isAvx ? "v" : "",
                                opc==0x2E ? "u" : "",
                                nameXMMReg(eregOfRexRM(pfx,modrm)),
                                nameXMMReg(gregOfRexRM(pfx,modrm)) );
   } else {
      /* Memory form: argR is a 32-bit load from the E address. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argR, loadLE(Ity_F32, mkexpr(addr)) );
      delta += alen;
      DIP("%s%scomiss %s,%s\n", isAvx ? "v" : "",
                                opc==0x2E ? "u" : "",
                                dis_buf,
                                nameXMMReg(gregOfRexRM(pfx,modrm)) );
   }
   assign( argL, getXMMRegLane32F( gregOfRexRM(pfx,modrm),
                                   0/*lowest lane*/ ) );

   /* Write flags directly (CC_OP_COPY): compare at F64 precision and
      mask with 0x45 to keep only the C, P and Z flag positions. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop( Iop_And64,
                   unop( Iop_32Uto64,
                         binop(Iop_CmpF64,
                               unop(Iop_F32toF64,mkexpr(argL)),
                               unop(Iop_F32toF64,mkexpr(argR)))),
                   mkU64(0x45)
       )));
   return delta;
}
   9594 
   9595 
/* Disassemble PSHUFD/VPSHUFD: permute the four 32-bit lanes of the E
   operand according to the imm8 'order' byte and write the result to
   the G register.  When writesYmm is set, also zero the upper 128
   bits of the destination YMM register.  Returns the updated
   'delta'. */
static Long dis_PSHUFD_32x4 ( VexAbiInfo* vbi, Prefix pfx,
                              Long delta, Bool writesYmm )
{
   Int    order;
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp sV    = newTemp(Ity_V128);
   UChar  modrm = getUChar(delta);
   HChar* strV  = writesYmm ? "v" : "";
   IRTemp addr  = IRTemp_INVALID;
   if (epartIsReg(modrm)) {
      /* Register form: imm8 immediately follows the modrm byte. */
      assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
      order = (Int)getUChar(delta+1);
      delta += 1+1;
      DIP("%spshufd $%d,%s,%s\n", strV, order,
                                  nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   } else {
      /* Memory form: imm8 follows the amode bytes. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
                        1/*byte after the amode*/ );
      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
      order = (Int)getUChar(delta+alen);
      delta += alen+1;
      DIP("%spshufd $%d,%s,%s\n", strV, order,
                                 dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
   }

   /* Split the source into its four 32-bit lanes (s3 = highest). */
   IRTemp s3, s2, s1, s0;
   s3 = s2 = s1 = s0 = IRTemp_INVALID;
   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );

   /* SEL(n) picks source lane n; each 2-bit field of 'order' selects
      the source lane for one destination lane. */
#  define SEL(n)  ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   IRTemp dV = newTemp(Ity_V128);
   assign(dV,
          mkV128from32s( SEL((order>>6)&3), SEL((order>>4)&3),
                         SEL((order>>2)&3), SEL((order>>0)&3) )
   );
#  undef SEL

   /* AVX form zeroes the upper YMM lane; SSE form leaves it alone. */
   (writesYmm ? putYMMRegLoAndZU : putXMMReg)
      (gregOfRexRM(pfx,modrm), mkexpr(dV));
   return delta;
}
   9640 
   9641 
   9642 static IRTemp math_PSRLDQ ( IRTemp sV, Int imm )
   9643 {
   9644    IRTemp dV    = newTemp(Ity_V128);
   9645    IRTemp hi64  = newTemp(Ity_I64);
   9646    IRTemp lo64  = newTemp(Ity_I64);
   9647    IRTemp hi64r = newTemp(Ity_I64);
   9648    IRTemp lo64r = newTemp(Ity_I64);
   9649 
   9650    vassert(imm >= 0 && imm <= 255);
   9651    if (imm >= 16) {
   9652       assign(dV, mkV128(0x0000));
   9653       return dV;
   9654    }
   9655 
   9656    assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
   9657    assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
   9658 
   9659    if (imm == 0) {
   9660       assign( lo64r, mkexpr(lo64) );
   9661       assign( hi64r, mkexpr(hi64) );
   9662    }
   9663    else
   9664    if (imm == 8) {
   9665       assign( hi64r, mkU64(0) );
   9666       assign( lo64r, mkexpr(hi64) );
   9667    }
   9668    else
   9669    if (imm > 8) {
   9670       assign( hi64r, mkU64(0) );
   9671       assign( lo64r, binop( Iop_Shr64, mkexpr(hi64), mkU8( 8*(imm-8) ) ));
   9672    } else {
   9673       assign( hi64r, binop( Iop_Shr64, mkexpr(hi64), mkU8(8 * imm) ));
   9674       assign( lo64r,
   9675               binop( Iop_Or64,
   9676                      binop(Iop_Shr64, mkexpr(lo64),
   9677                            mkU8(8 * imm)),
   9678                      binop(Iop_Shl64, mkexpr(hi64),
   9679                            mkU8(8 * (8 - imm)) )
   9680                      )
   9681               );
   9682    }
   9683 
   9684    assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
   9685    return dV;
   9686 }
   9687 
   9688 
   9689 static IRTemp math_PSLLDQ ( IRTemp sV, Int imm )
   9690 {
   9691    IRTemp       dV    = newTemp(Ity_V128);
   9692    IRTemp       hi64  = newTemp(Ity_I64);
   9693    IRTemp       lo64  = newTemp(Ity_I64);
   9694    IRTemp       hi64r = newTemp(Ity_I64);
   9695    IRTemp       lo64r = newTemp(Ity_I64);
   9696 
   9697    vassert(imm >= 0 && imm <= 255);
   9698    if (imm >= 16) {
   9699       assign(dV, mkV128(0x0000));
   9700       return dV;
   9701    }
   9702 
   9703    assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
   9704    assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
   9705 
   9706    if (imm == 0) {
   9707       assign( lo64r, mkexpr(lo64) );
   9708       assign( hi64r, mkexpr(hi64) );
   9709    }
   9710    else
   9711    if (imm == 8) {
   9712       assign( lo64r, mkU64(0) );
   9713       assign( hi64r, mkexpr(lo64) );
   9714    }
   9715    else
   9716    if (imm > 8) {
   9717       assign( lo64r, mkU64(0) );
   9718       assign( hi64r, binop( Iop_Shl64, mkexpr(lo64), mkU8( 8*(imm-8) ) ));
   9719    } else {
   9720       assign( lo64r, binop( Iop_Shl64, mkexpr(lo64), mkU8(8 * imm) ));
   9721       assign( hi64r,
   9722               binop( Iop_Or64,
   9723                      binop(Iop_Shl64, mkexpr(hi64),
   9724                            mkU8(8 * imm)),
   9725                      binop(Iop_Shr64, mkexpr(lo64),
   9726                            mkU8(8 * (8 - imm)) )
   9727                      )
   9728               );
   9729    }
   9730 
   9731    assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
   9732    return dV;
   9733 }
   9734 
   9735 
/* Disassemble CVTSD2SI/CVTTSD2SI: convert the low F64 of the E
   operand (xmm reg or mem64) to a 32- or 64-bit signed integer in
   the G register.  'sz' selects the destination width; opc 0x2C
   (the "T" form) truncates towards zero, 0x2D uses the current SSE
   rounding mode.  Returns the updated 'delta'. */
static Long dis_CVTxSD2SI ( VexAbiInfo* vbi, Prefix pfx,
                            Long delta, Bool isAvx, UChar opc, Int sz )
{
   vassert(opc == 0x2D/*CVTSD2SI*/ || opc == 0x2C/*CVTTSD2SI*/);
   HChar  dis_buf[50];
   Int    alen   = 0;
   UChar  modrm  = getUChar(delta);
   IRTemp addr   = IRTemp_INVALID;
   IRTemp rmode  = newTemp(Ity_I32);
   IRTemp f64lo  = newTemp(Ity_F64);
   Bool   r2zero = toBool(opc == 0x2C);   /* round-to-zero variant? */

   if (epartIsReg(modrm)) {
      delta += 1;
      assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
      DIP("%scvt%ssd2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
                                  nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameIReg(sz, gregOfRexRM(pfx,modrm),
                                           False));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
      delta += alen;
      DIP("%scvt%ssd2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
                                  dis_buf,
                                  nameIReg(sz, gregOfRexRM(pfx,modrm),
                                           False));
   }

   /* "T" form always truncates; otherwise honour MXCSR rounding. */
   if (r2zero) {
      assign( rmode, mkU32((UInt)Irrm_ZERO) );
   } else {
      assign( rmode, get_sse_roundingmode() );
   }

   if (sz == 4) {
      putIReg32( gregOfRexRM(pfx,modrm),
                 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo)) );
   } else {
      vassert(sz == 8);
      putIReg64( gregOfRexRM(pfx,modrm),
                 binop( Iop_F64toI64S, mkexpr(rmode), mkexpr(f64lo)) );
   }

   return delta;
}
   9782 
   9783 
/* Disassemble CVTSS2SI/CVTTSS2SI: convert the low F32 of the E
   operand (xmm reg or mem32) to a 32- or 64-bit signed integer in
   the G register, widening to F64 first.  'sz' selects the
   destination width; opc 0x2C (the "T" form) truncates towards zero,
   0x2D uses the current SSE rounding mode.  Returns the updated
   'delta'. */
static Long dis_CVTxSS2SI ( VexAbiInfo* vbi, Prefix pfx,
                            Long delta, Bool isAvx, UChar opc, Int sz )
{
   vassert(opc == 0x2D/*CVTSS2SI*/ || opc == 0x2C/*CVTTSS2SI*/);
   HChar  dis_buf[50];
   Int    alen   = 0;
   UChar  modrm  = getUChar(delta);
   IRTemp addr   = IRTemp_INVALID;
   IRTemp rmode  = newTemp(Ity_I32);
   IRTemp f32lo  = newTemp(Ity_F32);
   Bool   r2zero = toBool(opc == 0x2C);   /* round-to-zero variant? */

   if (epartIsReg(modrm)) {
      delta += 1;
      assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
      DIP("%scvt%sss2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
                                  nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameIReg(sz, gregOfRexRM(pfx,modrm),
                                           False));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
      delta += alen;
      DIP("%scvt%sss2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
                                  dis_buf,
                                  nameIReg(sz, gregOfRexRM(pfx,modrm),
                                           False));
   }

   /* "T" form always truncates; otherwise honour MXCSR rounding. */
   if (r2zero) {
      assign( rmode, mkU32((UInt)Irrm_ZERO) );
   } else {
      assign( rmode, get_sse_roundingmode() );
   }

   /* Widen F32 -> F64 (exact), then convert to integer. */
   if (sz == 4) {
      putIReg32( gregOfRexRM(pfx,modrm),
                 binop( Iop_F64toI32S,
                        mkexpr(rmode),
                        unop(Iop_F32toF64, mkexpr(f32lo))) );
   } else {
      vassert(sz == 8);
      putIReg64( gregOfRexRM(pfx,modrm),
                 binop( Iop_F64toI64S,
                        mkexpr(rmode),
                        unop(Iop_F32toF64, mkexpr(f32lo))) );
   }

   return delta;
}
   9834 
   9835 
/* Disassemble (V)CVTPS2PD, 128-bit form: widen the two low F32 lanes
   of the E operand (xmm reg or mem64) to two F64 lanes in the G
   register.  The AVX form additionally zeroes the upper YMM lane.
   Returns the updated 'delta'. */
static Long dis_CVTPS2PD_128 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp f32lo = newTemp(Ity_F32);
   IRTemp f32hi = newTemp(Ity_F32);
   UChar  modrm = getUChar(delta);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( f32lo, getXMMRegLane32F(rE, 0) );
      assign( f32hi, getXMMRegLane32F(rE, 1) );
      delta += 1;
      DIP("%scvtps2pd %s,%s\n",
          isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
   } else {
      /* Memory form: two consecutive 32-bit loads. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( f32lo, loadLE(Ity_F32, mkexpr(addr)) );
      assign( f32hi, loadLE(Ity_F32,
                            binop(Iop_Add64,mkexpr(addr),mkU64(4))) );
      delta += alen;
      DIP("%scvtps2pd %s,%s\n",
          isAvx ? "v" : "", dis_buf, nameXMMReg(rG));
   }

   /* F32 -> F64 widening is exact; no rounding mode needed. */
   putXMMRegLane64F( rG, 1, unop(Iop_F32toF64, mkexpr(f32hi)) );
   putXMMRegLane64F( rG, 0, unop(Iop_F32toF64, mkexpr(f32lo)) );
   if (isAvx)
      putYMMRegLane128( rG, 1, mkV128(0));
   return delta;
}
   9869 
   9870 
/* Disassemble VCVTPS2PD, 256-bit form: widen four F32 lanes of the E
   operand (xmm reg or mem128) to four F64 lanes in the G ymm
   register.  Returns the updated 'delta'. */
static Long dis_CVTPS2PD_256 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp f32_0 = newTemp(Ity_F32);
   IRTemp f32_1 = newTemp(Ity_F32);
   IRTemp f32_2 = newTemp(Ity_F32);
   IRTemp f32_3 = newTemp(Ity_F32);
   UChar  modrm = getUChar(delta);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( f32_0, getXMMRegLane32F(rE, 0) );
      assign( f32_1, getXMMRegLane32F(rE, 1) );
      assign( f32_2, getXMMRegLane32F(rE, 2) );
      assign( f32_3, getXMMRegLane32F(rE, 3) );
      delta += 1;
      DIP("vcvtps2pd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
   } else {
      /* Memory form: four consecutive 32-bit loads. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( f32_0, loadLE(Ity_F32, mkexpr(addr)) );
      assign( f32_1, loadLE(Ity_F32,
                            binop(Iop_Add64,mkexpr(addr),mkU64(4))) );
      assign( f32_2, loadLE(Ity_F32,
                            binop(Iop_Add64,mkexpr(addr),mkU64(8))) );
      assign( f32_3, loadLE(Ity_F32,
                            binop(Iop_Add64,mkexpr(addr),mkU64(12))) );
      delta += alen;
      DIP("vcvtps2pd %s,%s\n", dis_buf, nameYMMReg(rG));
   }

   /* F32 -> F64 widening is exact; no rounding mode needed. */
   putYMMRegLane64F( rG, 3, unop(Iop_F32toF64, mkexpr(f32_3)) );
   putYMMRegLane64F( rG, 2, unop(Iop_F32toF64, mkexpr(f32_2)) );
   putYMMRegLane64F( rG, 1, unop(Iop_F32toF64, mkexpr(f32_1)) );
   putYMMRegLane64F( rG, 0, unop(Iop_F32toF64, mkexpr(f32_0)) );
   return delta;
}
   9910 
   9911 
/* Disassemble (V)CVTPD2PS, 128-bit form: narrow the two F64 lanes of
   the E operand (xmm reg or mem128) to two F32 lanes in the low half
   of the G register, zeroing the upper two 32-bit lanes.  Narrowing
   uses the current SSE rounding mode.  The AVX form additionally
   zeroes the upper YMM lane.  Returns the updated 'delta'. */
static Long dis_CVTPD2PS_128 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp argV  = newTemp(Ity_V128);
   IRTemp rmode = newTemp(Ity_I32);
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getXMMReg(rE) );
      delta += 1;
      DIP("%scvtpd2ps %s,%s\n", isAvx ? "v" : "",
          nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      DIP("%scvtpd2ps %s,%s\n", isAvx ? "v" : "",
          dis_buf, nameXMMReg(rG) );
   }

   assign( rmode, get_sse_roundingmode() );
   /* Reinterpret the two 64-bit halves as F64 values. */
   IRTemp t0 = newTemp(Ity_F64);
   IRTemp t1 = newTemp(Ity_F64);
   assign( t0, unop(Iop_ReinterpI64asF64,
                    unop(Iop_V128to64, mkexpr(argV))) );
   assign( t1, unop(Iop_ReinterpI64asF64,
                    unop(Iop_V128HIto64, mkexpr(argV))) );

   /* Narrow each F64 to F32 under the SSE rounding mode. */
#  define CVT(_t)  binop( Iop_F64toF32, mkexpr(rmode), mkexpr(_t) )
   putXMMRegLane32(  rG, 3, mkU32(0) );
   putXMMRegLane32(  rG, 2, mkU32(0) );
   putXMMRegLane32F( rG, 1, CVT(t1) );
   putXMMRegLane32F( rG, 0, CVT(t0) );
#  undef CVT
   if (isAvx)
      putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}
   9955 
   9956 
/* Disassemble (V)CVTPS2DQ/(V)CVTTPS2DQ, 128-bit form: convert the
   four F32 lanes of the E operand (xmm reg or mem128) to four signed
   I32 lanes in the G register.  'r2zero' selects the truncating "T"
   form; otherwise the current SSE rounding mode applies.  The AVX
   form additionally zeroes the upper YMM lane.  Returns the updated
   'delta'. */
static Long dis_CVTxPS2DQ_128 ( VexAbiInfo* vbi, Prefix pfx,
                                Long delta, Bool isAvx, Bool r2zero )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp argV  = newTemp(Ity_V128);
   IRTemp rmode = newTemp(Ity_I32);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1, t2, t3;

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getXMMReg(rE) );
      delta += 1;
      DIP("%scvt%sps2dq %s,%s\n",
          isAvx ? "v" : "", r2zero ? "t" : "", nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      DIP("%scvt%sps2dq %s,%s\n",
          isAvx ? "v" : "", r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
   }

   /* "T" form truncates; otherwise honour MXCSR rounding. */
   assign( rmode, r2zero ? mkU32((UInt)Irrm_ZERO)
                         : get_sse_roundingmode() );
   t0 = t1 = t2 = t3 = IRTemp_INVALID;
   breakupV128to32s( argV, &t3, &t2, &t1, &t0 );
   /* This is less than ideal.  If it turns out to be a performance
      bottleneck it can be improved. */
   /* Per lane: reinterpret I32 as F32, widen to F64 (exact), then
      convert to I32 under 'rmode'. */
#  define CVT(_t)                             \
      binop( Iop_F64toI32S,                   \
             mkexpr(rmode),                   \
             unop( Iop_F32toF64,              \
                   unop( Iop_ReinterpI32asF32, mkexpr(_t))) )

   putXMMRegLane32( rG, 3, CVT(t3) );
   putXMMRegLane32( rG, 2, CVT(t2) );
   putXMMRegLane32( rG, 1, CVT(t1) );
   putXMMRegLane32( rG, 0, CVT(t0) );
#  undef CVT
   if (isAvx)
      putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}
   10005 
   10006 
/* Disassemble VCVTPS2DQ/VCVTTPS2DQ, 256-bit form: convert the eight
   F32 lanes of the E operand (ymm reg or mem256) to eight signed I32
   lanes in the G ymm register.  'r2zero' selects the truncating "T"
   form; otherwise the current SSE rounding mode applies.  Returns
   the updated 'delta'. */
static Long dis_CVTxPS2DQ_256 ( VexAbiInfo* vbi, Prefix pfx,
                                Long delta, Bool r2zero )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp argV  = newTemp(Ity_V256);
   IRTemp rmode = newTemp(Ity_I32);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1, t2, t3, t4, t5, t6, t7;

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getYMMReg(rE) );
      delta += 1;
      DIP("vcvt%sps2dq %s,%s\n",
          r2zero ? "t" : "", nameYMMReg(rE), nameYMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
      delta += alen;
      DIP("vcvt%sps2dq %s,%s\n",
          r2zero ? "t" : "", dis_buf, nameYMMReg(rG) );
   }

   /* "T" form truncates; otherwise honour MXCSR rounding. */
   assign( rmode, r2zero ? mkU32((UInt)Irrm_ZERO)
                         : get_sse_roundingmode() );
   t0 = t1 = t2 = t3 = t4 = t5 = t6 = t7 = IRTemp_INVALID;
   breakupV256to32s( argV, &t7, &t6, &t5, &t4, &t3, &t2, &t1, &t0 );
   /* This is less than ideal.  If it turns out to be a performance
      bottleneck it can be improved. */
   /* Per lane: reinterpret I32 as F32, widen to F64 (exact), then
      convert to I32 under 'rmode'. */
#  define CVT(_t)                             \
      binop( Iop_F64toI32S,                   \
             mkexpr(rmode),                   \
             unop( Iop_F32toF64,              \
                   unop( Iop_ReinterpI32asF32, mkexpr(_t))) )

   putYMMRegLane32( rG, 7, CVT(t7) );
   putYMMRegLane32( rG, 6, CVT(t6) );
   putYMMRegLane32( rG, 5, CVT(t5) );
   putYMMRegLane32( rG, 4, CVT(t4) );
   putYMMRegLane32( rG, 3, CVT(t3) );
   putYMMRegLane32( rG, 2, CVT(t2) );
   putYMMRegLane32( rG, 1, CVT(t1) );
   putYMMRegLane32( rG, 0, CVT(t0) );
#  undef CVT

   return delta;
}
   10057 
   10058 
/* Disassemble (V)CVTPD2DQ/(V)CVTTPD2DQ, 128-bit form: convert the
   two F64 lanes of the E operand (xmm reg or mem128) to two signed
   I32 lanes in the low half of the G register, zeroing the upper two
   32-bit lanes.  'r2zero' selects the truncating "T" form; otherwise
   the current SSE rounding mode applies.  The AVX form additionally
   zeroes the upper YMM lane.  Returns the updated 'delta'. */
static Long dis_CVTxPD2DQ_128 ( VexAbiInfo* vbi, Prefix pfx,
                                Long delta, Bool isAvx, Bool r2zero )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp argV  = newTemp(Ity_V128);
   IRTemp rmode = newTemp(Ity_I32);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1;

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getXMMReg(rE) );
      delta += 1;
      DIP("%scvt%spd2dq %s,%s\n",
          isAvx ? "v" : "", r2zero ? "t" : "", nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      /* AT&T-style "x" suffix disambiguates the 128-bit memory
         form. */
      DIP("%scvt%spd2dqx %s,%s\n",
          isAvx ? "v" : "", r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
   }

   /* "T" form truncates; otherwise honour MXCSR rounding. */
   if (r2zero) {
      assign(rmode, mkU32((UInt)Irrm_ZERO) );
   } else {
      assign( rmode, get_sse_roundingmode() );
   }

   /* Reinterpret the two 64-bit halves as F64 values. */
   t0 = newTemp(Ity_F64);
   t1 = newTemp(Ity_F64);
   assign( t0, unop(Iop_ReinterpI64asF64,
                    unop(Iop_V128to64, mkexpr(argV))) );
   assign( t1, unop(Iop_ReinterpI64asF64,
                    unop(Iop_V128HIto64, mkexpr(argV))) );

#  define CVT(_t)  binop( Iop_F64toI32S,                   \
                          mkexpr(rmode),                   \
                          mkexpr(_t) )

   putXMMRegLane32( rG, 3, mkU32(0) );
   putXMMRegLane32( rG, 2, mkU32(0) );
   putXMMRegLane32( rG, 1, CVT(t1) );
   putXMMRegLane32( rG, 0, CVT(t0) );
#  undef CVT
   if (isAvx)
      putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}
   10112 
   10113 
/* Handles 256-bit VCVTPD2DQ and VCVTTPD2DQ: convert four F64 lanes of
   a ymm/memory source to four I32 lanes in xmm G, zeroing the upper
   128 bits of ymm G.  r2zero selects the truncating (CVTT) variant.
   Returns the updated instruction delta. */
static Long dis_CVTxPD2DQ_256 ( VexAbiInfo* vbi, Prefix pfx,
                                Long delta, Bool r2zero )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp argV  = newTemp(Ity_V256);
   IRTemp rmode = newTemp(Ity_I32);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1, t2, t3;

   if (epartIsReg(modrm)) {
      /* Register source: ymm E. */
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getYMMReg(rE) );
      delta += 1;
      DIP("vcvt%spd2dq %s,%s\n",
          r2zero ? "t" : "", nameYMMReg(rE), nameXMMReg(rG));
   } else {
      /* Memory source: 256-bit little-endian load. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
      delta += alen;
      DIP("vcvt%spd2dqy %s,%s\n",
          r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
   }

   /* CVTT forms truncate; plain forms use the current SSE rounding mode. */
   if (r2zero) {
      assign(rmode, mkU32((UInt)Irrm_ZERO) );
   } else {
      assign( rmode, get_sse_roundingmode() );
   }

   t0 = IRTemp_INVALID;
   t1 = IRTemp_INVALID;
   t2 = IRTemp_INVALID;
   t3 = IRTemp_INVALID;
   breakupV256to64s( argV, &t3, &t2, &t1, &t0 );

   /* Each 64-bit chunk is reinterpreted as F64 then narrowed to I32. */
#  define CVT(_t)  binop( Iop_F64toI32S,                   \
                          mkexpr(rmode),                   \
                          unop( Iop_ReinterpI64asF64,      \
                                mkexpr(_t) ) )

   putXMMRegLane32( rG, 3, CVT(t3) );
   putXMMRegLane32( rG, 2, CVT(t2) );
   putXMMRegLane32( rG, 1, CVT(t1) );
   putXMMRegLane32( rG, 0, CVT(t0) );
#  undef CVT
   /* VEX semantics: upper half of the destination ymm is zeroed. */
   putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}
   10166 
   10167 
/* Handles 128-bit CVTDQ2PS (and VCVTDQ2PS when isAvx): convert four
   I32 lanes to four F32 lanes using the current SSE rounding mode.
   Returns the updated instruction delta. */
static Long dis_CVTDQ2PS_128 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp argV  = newTemp(Ity_V128);
   IRTemp rmode = newTemp(Ity_I32);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1, t2, t3;

   if (epartIsReg(modrm)) {
      /* Register source: xmm E. */
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getXMMReg(rE) );
      delta += 1;
      DIP("%scvtdq2ps %s,%s\n",
          isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
   } else {
      /* Memory source: 128-bit little-endian load. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      DIP("%scvtdq2ps %s,%s\n",
          isAvx ? "v" : "", dis_buf, nameXMMReg(rG) );
   }

   assign( rmode, get_sse_roundingmode() );
   t0 = IRTemp_INVALID;
   t1 = IRTemp_INVALID;
   t2 = IRTemp_INVALID;
   t3 = IRTemp_INVALID;
   breakupV128to32s( argV, &t3, &t2, &t1, &t0 );

   /* Widen I32 to F64 (exact), then round down to F32. */
#  define CVT(_t)  binop( Iop_F64toF32,                    \
                          mkexpr(rmode),                   \
                          unop(Iop_I32StoF64,mkexpr(_t)))

   putXMMRegLane32F( rG, 3, CVT(t3) );
   putXMMRegLane32F( rG, 2, CVT(t2) );
   putXMMRegLane32F( rG, 1, CVT(t1) );
   putXMMRegLane32F( rG, 0, CVT(t0) );
#  undef CVT
   if (isAvx)
      putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}
   10215 
/* Handles 256-bit VCVTDQ2PS: convert eight I32 lanes to eight F32
   lanes using the current SSE rounding mode.  Returns the updated
   instruction delta. */
static Long dis_CVTDQ2PS_256 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   UChar  modrm  = getUChar(delta);
   IRTemp argV   = newTemp(Ity_V256);
   IRTemp rmode  = newTemp(Ity_I32);
   UInt   rG     = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1, t2, t3, t4, t5, t6, t7;

   if (epartIsReg(modrm)) {
      /* Register source: ymm E. */
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getYMMReg(rE) );
      delta += 1;
      DIP("vcvtdq2ps %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
   } else {
      /* Memory source: 256-bit little-endian load. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
      delta += alen;
      DIP("vcvtdq2ps %s,%s\n", dis_buf, nameYMMReg(rG) );
   }

   assign( rmode, get_sse_roundingmode() );
   t0 = IRTemp_INVALID;
   t1 = IRTemp_INVALID;
   t2 = IRTemp_INVALID;
   t3 = IRTemp_INVALID;
   t4 = IRTemp_INVALID;
   t5 = IRTemp_INVALID;
   t6 = IRTemp_INVALID;
   t7 = IRTemp_INVALID;
   breakupV256to32s( argV, &t7, &t6, &t5, &t4, &t3, &t2, &t1, &t0 );

   /* Widen I32 to F64 (exact), then round down to F32. */
#  define CVT(_t)  binop( Iop_F64toF32,                    \
                          mkexpr(rmode),                   \
                          unop(Iop_I32StoF64,mkexpr(_t)))

   putYMMRegLane32F( rG, 7, CVT(t7) );
   putYMMRegLane32F( rG, 6, CVT(t6) );
   putYMMRegLane32F( rG, 5, CVT(t5) );
   putYMMRegLane32F( rG, 4, CVT(t4) );
   putYMMRegLane32F( rG, 3, CVT(t3) );
   putYMMRegLane32F( rG, 2, CVT(t2) );
   putYMMRegLane32F( rG, 1, CVT(t1) );
   putYMMRegLane32F( rG, 0, CVT(t0) );
#  undef CVT

   return delta;
}
   10267 
   10268 
/* Handles 128-bit (V)PMOVMSKB, register source only: gather the top
   bit of each of the 16 bytes of xmm E into the low 16 bits of r32 G,
   via a clean helper call.  Returns the updated instruction delta. */
static Long dis_PMOVMSKB_128 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx )
{
   /* ULong amd64g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo ); */
   UChar modrm = getUChar(delta);
   vassert(epartIsReg(modrm)); /* ensured by caller */
   UInt   rE = eregOfRexRM(pfx,modrm);
   UInt   rG = gregOfRexRM(pfx,modrm);
   IRTemp t0 = newTemp(Ity_I64);
   IRTemp t1 = newTemp(Ity_I64);
   IRTemp t5 = newTemp(Ity_I64);
   assign(t0, getXMMRegLane64(rE, 0));
   assign(t1, getXMMRegLane64(rE, 1));
   /* Helper takes (high half, low half) and returns the 16-bit mask. */
   assign(t5, mkIRExprCCall( Ity_I64, 0/*regparms*/,
                             "amd64g_calculate_sse_pmovmskb",
                             &amd64g_calculate_sse_pmovmskb,
                             mkIRExprVec_2( mkexpr(t1), mkexpr(t0) )));
   /* Writing the 32-bit register also zeroes the upper 32 bits of G. */
   putIReg32(rG, unop(Iop_64to32,mkexpr(t5)));
   DIP("%spmovmskb %s,%s\n", isAvx ? "v" : "", nameXMMReg(rE),
       nameIReg32(rG));
   delta += 1;
   return delta;
}
   10292 
   10293 
   10294 /* FIXME: why not just use InterleaveLO / InterleaveHI?  I think the
   10295    relevant ops are "xIsH ? InterleaveHI32x4 : InterleaveLO32x4". */
   10296 /* Does the maths for 128 bit versions of UNPCKLPS and UNPCKHPS */
   10297 static IRTemp math_UNPCKxPS_128 ( IRTemp sV, IRTemp dV, Bool xIsH )
   10298 {
   10299    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   10300    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   10301    breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   10302    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   10303    IRTemp res = newTemp(Ity_V128);
   10304    assign(res,  xIsH ? mkV128from32s( s3, d3, s2, d2 )
   10305                      : mkV128from32s( s1, d1, s0, d0 ));
   10306    return res;
   10307 }
   10308 
   10309 
   10310 /* FIXME: why not just use InterleaveLO / InterleaveHI ?? */
   10311 /* Does the maths for 128 bit versions of UNPCKLPD and UNPCKHPD */
   10312 static IRTemp math_UNPCKxPD_128 ( IRTemp sV, IRTemp dV, Bool xIsH )
   10313 {
   10314    IRTemp s1 = newTemp(Ity_I64);
   10315    IRTemp s0 = newTemp(Ity_I64);
   10316    IRTemp d1 = newTemp(Ity_I64);
   10317    IRTemp d0 = newTemp(Ity_I64);
   10318    assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
   10319    assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
   10320    assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
   10321    assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
   10322    IRTemp res = newTemp(Ity_V128);
   10323    assign(res, xIsH ? binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1))
   10324                     : binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)));
   10325    return res;
   10326 }
   10327 
   10328 
   10329 /* Does the maths for 256 bit versions of UNPCKLPD and UNPCKHPD.
   10330    Doesn't seem like this fits in either of the Iop_Interleave{LO,HI}
   10331    or the Iop_Cat{Odd,Even}Lanes idioms, hence just do it the stupid
   10332    way. */
   10333 static IRTemp math_UNPCKxPD_256 ( IRTemp sV, IRTemp dV, Bool xIsH )
   10334 {
   10335    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   10336    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   10337    breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
   10338    breakupV256to64s( sV, &s3, &s2, &s1, &s0 );
   10339    IRTemp res = newTemp(Ity_V256);
   10340    assign(res, xIsH
   10341                ? IRExpr_Qop(Iop_64x4toV256, mkexpr(s3), mkexpr(d3),
   10342                                             mkexpr(s1), mkexpr(d1))
   10343                : IRExpr_Qop(Iop_64x4toV256, mkexpr(s2), mkexpr(d2),
   10344                                             mkexpr(s0), mkexpr(d0)));
   10345    return res;
   10346 }
   10347 
   10348 
   10349 /* FIXME: this is really bad.  Surely can do something better here?
   10350    One observation is that the steering in the upper and lower 128 bit
   10351    halves is the same as with math_UNPCKxPS_128, so we simply split
   10352    into two halves, and use that.  Consequently any improvement in
   10353    math_UNPCKxPS_128 (probably, to use interleave-style primops)
   10354    benefits this too. */
   10355 static IRTemp math_UNPCKxPS_256 ( IRTemp sV, IRTemp dV, Bool xIsH )
   10356 {
   10357    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   10358    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   10359    breakupV256toV128s( sV, &sVhi, &sVlo );
   10360    breakupV256toV128s( dV, &dVhi, &dVlo );
   10361    IRTemp rVhi = math_UNPCKxPS_128(sVhi, dVhi, xIsH);
   10362    IRTemp rVlo = math_UNPCKxPS_128(sVlo, dVlo, xIsH);
   10363    IRTemp rV   = newTemp(Ity_V256);
   10364    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   10365    return rV;
   10366 }
   10367 
   10368 
   10369 static IRTemp math_SHUFPS_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   10370 {
   10371    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   10372    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   10373    vassert(imm8 < 256);
   10374 
   10375    breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   10376    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   10377 
   10378 #  define SELD(n) ((n)==0 ? d0 : ((n)==1 ? d1 : ((n)==2 ? d2 : d3)))
   10379 #  define SELS(n) ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   10380    IRTemp res = newTemp(Ity_V128);
   10381    assign(res,
   10382           mkV128from32s( SELS((imm8>>6)&3), SELS((imm8>>4)&3),
   10383                          SELD((imm8>>2)&3), SELD((imm8>>0)&3) ) );
   10384 #  undef SELD
   10385 #  undef SELS
   10386    return res;
   10387 }
   10388 
   10389 
   10390 /* 256-bit SHUFPS appears to steer each of the 128-bit halves
   10391    identically.  Hence do the clueless thing and use math_SHUFPS_128
   10392    twice. */
   10393 static IRTemp math_SHUFPS_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
   10394 {
   10395    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   10396    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   10397    breakupV256toV128s( sV, &sVhi, &sVlo );
   10398    breakupV256toV128s( dV, &dVhi, &dVlo );
   10399    IRTemp rVhi = math_SHUFPS_128(sVhi, dVhi, imm8);
   10400    IRTemp rVlo = math_SHUFPS_128(sVlo, dVlo, imm8);
   10401    IRTemp rV   = newTemp(Ity_V256);
   10402    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   10403    return rV;
   10404 }
   10405 
   10406 
   10407 static IRTemp math_SHUFPD_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   10408 {
   10409    IRTemp s1 = newTemp(Ity_I64);
   10410    IRTemp s0 = newTemp(Ity_I64);
   10411    IRTemp d1 = newTemp(Ity_I64);
   10412    IRTemp d0 = newTemp(Ity_I64);
   10413 
   10414    assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
   10415    assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
   10416    assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
   10417    assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
   10418 
   10419 #  define SELD(n) mkexpr((n)==0 ? d0 : d1)
   10420 #  define SELS(n) mkexpr((n)==0 ? s0 : s1)
   10421 
   10422    IRTemp res = newTemp(Ity_V128);
   10423    assign(res, binop( Iop_64HLtoV128,
   10424                       SELS((imm8>>1)&1), SELD((imm8>>0)&1) ) );
   10425 
   10426 #  undef SELD
   10427 #  undef SELS
   10428    return res;
   10429 }
   10430 
   10431 
   10432 static IRTemp math_SHUFPD_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
   10433 {
   10434    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   10435    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   10436    breakupV256toV128s( sV, &sVhi, &sVlo );
   10437    breakupV256toV128s( dV, &dVhi, &dVlo );
   10438    IRTemp rVhi = math_SHUFPD_128(sVhi, dVhi, (imm8 >> 2) & 3);
   10439    IRTemp rVlo = math_SHUFPD_128(sVlo, dVlo, imm8 & 3);
   10440    IRTemp rV   = newTemp(Ity_V256);
   10441    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   10442    return rV;
   10443 }
   10444 
   10445 
   10446 static IRTemp math_BLENDPD_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   10447 {
   10448    UShort imm8_mask_16;
   10449    IRTemp imm8_mask = newTemp(Ity_V128);
   10450 
   10451    switch( imm8 & 3 ) {
   10452       case 0:  imm8_mask_16 = 0x0000; break;
   10453       case 1:  imm8_mask_16 = 0x00FF; break;
   10454       case 2:  imm8_mask_16 = 0xFF00; break;
   10455       case 3:  imm8_mask_16 = 0xFFFF; break;
   10456       default: vassert(0);            break;
   10457    }
   10458    assign( imm8_mask, mkV128( imm8_mask_16 ) );
   10459 
   10460    IRTemp res = newTemp(Ity_V128);
   10461    assign ( res, binop( Iop_OrV128,
   10462                         binop( Iop_AndV128, mkexpr(sV),
   10463                                             mkexpr(imm8_mask) ),
   10464                         binop( Iop_AndV128, mkexpr(dV),
   10465                                unop( Iop_NotV128, mkexpr(imm8_mask) ) ) ) );
   10466    return res;
   10467 }
   10468 
   10469 
   10470 static IRTemp math_BLENDPD_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
   10471 {
   10472    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   10473    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   10474    breakupV256toV128s( sV, &sVhi, &sVlo );
   10475    breakupV256toV128s( dV, &dVhi, &dVlo );
   10476    IRTemp rVhi = math_BLENDPD_128(sVhi, dVhi, (imm8 >> 2) & 3);
   10477    IRTemp rVlo = math_BLENDPD_128(sVlo, dVlo, imm8 & 3);
   10478    IRTemp rV   = newTemp(Ity_V256);
   10479    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   10480    return rV;
   10481 }
   10482 
   10483 
   10484 static IRTemp math_BLENDPS_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   10485 {
   10486    UShort imm8_perms[16] = { 0x0000, 0x000F, 0x00F0, 0x00FF, 0x0F00,
   10487                              0x0F0F, 0x0FF0, 0x0FFF, 0xF000, 0xF00F,
   10488                              0xF0F0, 0xF0FF, 0xFF00, 0xFF0F, 0xFFF0,
   10489                              0xFFFF };
   10490    IRTemp imm8_mask = newTemp(Ity_V128);
   10491    assign( imm8_mask, mkV128( imm8_perms[ (imm8 & 15) ] ) );
   10492 
   10493    IRTemp res = newTemp(Ity_V128);
   10494    assign ( res, binop( Iop_OrV128,
   10495                         binop( Iop_AndV128, mkexpr(sV),
   10496                                             mkexpr(imm8_mask) ),
   10497                         binop( Iop_AndV128, mkexpr(dV),
   10498                                unop( Iop_NotV128, mkexpr(imm8_mask) ) ) ) );
   10499    return res;
   10500 }
   10501 
   10502 
   10503 static IRTemp math_BLENDPS_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
   10504 {
   10505    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   10506    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   10507    breakupV256toV128s( sV, &sVhi, &sVlo );
   10508    breakupV256toV128s( dV, &dVhi, &dVlo );
   10509    IRTemp rVhi = math_BLENDPS_128(sVhi, dVhi, (imm8 >> 4) & 15);
   10510    IRTemp rVlo = math_BLENDPS_128(sVlo, dVlo, imm8 & 15);
   10511    IRTemp rV   = newTemp(Ity_V256);
   10512    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   10513    return rV;
   10514 }
   10515 
   10516 
   10517 static IRTemp math_PBLENDW_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   10518 {
   10519    /* Make w be a 16-bit version of imm8, formed by duplicating each
   10520       bit in imm8. */
   10521    Int i;
   10522    UShort imm16 = 0;
   10523    for (i = 0; i < 8; i++) {
   10524       if (imm8 & (1 << i))
   10525          imm16 |= (3 << (2*i));
   10526    }
   10527    IRTemp imm16_mask = newTemp(Ity_V128);
   10528    assign( imm16_mask, mkV128( imm16 ));
   10529 
   10530    IRTemp res = newTemp(Ity_V128);
   10531    assign ( res, binop( Iop_OrV128,
   10532                         binop( Iop_AndV128, mkexpr(sV),
   10533                                             mkexpr(imm16_mask) ),
   10534                         binop( Iop_AndV128, mkexpr(dV),
   10535                                unop( Iop_NotV128, mkexpr(imm16_mask) ) ) ) );
   10536    return res;
   10537 }
   10538 
   10539 
   10540 static IRTemp math_PMULUDQ_128 ( IRTemp sV, IRTemp dV )
   10541 {
   10542    /* This is a really poor translation -- could be improved if
   10543       performance critical */
   10544    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   10545    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   10546    breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   10547    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   10548    IRTemp res = newTemp(Ity_V128);
   10549    assign(res, binop(Iop_64HLtoV128,
   10550                      binop( Iop_MullU32, mkexpr(d2), mkexpr(s2)),
   10551                      binop( Iop_MullU32, mkexpr(d0), mkexpr(s0)) ));
   10552    return res;
   10553 }
   10554 
   10555 
   10556 static IRTemp math_PMULDQ_128 ( IRTemp dV, IRTemp sV )
   10557 {
   10558    /* This is a really poor translation -- could be improved if
   10559       performance critical */
   10560    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   10561    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   10562    breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   10563    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   10564    IRTemp res = newTemp(Ity_V128);
   10565    assign(res, binop(Iop_64HLtoV128,
   10566                      binop( Iop_MullS32, mkexpr(d2), mkexpr(s2)),
   10567                      binop( Iop_MullS32, mkexpr(d0), mkexpr(s0)) ));
   10568    return res;
   10569 }
   10570 
   10571 
/* Does the maths for 128-bit PMADDWD by applying the 64-bit MMX
   pmaddwd clean helper to each half of the operands and recombining
   the two 64-bit results. */
static IRTemp math_PMADDWD_128 ( IRTemp dV, IRTemp sV )
{
   IRTemp sVhi, sVlo, dVhi, dVlo;
   IRTemp resHi = newTemp(Ity_I64);
   IRTemp resLo = newTemp(Ity_I64);
   sVhi = sVlo = dVhi = dVlo = IRTemp_INVALID;
   breakupV128to64s( sV, &sVhi, &sVlo );
   breakupV128to64s( dV, &dVhi, &dVlo );
   /* One helper call per 64-bit half. */
   assign( resHi, mkIRExprCCall(Ity_I64, 0/*regparms*/,
                                "amd64g_calculate_mmx_pmaddwd",
                                &amd64g_calculate_mmx_pmaddwd,
                                mkIRExprVec_2( mkexpr(sVhi), mkexpr(dVhi))));
   assign( resLo, mkIRExprCCall(Ity_I64, 0/*regparms*/,
                                "amd64g_calculate_mmx_pmaddwd",
                                &amd64g_calculate_mmx_pmaddwd,
                                mkIRExprVec_2( mkexpr(sVlo), mkexpr(dVlo))));
   IRTemp res = newTemp(Ity_V128);
   assign( res, binop(Iop_64HLtoV128, mkexpr(resHi), mkexpr(resLo))) ;
   return res;
}
   10592 
   10593 
   10594 static IRTemp math_ADDSUBPD_128 ( IRTemp dV, IRTemp sV )
   10595 {
   10596    IRTemp addV = newTemp(Ity_V128);
   10597    IRTemp subV = newTemp(Ity_V128);
   10598    IRTemp a1   = newTemp(Ity_I64);
   10599    IRTemp s0   = newTemp(Ity_I64);
   10600 
   10601    assign( addV, binop(Iop_Add64Fx2, mkexpr(dV), mkexpr(sV)) );
   10602    assign( subV, binop(Iop_Sub64Fx2, mkexpr(dV), mkexpr(sV)) );
   10603 
   10604    assign( a1, unop(Iop_V128HIto64, mkexpr(addV) ));
   10605    assign( s0, unop(Iop_V128to64,   mkexpr(subV) ));
   10606 
   10607    IRTemp res = newTemp(Ity_V128);
   10608    assign( res, binop(Iop_64HLtoV128, mkexpr(a1), mkexpr(s0)) );
   10609    return res;
   10610 }
   10611 
   10612 
   10613 static IRTemp math_ADDSUBPD_256 ( IRTemp dV, IRTemp sV )
   10614 {
   10615    IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
   10616    IRTemp addV = newTemp(Ity_V256);
   10617    IRTemp subV = newTemp(Ity_V256);
   10618    a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   10619 
   10620    assign( addV, binop(Iop_Add64Fx4, mkexpr(dV), mkexpr(sV)) );
   10621    assign( subV, binop(Iop_Sub64Fx4, mkexpr(dV), mkexpr(sV)) );
   10622 
   10623    breakupV256to64s( addV, &a3, &a2, &a1, &a0 );
   10624    breakupV256to64s( subV, &s3, &s2, &s1, &s0 );
   10625 
   10626    IRTemp res = newTemp(Ity_V256);
   10627    assign( res, mkV256from64s( a3, s2, a1, s0 ) );
   10628    return res;
   10629 }
   10630 
   10631 
   10632 static IRTemp math_ADDSUBPS_128 ( IRTemp dV, IRTemp sV )
   10633 {
   10634    IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
   10635    IRTemp addV = newTemp(Ity_V128);
   10636    IRTemp subV = newTemp(Ity_V128);
   10637    a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   10638 
   10639    assign( addV, binop(Iop_Add32Fx4, mkexpr(dV), mkexpr(sV)) );
   10640    assign( subV, binop(Iop_Sub32Fx4, mkexpr(dV), mkexpr(sV)) );
   10641 
   10642    breakupV128to32s( addV, &a3, &a2, &a1, &a0 );
   10643    breakupV128to32s( subV, &s3, &s2, &s1, &s0 );
   10644 
   10645    IRTemp res = newTemp(Ity_V128);
   10646    assign( res, mkV128from32s( a3, s2, a1, s0 ) );
   10647    return res;
   10648 }
   10649 
   10650 
   10651 static IRTemp math_ADDSUBPS_256 ( IRTemp dV, IRTemp sV )
   10652 {
   10653    IRTemp a7, a6, a5, a4, a3, a2, a1, a0;
   10654    IRTemp s7, s6, s5, s4, s3, s2, s1, s0;
   10655    IRTemp addV = newTemp(Ity_V256);
   10656    IRTemp subV = newTemp(Ity_V256);
   10657    a7 = a6 = a5 = a4 = a3 = a2 = a1 = a0 = IRTemp_INVALID;
   10658    s7 = s6 = s5 = s4 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   10659 
   10660    assign( addV, binop(Iop_Add32Fx8, mkexpr(dV), mkexpr(sV)) );
   10661    assign( subV, binop(Iop_Sub32Fx8, mkexpr(dV), mkexpr(sV)) );
   10662 
   10663    breakupV256to32s( addV, &a7, &a6, &a5, &a4, &a3, &a2, &a1, &a0 );
   10664    breakupV256to32s( subV, &s7, &s6, &s5, &s4, &s3, &s2, &s1, &s0 );
   10665 
   10666    IRTemp res = newTemp(Ity_V256);
   10667    assign( res, mkV256from32s( a7, s6, a5, s4, a3, s2, a1, s0 ) );
   10668    return res;
   10669 }
   10670 
   10671 
   10672 /* Handle 128 bit PSHUFLW and PSHUFHW. */
/* Handle 128 bit PSHUFLW and PSHUFHW (and their AVX forms when isAvx
   is True).  xIsH selects PSHUFHW (permute the high 4 words) vs
   PSHUFLW (permute the low 4 words); the other half of the source is
   copied through unchanged.  Returns the updated instruction delta. */
static Long dis_PSHUFxW_128 ( VexAbiInfo* vbi, Prefix pfx,
                              Long delta, Bool isAvx, Bool xIsH )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   UInt   rG = gregOfRexRM(pfx,modrm);
   UInt   imm8;
   IRTemp sVmut, dVmut, sVcon, sV, dV, s3, s2, s1, s0;
   s3 = s2 = s1 = s0 = IRTemp_INVALID;
   sV    = newTemp(Ity_V128);
   dV    = newTemp(Ity_V128);
   sVmut = newTemp(Ity_I64);   /* half that gets permuted */
   dVmut = newTemp(Ity_I64);   /* permuted result of sVmut */
   sVcon = newTemp(Ity_I64);   /* half that is copied through */
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( sV, getXMMReg(rE) );
      imm8 = (UInt)getUChar(delta+1);
      delta += 1+1;
      DIP("%spshuf%cw $%u,%s,%s\n",
          isAvx ? "v" : "", xIsH ? 'h' : 'l',
          imm8, nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
      imm8 = (UInt)getUChar(delta+alen);
      delta += alen+1;
      DIP("%spshuf%cw $%u,%s,%s\n",
          isAvx ? "v" : "", xIsH ? 'h' : 'l',
          imm8, dis_buf, nameXMMReg(rG));
   }

   /* Get the to-be-changed (mut) and unchanging (con) bits of the
      source. */
   assign( sVmut, unop(xIsH ? Iop_V128HIto64 : Iop_V128to64,   mkexpr(sV)) );
   assign( sVcon, unop(xIsH ? Iop_V128to64   : Iop_V128HIto64, mkexpr(sV)) );

   /* Permute the four 16-bit lanes of sVmut per 2-bit fields of imm8. */
   breakup64to16s( sVmut, &s3, &s2, &s1, &s0 );
#  define SEL(n) \
             ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   assign(dVmut, mk64from16s( SEL((imm8>>6)&3), SEL((imm8>>4)&3),
                              SEL((imm8>>2)&3), SEL((imm8>>0)&3) ));
#  undef SEL

   /* Reassemble: permuted half goes where it came from. */
   assign(dV, xIsH ? binop(Iop_64HLtoV128, mkexpr(dVmut), mkexpr(sVcon))
                   : binop(Iop_64HLtoV128, mkexpr(sVcon), mkexpr(dVmut)) );

   /* AVX form zeroes the upper ymm lane. */
   (isAvx ? putYMMRegLoAndZU : putXMMReg)(rG, mkexpr(dV));
   return delta;
}
   10725 
   10726 
/* Handles the register-source-only form of (V)PEXTRW: extract the
   16-bit lane of xmm E selected by imm8[2:0], zero-extend it and put
   it in r32 G.  Returns the original delta (a FAIL signal to the
   caller) if the encoding has a memory operand. */
static Long dis_PEXTRW_128_EregOnly_toG ( VexAbiInfo* vbi, Prefix pfx,
                                          Long delta, Bool isAvx )
{
   Long   deltaIN = delta;
   UChar  modrm   = getUChar(delta);
   UInt   rG      = gregOfRexRM(pfx,modrm);
   IRTemp sV      = newTemp(Ity_V128);
   IRTemp d16     = newTemp(Ity_I16);
   UInt   imm8;
   IRTemp s0, s1, s2, s3;
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign(sV, getXMMReg(rE));
      imm8 = getUChar(delta+1) & 7; /* only the lane index bits matter */
      delta += 1+1;
      DIP("%spextrw $%d,%s,%s\n", isAvx ? "v" : "",
          (Int)imm8, nameXMMReg(rE), nameIReg32(rG));
   } else {
      /* The memory case is disallowed, apparently. */
      return deltaIN; /* FAIL */
   }
   /* Split into 32-bit lanes, then pick the right 16-bit half. */
   s3 = s2 = s1 = s0 = IRTemp_INVALID;
   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   switch (imm8) {
      case 0:  assign(d16, unop(Iop_32to16,   mkexpr(s0))); break;
      case 1:  assign(d16, unop(Iop_32HIto16, mkexpr(s0))); break;
      case 2:  assign(d16, unop(Iop_32to16,   mkexpr(s1))); break;
      case 3:  assign(d16, unop(Iop_32HIto16, mkexpr(s1))); break;
      case 4:  assign(d16, unop(Iop_32to16,   mkexpr(s2))); break;
      case 5:  assign(d16, unop(Iop_32HIto16, mkexpr(s2))); break;
      case 6:  assign(d16, unop(Iop_32to16,   mkexpr(s3))); break;
      case 7:  assign(d16, unop(Iop_32HIto16, mkexpr(s3))); break;
      default: vassert(0);
   }
   putIReg32(rG, unop(Iop_16Uto32, mkexpr(d16)));
   return delta;
}
   10764 
   10765 
   10766 static Long dis_CVTDQ2PD_128 ( VexAbiInfo* vbi, Prefix pfx,
   10767                                Long delta, Bool isAvx )
   10768 {
   10769    IRTemp addr  = IRTemp_INVALID;
   10770    Int    alen  = 0;
   10771    HChar  dis_buf[50];
   10772    UChar  modrm = getUChar(delta);
   10773    IRTemp arg64 = newTemp(Ity_I64);
   10774    UInt   rG    = gregOfRexRM(pfx,modrm);
   10775    UChar* mbV   = isAvx ? "v" : "";
   10776    if (epartIsReg(modrm)) {
   10777       UInt rE = eregOfRexRM(pfx,modrm);
   10778       assign( arg64, getXMMRegLane64(rE, 0) );
   10779       delta += 1;
   10780       DIP("%scvtdq2pd %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG));
   10781    } else {
   10782       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   10783       assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   10784       delta += alen;
   10785       DIP("%scvtdq2pd %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
   10786    }
   10787    putXMMRegLane64F(
   10788       rG, 0,
   10789       unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)))
   10790    );
   10791    putXMMRegLane64F(
   10792       rG, 1,
   10793       unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)))
   10794    );
   10795    if (isAvx)
   10796       putYMMRegLane128(rG, 1, mkV128(0));
   10797    return delta;
   10798 }
   10799 
   10800 
/* Handles (V)STMXCSR: store a synthesised %mxcsr to memory.  Only a
   memory destination with reg field 3 is legal; the caller has
   already checked this.  Returns the updated instruction delta. */
static Long dis_STMXCSR ( VexAbiInfo* vbi, Prefix pfx,
                          Long delta, Bool isAvx )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   vassert(!epartIsReg(modrm)); /* ensured by caller */
   vassert(gregOfRexRM(pfx,modrm) == 3); /* ditto */

   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   delta += alen;

   /* Fake up a native SSE mxcsr word.  The only thing it depends on
      is SSEROUND[1:0], so call a clean helper to cook it up.
   */
   /* ULong amd64g_create_mxcsr ( ULong sseround ) */
   DIP("%sstmxcsr %s\n",  isAvx ? "v" : "", dis_buf);
   storeLE(
      mkexpr(addr),
      unop(Iop_64to32,
           mkIRExprCCall(
              Ity_I64, 0/*regp*/,
              "amd64g_create_mxcsr", &amd64g_create_mxcsr,
              mkIRExprVec_1( unop(Iop_32Uto64,get_sse_roundingmode()) )
           )
      )
   );
   return delta;
}
   10831 
   10832 
/* Handles (V)LDMXCSR: load %mxcsr from memory.  Only the rounding
   mode is modelled; an emulation warning may be raised for
   unsupported control bits, causing a side exit.  Only a memory
   source with reg field 2 is legal; the caller has already checked
   this.  Returns the updated instruction delta. */
static Long dis_LDMXCSR ( VexAbiInfo* vbi, Prefix pfx,
                          Long delta, Bool isAvx )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   vassert(!epartIsReg(modrm)); /* ensured by caller */
   vassert(gregOfRexRM(pfx,modrm) == 2); /* ditto */

   IRTemp t64 = newTemp(Ity_I64);
   IRTemp ew  = newTemp(Ity_I32);

   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   delta += alen;
   DIP("%sldmxcsr %s\n",  isAvx ? "v" : "", dis_buf);

   /* The only thing we observe in %mxcsr is the rounding mode.
      Therefore, pass the 32-bit value (SSE native-format control
      word) to a clean helper, getting back a 64-bit value, the
      lower half of which is the SSEROUND value to store, and the
      upper half of which is the emulation-warning token which may
      be generated.
   */
   /* ULong amd64g_check_ldmxcsr ( ULong ); */
   assign( t64, mkIRExprCCall(
                   Ity_I64, 0/*regparms*/,
                   "amd64g_check_ldmxcsr",
                   &amd64g_check_ldmxcsr,
                   mkIRExprVec_1(
                      unop(Iop_32Uto64,
                           loadLE(Ity_I32, mkexpr(addr))
                      )
                   )
                )
         );

   put_sse_roundingmode( unop(Iop_64to32, mkexpr(t64)) );
   assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
   put_emwarn( mkexpr(ew) );
   /* Finally, if an emulation warning was reported, side-exit to
      the next insn, reporting the warning, so that Valgrind's
      dispatcher sees the warning. */
   stmt(
      IRStmt_Exit(
         binop(Iop_CmpNE64, unop(Iop_32Uto64,mkexpr(ew)), mkU64(0)),
         Ijk_EmWarn,
         IRConst_U64(guest_RIP_bbstart+delta),
         OFFB_RIP
      )
   );
   return delta;
}
   10886 
   10887 
   10888 static IRTemp math_PINSRW_128 ( IRTemp v128, IRTemp u16, UInt imm8 )
   10889 {
   10890    vassert(imm8 >= 0 && imm8 <= 7);
   10891 
   10892    // Create a V128 value which has the selected word in the
   10893    // specified lane, and zeroes everywhere else.
   10894    IRTemp tmp128    = newTemp(Ity_V128);
   10895    IRTemp halfshift = newTemp(Ity_I64);
   10896    assign(halfshift, binop(Iop_Shl64,
   10897                            unop(Iop_16Uto64, mkexpr(u16)),
   10898                            mkU8(16 * (imm8 & 3))));
   10899    if (imm8 < 4) {
   10900       assign(tmp128, binop(Iop_64HLtoV128, mkU64(0), mkexpr(halfshift)));
   10901    } else {
   10902       assign(tmp128, binop(Iop_64HLtoV128, mkexpr(halfshift), mkU64(0)));
   10903    }
   10904 
   10905    UShort mask = ~(3 << (imm8 * 2));
   10906    IRTemp res  = newTemp(Ity_V128);
   10907    assign( res, binop(Iop_OrV128,
   10908                       mkexpr(tmp128),
   10909                       binop(Iop_AndV128, mkexpr(v128), mkV128(mask))) );
   10910    return res;
   10911 }
   10912 
   10913 
   10914 static IRTemp math_PSADBW_128 ( IRTemp dV, IRTemp sV )
   10915 {
   10916    IRTemp s1, s0, d1, d0;
   10917    s1 = s0 = d1 = d0 = IRTemp_INVALID;
   10918 
   10919    breakupV128to64s( sV, &s1, &s0 );
   10920    breakupV128to64s( dV, &d1, &d0 );
   10921 
   10922    IRTemp res = newTemp(Ity_V128);
   10923    assign( res,
   10924            binop(Iop_64HLtoV128,
   10925                  mkIRExprCCall(Ity_I64, 0/*regparms*/,
   10926                                "amd64g_calculate_mmx_psadbw",
   10927                                &amd64g_calculate_mmx_psadbw,
   10928                                mkIRExprVec_2( mkexpr(s1), mkexpr(d1))),
   10929                  mkIRExprCCall(Ity_I64, 0/*regparms*/,
   10930                                "amd64g_calculate_mmx_psadbw",
   10931                                &amd64g_calculate_mmx_psadbw,
   10932                                mkIRExprVec_2( mkexpr(s0), mkexpr(d0)))) );
   10933    return res;
   10934 }
   10935 
   10936 
   10937 static Long dis_MASKMOVDQU ( VexAbiInfo* vbi, Prefix pfx,
   10938                              Long delta, Bool isAvx )
   10939 {
   10940    IRTemp regD    = newTemp(Ity_V128);
   10941    IRTemp mask    = newTemp(Ity_V128);
   10942    IRTemp olddata = newTemp(Ity_V128);
   10943    IRTemp newdata = newTemp(Ity_V128);
   10944    IRTemp addr    = newTemp(Ity_I64);
   10945    UChar  modrm   = getUChar(delta);
   10946    UInt   rG      = gregOfRexRM(pfx,modrm);
   10947    UInt   rE      = eregOfRexRM(pfx,modrm);
   10948 
   10949    assign( addr, handleAddrOverrides( vbi, pfx, getIReg64(R_RDI) ));
   10950    assign( regD, getXMMReg( rG ));
   10951 
   10952    /* Unfortunately can't do the obvious thing with SarN8x16
   10953       here since that can't be re-emitted as SSE2 code - no such
   10954       insn. */
   10955    assign( mask,
   10956            binop(Iop_64HLtoV128,
   10957                  binop(Iop_SarN8x8,
   10958                        getXMMRegLane64( eregOfRexRM(pfx,modrm), 1 ),
   10959                        mkU8(7) ),
   10960                  binop(Iop_SarN8x8,
   10961                        getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ),
   10962                        mkU8(7) ) ));
   10963    assign( olddata, loadLE( Ity_V128, mkexpr(addr) ));
   10964    assign( newdata, binop(Iop_OrV128,
   10965                           binop(Iop_AndV128,
   10966                                 mkexpr(regD),
   10967                                 mkexpr(mask) ),
   10968                           binop(Iop_AndV128,
   10969                                 mkexpr(olddata),
   10970                                 unop(Iop_NotV128, mkexpr(mask)))) );
   10971    storeLE( mkexpr(addr), mkexpr(newdata) );
   10972 
   10973    delta += 1;
   10974    DIP("%smaskmovdqu %s,%s\n", isAvx ? "v" : "",
   10975        nameXMMReg(rE), nameXMMReg(rG) );
   10976    return delta;
   10977 }
   10978 
   10979 
   10980 static Long dis_MOVMSKPS_128 ( VexAbiInfo* vbi, Prefix pfx,
   10981                                Long delta, Bool isAvx )
   10982 {
   10983    UChar modrm = getUChar(delta);
   10984    UInt   rG   = gregOfRexRM(pfx,modrm);
   10985    UInt   rE   = eregOfRexRM(pfx,modrm);
   10986    IRTemp t0   = newTemp(Ity_I32);
   10987    IRTemp t1   = newTemp(Ity_I32);
   10988    IRTemp t2   = newTemp(Ity_I32);
   10989    IRTemp t3   = newTemp(Ity_I32);
   10990    delta += 1;
   10991    assign( t0, binop( Iop_And32,
   10992                       binop(Iop_Shr32, getXMMRegLane32(rE,0), mkU8(31)),
   10993                       mkU32(1) ));
   10994    assign( t1, binop( Iop_And32,
   10995                       binop(Iop_Shr32, getXMMRegLane32(rE,1), mkU8(30)),
   10996                       mkU32(2) ));
   10997    assign( t2, binop( Iop_And32,
   10998                       binop(Iop_Shr32, getXMMRegLane32(rE,2), mkU8(29)),
   10999                       mkU32(4) ));
   11000    assign( t3, binop( Iop_And32,
   11001                       binop(Iop_Shr32, getXMMRegLane32(rE,3), mkU8(28)),
   11002                       mkU32(8) ));
   11003    putIReg32( rG, binop(Iop_Or32,
   11004                         binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
   11005                         binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ) );
   11006    DIP("%smovmskps %s,%s\n", isAvx ? "v" : "",
   11007        nameXMMReg(rE), nameIReg32(rG));
   11008    return delta;
   11009 }
   11010 
   11011 
   11012 static Long dis_MOVMSKPS_256 ( VexAbiInfo* vbi, Prefix pfx, Long delta )
   11013 {
   11014    UChar modrm = getUChar(delta);
   11015    UInt   rG   = gregOfRexRM(pfx,modrm);
   11016    UInt   rE   = eregOfRexRM(pfx,modrm);
   11017    IRTemp t0   = newTemp(Ity_I32);
   11018    IRTemp t1   = newTemp(Ity_I32);
   11019    IRTemp t2   = newTemp(Ity_I32);
   11020    IRTemp t3   = newTemp(Ity_I32);
   11021    IRTemp t4   = newTemp(Ity_I32);
   11022    IRTemp t5   = newTemp(Ity_I32);
   11023    IRTemp t6   = newTemp(Ity_I32);
   11024    IRTemp t7   = newTemp(Ity_I32);
   11025    delta += 1;
   11026    assign( t0, binop( Iop_And32,
   11027                       binop(Iop_Shr32, getYMMRegLane32(rE,0), mkU8(31)),
   11028                       mkU32(1) ));
   11029    assign( t1, binop( Iop_And32,
   11030                       binop(Iop_Shr32, getYMMRegLane32(rE,1), mkU8(30)),
   11031                       mkU32(2) ));
   11032    assign( t2, binop( Iop_And32,
   11033                       binop(Iop_Shr32, getYMMRegLane32(rE,2), mkU8(29)),
   11034                       mkU32(4) ));
   11035    assign( t3, binop( Iop_And32,
   11036                       binop(Iop_Shr32, getYMMRegLane32(rE,3), mkU8(28)),
   11037                       mkU32(8) ));
   11038    assign( t4, binop( Iop_And32,
   11039                       binop(Iop_Shr32, getYMMRegLane32(rE,4), mkU8(27)),
   11040                       mkU32(16) ));
   11041    assign( t5, binop( Iop_And32,
   11042                       binop(Iop_Shr32, getYMMRegLane32(rE,5), mkU8(26)),
   11043                       mkU32(32) ));
   11044    assign( t6, binop( Iop_And32,
   11045                       binop(Iop_Shr32, getYMMRegLane32(rE,6), mkU8(25)),
   11046                       mkU32(64) ));
   11047    assign( t7, binop( Iop_And32,
   11048                       binop(Iop_Shr32, getYMMRegLane32(rE,7), mkU8(24)),
   11049                       mkU32(128) ));
   11050    putIReg32( rG, binop(Iop_Or32,
   11051                         binop(Iop_Or32,
   11052                               binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
   11053                               binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ),
   11054                         binop(Iop_Or32,
   11055                               binop(Iop_Or32, mkexpr(t4), mkexpr(t5)),
   11056                               binop(Iop_Or32, mkexpr(t6), mkexpr(t7)) ) ) );
   11057    DIP("vmovmskps %s,%s\n", nameYMMReg(rE), nameIReg32(rG));
   11058    return delta;
   11059 }
   11060 
   11061 
   11062 static Long dis_MOVMSKPD_128 ( VexAbiInfo* vbi, Prefix pfx,
   11063                                Long delta, Bool isAvx )
   11064 {
   11065    UChar modrm = getUChar(delta);
   11066    UInt   rG   = gregOfRexRM(pfx,modrm);
   11067    UInt   rE   = eregOfRexRM(pfx,modrm);
   11068    IRTemp t0   = newTemp(Ity_I32);
   11069    IRTemp t1   = newTemp(Ity_I32);
   11070    delta += 1;
   11071    assign( t0, binop( Iop_And32,
   11072                       binop(Iop_Shr32, getXMMRegLane32(rE,1), mkU8(31)),
   11073                       mkU32(1) ));
   11074    assign( t1, binop( Iop_And32,
   11075                       binop(Iop_Shr32, getXMMRegLane32(rE,3), mkU8(30)),
   11076                       mkU32(2) ));
   11077    putIReg32( rG, binop(Iop_Or32, mkexpr(t0), mkexpr(t1) ) );
   11078    DIP("%smovmskpd %s,%s\n", isAvx ? "v" : "",
   11079        nameXMMReg(rE), nameIReg32(rG));
   11080    return delta;
   11081 }
   11082 
   11083 
   11084 static Long dis_MOVMSKPD_256 ( VexAbiInfo* vbi, Prefix pfx, Long delta )
   11085 {
   11086    UChar modrm = getUChar(delta);
   11087    UInt   rG   = gregOfRexRM(pfx,modrm);
   11088    UInt   rE   = eregOfRexRM(pfx,modrm);
   11089    IRTemp t0   = newTemp(Ity_I32);
   11090    IRTemp t1   = newTemp(Ity_I32);
   11091    IRTemp t2   = newTemp(Ity_I32);
   11092    IRTemp t3   = newTemp(Ity_I32);
   11093    delta += 1;
   11094    assign( t0, binop( Iop_And32,
   11095                       binop(Iop_Shr32, getYMMRegLane32(rE,1), mkU8(31)),
   11096                       mkU32(1) ));
   11097    assign( t1, binop( Iop_And32,
   11098                       binop(Iop_Shr32, getYMMRegLane32(rE,3), mkU8(30)),
   11099                       mkU32(2) ));
   11100    assign( t2, binop( Iop_And32,
   11101                       binop(Iop_Shr32, getYMMRegLane32(rE,5), mkU8(29)),
   11102                       mkU32(4) ));
   11103    assign( t3, binop( Iop_And32,
   11104                       binop(Iop_Shr32, getYMMRegLane32(rE,7), mkU8(28)),
   11105                       mkU32(8) ));
   11106    putIReg32( rG, binop(Iop_Or32,
   11107                         binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
   11108                         binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ) );
   11109    DIP("vmovmskps %s,%s\n", nameYMMReg(rE), nameIReg32(rG));
   11110    return delta;
   11111 }
   11112 
   11113 
   11114 /* Note, this also handles SSE(1) insns. */
   11115 __attribute__((noinline))
   11116 static
   11117 Long dis_ESC_0F__SSE2 ( Bool* decode_OK,
   11118                         VexAbiInfo* vbi,
   11119                         Prefix pfx, Int sz, Long deltaIN,
   11120                         DisResult* dres )
   11121 {
   11122    IRTemp addr  = IRTemp_INVALID;
   11123    IRTemp t0    = IRTemp_INVALID;
   11124    IRTemp t1    = IRTemp_INVALID;
   11125    IRTemp t2    = IRTemp_INVALID;
   11126    IRTemp t3    = IRTemp_INVALID;
   11127    IRTemp t4    = IRTemp_INVALID;
   11128    IRTemp t5    = IRTemp_INVALID;
   11129    IRTemp t6    = IRTemp_INVALID;
   11130    UChar  modrm = 0;
   11131    Int    alen  = 0;
   11132    HChar  dis_buf[50];
   11133 
   11134    *decode_OK = False;
   11135 
   11136    Long   delta = deltaIN;
   11137    UChar  opc   = getUChar(delta);
   11138    delta++;
   11139    switch (opc) {
   11140 
   11141    case 0x10:
   11142       if (have66noF2noF3(pfx)
   11143           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   11144          /* 66 0F 10 = MOVUPD -- move from E (mem or xmm) to G (xmm). */
   11145          modrm = getUChar(delta);
   11146          if (epartIsReg(modrm)) {
   11147             putXMMReg( gregOfRexRM(pfx,modrm),
   11148                        getXMMReg( eregOfRexRM(pfx,modrm) ));
   11149             DIP("movupd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   11150                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   11151             delta += 1;
   11152          } else {
   11153             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11154             putXMMReg( gregOfRexRM(pfx,modrm),
   11155                        loadLE(Ity_V128, mkexpr(addr)) );
   11156             DIP("movupd %s,%s\n", dis_buf,
   11157                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   11158             delta += alen;
   11159          }
   11160          goto decode_success;
   11161       }
   11162       /* F2 0F 10 = MOVSD -- move 64 bits from E (mem or lo half xmm) to
   11163          G (lo half xmm).  If E is mem, upper half of G is zeroed out.
   11164          If E is reg, upper half of G is unchanged. */
   11165       if (haveF2no66noF3(pfx)
   11166           && (sz == 4 || /* ignore redundant REX.W */ sz == 8) ) {
   11167          modrm = getUChar(delta);
   11168          if (epartIsReg(modrm)) {
   11169             putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
   11170                              getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
   11171             DIP("movsd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   11172                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   11173             delta += 1;
   11174          } else {
   11175             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11176             putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
   11177             putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
   11178                              loadLE(Ity_I64, mkexpr(addr)) );
   11179             DIP("movsd %s,%s\n", dis_buf,
   11180                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   11181             delta += alen;
   11182          }
   11183          goto decode_success;
   11184       }
   11185       /* F3 0F 10 = MOVSS -- move 32 bits from E (mem or lo 1/4 xmm) to G
   11186          (lo 1/4 xmm).  If E is mem, upper 3/4 of G is zeroed out. */
   11187       if (haveF3no66noF2(pfx)
   11188           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   11189          modrm = getUChar(delta);
   11190          if (epartIsReg(modrm)) {
   11191             putXMMRegLane32( gregOfRexRM(pfx,modrm), 0,
   11192                              getXMMRegLane32( eregOfRexRM(pfx,modrm), 0 ));
   11193             DIP("movss %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   11194                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   11195             delta += 1;
   11196          } else {
   11197             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11198             putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
   11199             putXMMRegLane32( gregOfRexRM(pfx,modrm), 0,
   11200                              loadLE(Ity_I32, mkexpr(addr)) );
   11201             DIP("movss %s,%s\n", dis_buf,
   11202                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   11203             delta += alen;
   11204          }
   11205          goto decode_success;
   11206       }
   11207       /* 0F 10 = MOVUPS -- move from E (mem or xmm) to G (xmm). */
   11208       if (haveNo66noF2noF3(pfx)
   11209           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   11210          modrm = getUChar(delta);
   11211          if (epartIsReg(modrm)) {
   11212             putXMMReg( gregOfRexRM(pfx,modrm),
   11213                        getXMMReg( eregOfRexRM(pfx,modrm) ));
   11214             DIP("movups %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   11215                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   11216             delta += 1;
   11217          } else {
   11218             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11219             putXMMReg( gregOfRexRM(pfx,modrm),
   11220                        loadLE(Ity_V128, mkexpr(addr)) );
   11221             DIP("movups %s,%s\n", dis_buf,
   11222                                      nameXMMReg(gregOfRexRM(pfx,modrm)));
   11223             delta += alen;
   11224          }
   11225          goto decode_success;
   11226       }
   11227       break;
   11228 
   11229    case 0x11:
   11230       /* F2 0F 11 = MOVSD -- move 64 bits from G (lo half xmm) to E (mem
   11231          or lo half xmm). */
   11232       if (haveF2no66noF3(pfx)
   11233           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   11234          modrm = getUChar(delta);
   11235          if (epartIsReg(modrm)) {
   11236             putXMMRegLane64( eregOfRexRM(pfx,modrm), 0,
   11237                              getXMMRegLane64( gregOfRexRM(pfx,modrm), 0 ));
   11238             DIP("movsd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   11239                                  nameXMMReg(eregOfRexRM(pfx,modrm)));
   11240             delta += 1;
   11241          } else {
   11242             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11243             storeLE( mkexpr(addr),
   11244                      getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) );
   11245             DIP("movsd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   11246                                  dis_buf);
   11247             delta += alen;
   11248          }
   11249          goto decode_success;
   11250       }
   11251       /* F3 0F 11 = MOVSS -- move 32 bits from G (lo 1/4 xmm) to E (mem
   11252          or lo 1/4 xmm). */
   11253       if (haveF3no66noF2(pfx) && sz == 4) {
   11254          modrm = getUChar(delta);
   11255          if (epartIsReg(modrm)) {
   11256             /* fall through, we don't yet have a test case */
   11257          } else {
   11258             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11259             storeLE( mkexpr(addr),
   11260                      getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) );
   11261             DIP("movss %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   11262                                  dis_buf);
   11263             delta += alen;
   11264             goto decode_success;
   11265          }
   11266       }
   11267       /* 66 0F 11 = MOVUPD -- move from G (xmm) to E (mem or xmm). */
   11268       if (have66noF2noF3(pfx)
   11269           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   11270          modrm = getUChar(delta);
   11271          if (epartIsReg(modrm)) {
   11272             putXMMReg( eregOfRexRM(pfx,modrm),
   11273    		    getXMMReg( gregOfRexRM(pfx,modrm) ) );
   11274             DIP("movupd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   11275    	                       nameXMMReg(eregOfRexRM(pfx,modrm)));
   11276             delta += 1;
   11277          } else {
   11278             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11279             storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   11280             DIP("movupd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   11281                                   dis_buf );
   11282             delta += alen;
   11283          }
   11284          goto decode_success;
   11285       }
   11286       /* 0F 11 = MOVUPS -- move from G (xmm) to E (mem or xmm). */
   11287       if (haveNo66noF2noF3(pfx)
   11288           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   11289          modrm = getUChar(delta);
   11290          if (epartIsReg(modrm)) {
   11291             /* fall through; awaiting test case */
   11292          } else {
   11293             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11294             storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   11295             DIP("movups %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   11296                                   dis_buf );
   11297             delta += alen;
   11298             goto decode_success;
   11299          }
   11300       }
   11301       break;
   11302 
   11303    case 0x12:
   11304       /* 66 0F 12 = MOVLPD -- move from mem to low half of XMM. */
   11305       /* Identical to MOVLPS ? */
   11306       if (have66noF2noF3(pfx)
   11307           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   11308          modrm = getUChar(delta);
   11309          if (epartIsReg(modrm)) {
   11310             /* fall through; apparently reg-reg is not possible */
   11311          } else {
   11312             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11313             delta += alen;
   11314             putXMMRegLane64( gregOfRexRM(pfx,modrm),
   11315                              0/*lower lane*/,
   11316                              loadLE(Ity_I64, mkexpr(addr)) );
   11317             DIP("movlpd %s, %s\n",
   11318                 dis_buf, nameXMMReg( gregOfRexRM(pfx,modrm) ));
   11319             goto decode_success;
   11320          }
   11321       }
   11322       /* 0F 12 = MOVLPS -- move from mem to low half of XMM. */
      /* 0F 12 = MOVHLPS -- move from hi half to lo half of XMM. */
   11324       if (haveNo66noF2noF3(pfx)
   11325           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   11326          modrm = getUChar(delta);
   11327          if (epartIsReg(modrm)) {
   11328             delta += 1;
   11329             putXMMRegLane64( gregOfRexRM(pfx,modrm),
   11330                              0/*lower lane*/,
   11331                              getXMMRegLane64( eregOfRexRM(pfx,modrm), 1 ));
   11332             DIP("movhlps %s, %s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   11333                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   11334          } else {
   11335             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11336             delta += alen;
   11337             putXMMRegLane64( gregOfRexRM(pfx,modrm),  0/*lower lane*/,
   11338                              loadLE(Ity_I64, mkexpr(addr)) );
   11339             DIP("movlps %s, %s\n",
   11340                 dis_buf, nameXMMReg( gregOfRexRM(pfx,modrm) ));
   11341          }
   11342          goto decode_success;
   11343       }
   11344       break;
   11345 
   11346    case 0x13:
   11347       /* 0F 13 = MOVLPS -- move from low half of XMM to mem. */
   11348       if (haveNo66noF2noF3(pfx)
   11349           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   11350          modrm = getUChar(delta);
   11351          if (!epartIsReg(modrm)) {
   11352             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11353             delta += alen;
   11354             storeLE( mkexpr(addr),
   11355                      getXMMRegLane64( gregOfRexRM(pfx,modrm),
   11356                                       0/*lower lane*/ ) );
   11357             DIP("movlps %s, %s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
   11358                                    dis_buf);
   11359             goto decode_success;
   11360          }
   11361          /* else fall through */
   11362       }
   11363       /* 66 0F 13 = MOVLPD -- move from low half of XMM to mem. */
   11364       /* Identical to MOVLPS ? */
   11365       if (have66noF2noF3(pfx)
   11366           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   11367          modrm = getUChar(delta);
   11368          if (!epartIsReg(modrm)) {
   11369             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11370             delta += alen;
   11371             storeLE( mkexpr(addr),
   11372                      getXMMRegLane64( gregOfRexRM(pfx,modrm),
   11373                                       0/*lower lane*/ ) );
   11374             DIP("movlpd %s, %s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
   11375                                    dis_buf);
   11376             goto decode_success;
   11377          }
   11378          /* else fall through */
   11379       }
   11380       break;
   11381 
   11382    case 0x14:
   11383    case 0x15:
   11384       /* 0F 14 = UNPCKLPS -- unpack and interleave low part F32s */
   11385       /* 0F 15 = UNPCKHPS -- unpack and interleave high part F32s */
   11386       /* These just appear to be special cases of SHUFPS */
   11387       if (haveNo66noF2noF3(pfx) && sz == 4) {
   11388          Bool   hi = toBool(opc == 0x15);
   11389          IRTemp sV = newTemp(Ity_V128);
   11390          IRTemp dV = newTemp(Ity_V128);
   11391          modrm = getUChar(delta);
   11392          UInt   rG = gregOfRexRM(pfx,modrm);
   11393          assign( dV, getXMMReg(rG) );
   11394          if (epartIsReg(modrm)) {
   11395             UInt rE = eregOfRexRM(pfx,modrm);
   11396             assign( sV, getXMMReg(rE) );
   11397             delta += 1;
   11398             DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   11399                 nameXMMReg(rE), nameXMMReg(rG));
   11400          } else {
   11401             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11402             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11403             delta += alen;
   11404             DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   11405                 dis_buf, nameXMMReg(rG));
   11406          }
   11407          IRTemp res = math_UNPCKxPS_128( sV, dV, hi );
   11408          putXMMReg( rG, mkexpr(res) );
   11409          goto decode_success;
   11410       }
   11411       /* 66 0F 15 = UNPCKHPD -- unpack and interleave high part F64s */
   11412       /* 66 0F 14 = UNPCKLPD -- unpack and interleave low part F64s */
   11413       /* These just appear to be special cases of SHUFPS */
   11414       if (have66noF2noF3(pfx)
   11415           && sz == 2 /* could be 8 if rex also present */) {
   11416          Bool   hi = toBool(opc == 0x15);
   11417          IRTemp sV = newTemp(Ity_V128);
   11418          IRTemp dV = newTemp(Ity_V128);
   11419          modrm = getUChar(delta);
   11420          UInt   rG = gregOfRexRM(pfx,modrm);
   11421          assign( dV, getXMMReg(rG) );
   11422          if (epartIsReg(modrm)) {
   11423             UInt rE = eregOfRexRM(pfx,modrm);
   11424             assign( sV, getXMMReg(rE) );
   11425             delta += 1;
   11426             DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   11427                 nameXMMReg(rE), nameXMMReg(rG));
   11428          } else {
   11429             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11430             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11431             delta += alen;
   11432             DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   11433                 dis_buf, nameXMMReg(rG));
   11434          }
   11435          IRTemp res = math_UNPCKxPD_128( sV, dV, hi );
   11436          putXMMReg( rG, mkexpr(res) );
   11437          goto decode_success;
   11438       }
   11439       break;
   11440 
   11441    case 0x16:
   11442       /* 66 0F 16 = MOVHPD -- move from mem to high half of XMM. */
      /* This seems identical to MOVHPS.  This instruction encoding is
   11444          completely crazy. */
   11445       if (have66noF2noF3(pfx)
   11446           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   11447          modrm = getUChar(delta);
   11448          if (epartIsReg(modrm)) {
   11449             /* fall through; apparently reg-reg is not possible */
   11450          } else {
   11451             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11452             delta += alen;
   11453             putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
   11454                              loadLE(Ity_I64, mkexpr(addr)) );
   11455             DIP("movhpd %s,%s\n", dis_buf,
   11456                                   nameXMMReg( gregOfRexRM(pfx,modrm) ));
   11457             goto decode_success;
   11458          }
   11459       }
   11460       /* 0F 16 = MOVHPS -- move from mem to high half of XMM. */
   11461       /* 0F 16 = MOVLHPS -- move from lo half to hi half of XMM. */
   11462       if (haveNo66noF2noF3(pfx)
   11463           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   11464          modrm = getUChar(delta);
   11465          if (epartIsReg(modrm)) {
   11466             delta += 1;
   11467             putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
   11468                              getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ) );
   11469             DIP("movhps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   11470                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   11471          } else {
   11472             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11473             delta += alen;
   11474             putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
   11475                              loadLE(Ity_I64, mkexpr(addr)) );
   11476             DIP("movhps %s,%s\n", dis_buf,
   11477                                   nameXMMReg( gregOfRexRM(pfx,modrm) ));
   11478          }
   11479          goto decode_success;
   11480       }
   11481       break;
   11482 
   case 0x17:
      /* 0F 17 = MOVHPS -- move from high half of XMM to mem. */
      /* Memory-destination form only; if E is a register, neither of
         the guards below fires and we fall out to decode failure. */
      if (haveNo66noF2noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (!epartIsReg(modrm)) {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            /* Store lane 1 (the upper 64 bits) of xmm(G) to memory. */
            storeLE( mkexpr(addr),
                     getXMMRegLane64( gregOfRexRM(pfx,modrm),
                                      1/*upper lane*/ ) );
            DIP("movhps %s,%s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
                                  dis_buf);
            goto decode_success;
         }
         /* else fall through */
      }
      /* 66 0F 17 = MOVHPD -- move from high half of XMM to mem. */
      /* Again, this seems identical to MOVHPS. */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (!epartIsReg(modrm)) {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            storeLE( mkexpr(addr),
                     getXMMRegLane64( gregOfRexRM(pfx,modrm),
                                      1/*upper lane*/ ) );
            DIP("movhpd %s,%s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
                                  dis_buf);
            goto decode_success;
         }
         /* else fall through */
      }
      break;
   11518 
   11519    case 0x18:
   11520       /* 0F 18 /0 = PREFETCHNTA -- prefetch into caches, */
   11521       /* 0F 18 /1 = PREFETCH0   -- with various different hints */
   11522       /* 0F 18 /2 = PREFETCH1 */
   11523       /* 0F 18 /3 = PREFETCH2 */
   11524       if (haveNo66noF2noF3(pfx)
   11525           && !epartIsReg(getUChar(delta))
   11526           && gregLO3ofRM(getUChar(delta)) >= 0
   11527           && gregLO3ofRM(getUChar(delta)) <= 3) {
   11528          HChar* hintstr = "??";
   11529 
   11530          modrm = getUChar(delta);
   11531          vassert(!epartIsReg(modrm));
   11532 
   11533          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11534          delta += alen;
   11535 
   11536          switch (gregLO3ofRM(modrm)) {
   11537             case 0: hintstr = "nta"; break;
   11538             case 1: hintstr = "t0"; break;
   11539             case 2: hintstr = "t1"; break;
   11540             case 3: hintstr = "t2"; break;
   11541             default: vassert(0);
   11542          }
   11543 
   11544          DIP("prefetch%s %s\n", hintstr, dis_buf);
   11545          goto decode_success;
   11546       }
   11547       break;
   11548 
   case 0x28:
      /* 66 0F 28 = MOVAPD -- move from E (mem or xmm) to G (xmm). */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            /* reg-reg form: plain 128-bit register copy. */
            putXMMReg( gregOfRexRM(pfx,modrm),
                       getXMMReg( eregOfRexRM(pfx,modrm) ));
            DIP("movapd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            /* MOVAPD's memory operand must be 16-aligned; fault if not. */
            gen_SEGV_if_not_16_aligned( addr );
            putXMMReg( gregOfRexRM(pfx,modrm),
                       loadLE(Ity_V128, mkexpr(addr)) );
            DIP("movapd %s,%s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += alen;
         }
         goto decode_success;
      }
      /* 0F 28 = MOVAPS -- move from E (mem or xmm) to G (xmm). */
      if (haveNo66noF2noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            putXMMReg( gregOfRexRM(pfx,modrm),
                       getXMMReg( eregOfRexRM(pfx,modrm) ));
            DIP("movaps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            /* MOVAPS's memory operand must be 16-aligned; fault if not. */
            gen_SEGV_if_not_16_aligned( addr );
            putXMMReg( gregOfRexRM(pfx,modrm),
                       loadLE(Ity_V128, mkexpr(addr)) );
            DIP("movaps %s,%s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += alen;
         }
         goto decode_success;
      }
      break;
   11593 
   11594    case 0x29:
   11595       /* 0F 29 = MOVAPS -- move from G (xmm) to E (mem or xmm). */
   11596       if (haveNo66noF2noF3(pfx)
   11597           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   11598          modrm = getUChar(delta);
   11599          if (epartIsReg(modrm)) {
   11600             /* fall through; awaiting test case */
   11601             putXMMReg( eregOfRexRM(pfx,modrm),
   11602                        getXMMReg( gregOfRexRM(pfx,modrm) ));
   11603             DIP("movaps %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   11604                                   nameXMMReg(eregOfRexRM(pfx,modrm)));
   11605             delta += 1;
   11606          } else {
   11607             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11608             gen_SEGV_if_not_16_aligned( addr );
   11609             storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   11610             DIP("movaps %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   11611                                   dis_buf );
   11612             delta += alen;
   11613          }
   11614          goto decode_success;
   11615       }
   11616       /* 66 0F 29 = MOVAPD -- move from G (xmm) to E (mem or xmm). */
   11617       if (have66noF2noF3(pfx)
   11618           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   11619          modrm = getUChar(delta);
   11620          if (epartIsReg(modrm)) {
   11621             putXMMReg( eregOfRexRM(pfx,modrm),
   11622    		    getXMMReg( gregOfRexRM(pfx,modrm) ) );
   11623             DIP("movapd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   11624    	                       nameXMMReg(eregOfRexRM(pfx,modrm)));
   11625             delta += 1;
   11626          } else {
   11627             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11628             gen_SEGV_if_not_16_aligned( addr );
   11629             storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   11630             DIP("movapd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   11631                                   dis_buf );
   11632             delta += alen;
   11633          }
   11634          goto decode_success;
   11635       }
   11636       break;
   11637 
   case 0x2A:
      /* 0F 2A = CVTPI2PS -- convert 2 x I32 in mem/mmx to 2 x F32 in low
         half xmm */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         IRTemp arg64 = newTemp(Ity_I64);
         IRTemp rmode = newTemp(Ity_I32);

         modrm = getUChar(delta);
         /* MMX source involved, so switch the FPU into MMX mode. */
         do_MMX_preamble();
         if (epartIsReg(modrm)) {
            assign( arg64, getMMXReg(eregLO3ofRM(modrm)) );
            delta += 1;
            DIP("cvtpi2ps %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
            delta += alen;
            DIP("cvtpi2ps %s,%s\n", dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)) );
         }

         assign( rmode, get_sse_roundingmode() );

         /* Convert each I32 half via F64; only the final F64->F32 step
            takes a rounding mode (Iop_I32StoF64 needs none). */
         putXMMRegLane32F(
            gregOfRexRM(pfx,modrm), 0,
            binop(Iop_F64toF32,
                  mkexpr(rmode),
                  unop(Iop_I32StoF64,
                       unop(Iop_64to32, mkexpr(arg64)) )) );

         putXMMRegLane32F(
            gregOfRexRM(pfx,modrm), 1,
            binop(Iop_F64toF32,
                  mkexpr(rmode),
                  unop(Iop_I32StoF64,
                       unop(Iop_64HIto32, mkexpr(arg64)) )) );

         goto decode_success;
      }
      /* F3 0F 2A = CVTSI2SS
         -- sz==4: convert I32 in mem/ireg to F32 in low quarter xmm
         -- sz==8: convert I64 in mem/ireg to F32 in low quarter xmm */
      if (haveF3no66noF2(pfx) && (sz == 4 || sz == 8)) {
         IRTemp rmode = newTemp(Ity_I32);
         assign( rmode, get_sse_roundingmode() );
         modrm = getUChar(delta);
         if (sz == 4) {
            IRTemp arg32 = newTemp(Ity_I32);
            if (epartIsReg(modrm)) {
               assign( arg32, getIReg32(eregOfRexRM(pfx,modrm)) );
               delta += 1;
               DIP("cvtsi2ss %s,%s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
                                       nameXMMReg(gregOfRexRM(pfx,modrm)));
            } else {
               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
               assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
               delta += alen;
               DIP("cvtsi2ss %s,%s\n", dis_buf,
                                       nameXMMReg(gregOfRexRM(pfx,modrm)) );
            }
            putXMMRegLane32F(
               gregOfRexRM(pfx,modrm), 0,
               binop(Iop_F64toF32,
                     mkexpr(rmode),
                     unop(Iop_I32StoF64, mkexpr(arg32)) ) );
         } else {
            /* sz == 8 */
            IRTemp arg64 = newTemp(Ity_I64);
            if (epartIsReg(modrm)) {
               assign( arg64, getIReg64(eregOfRexRM(pfx,modrm)) );
               delta += 1;
               DIP("cvtsi2ssq %s,%s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
            } else {
               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
               assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
               delta += alen;
               DIP("cvtsi2ssq %s,%s\n", dis_buf,
                                        nameXMMReg(gregOfRexRM(pfx,modrm)) );
            }
            /* I64->F64 can itself round, so Iop_I64StoF64 takes rmode,
               as does the final F64->F32 narrowing. */
            putXMMRegLane32F(
               gregOfRexRM(pfx,modrm), 0,
               binop(Iop_F64toF32,
                     mkexpr(rmode),
                     binop(Iop_I64StoF64, mkexpr(rmode), mkexpr(arg64)) ) );
         }
         goto decode_success;
      }
      /* F2 0F 2A = CVTSI2SD
         when sz==4 -- convert I32 in mem/ireg to F64 in low half xmm
         when sz==8 -- convert I64 in mem/ireg to F64 in low half xmm
      */
      if (haveF2no66noF3(pfx) && (sz == 4 || sz == 8)) {
         modrm = getUChar(delta);
         if (sz == 4) {
            IRTemp arg32 = newTemp(Ity_I32);
            if (epartIsReg(modrm)) {
               assign( arg32, getIReg32(eregOfRexRM(pfx,modrm)) );
               delta += 1;
               DIP("cvtsi2sdl %s,%s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
            } else {
               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
               assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
               delta += alen;
               DIP("cvtsi2sdl %s,%s\n", dis_buf,
                                        nameXMMReg(gregOfRexRM(pfx,modrm)) );
            }
            /* I32 -> F64 needs no rounding mode. */
            putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
                              unop(Iop_I32StoF64, mkexpr(arg32))
            );
         } else {
            /* sz == 8 */
            IRTemp arg64 = newTemp(Ity_I64);
            if (epartIsReg(modrm)) {
               assign( arg64, getIReg64(eregOfRexRM(pfx,modrm)) );
               delta += 1;
               DIP("cvtsi2sdq %s,%s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
            } else {
               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
               assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
               delta += alen;
               DIP("cvtsi2sdq %s,%s\n", dis_buf,
                                        nameXMMReg(gregOfRexRM(pfx,modrm)) );
            }
            putXMMRegLane64F(
               gregOfRexRM(pfx,modrm),
               0,
               binop( Iop_I64StoF64,
                      get_sse_roundingmode(),
                      mkexpr(arg64)
               )
            );
         }
         goto decode_success;
      }
      /* 66 0F 2A = CVTPI2PD -- convert 2 x I32 in mem/mmx to 2 x F64 in
         xmm(G) */
      if (have66noF2noF3(pfx) && sz == 2) {
         IRTemp arg64 = newTemp(Ity_I64);

         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            /* Only switch to MMX mode if the source is a MMX register.
               This is inconsistent with all other instructions which
               convert between XMM and (M64 or MMX), which always switch
               to MMX mode even if 64-bit operand is M64 and not MMX.  At
               least, that's what the Intel docs seem to me to say.
               Fixes #210264. */
            do_MMX_preamble();
            assign( arg64, getMMXReg(eregLO3ofRM(modrm)) );
            delta += 1;
            DIP("cvtpi2pd %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
            delta += alen;
            DIP("cvtpi2pd %s,%s\n", dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)) );
         }

         /* I32 -> F64 is a widening conversion: no rounding mode needed. */
         putXMMRegLane64F(
            gregOfRexRM(pfx,modrm), 0,
            unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)) )
         );

         putXMMRegLane64F(
            gregOfRexRM(pfx,modrm), 1,
            unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)) )
         );

         goto decode_success;
      }
      break;
   11815 
   case 0x2B:
      /* 66 0F 2B = MOVNTPD -- for us, just a plain SSE store. */
      /* 0F 2B = MOVNTPS -- for us, just a plain SSE store. */
      /* The non-temporal cache hint is ignored; only the 16-alignment
         requirement of the real insn is preserved. */
      if ( (haveNo66noF2noF3(pfx) && sz == 4)
           || (have66noF2noF3(pfx) && sz == 2) ) {
         modrm = getUChar(delta);
         if (!epartIsReg(modrm)) {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movntp%s %s,%s\n", sz==2 ? "d" : "s",
                                    dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += alen;
            goto decode_success;
         }
         /* else fall through */
      }
      break;
   11835 
   11836    case 0x2C:
   11837    case 0x2D:
   11838       /* 0F 2D = CVTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
   11839          I32 in mmx, according to prevailing SSE rounding mode */
   11840       /* 0F 2C = CVTTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
   11841          I32 in mmx, rounding towards zero */
   11842       if (haveNo66noF2noF3(pfx) && sz == 4) {
   11843          IRTemp dst64  = newTemp(Ity_I64);
   11844          IRTemp rmode  = newTemp(Ity_I32);
   11845          IRTemp f32lo  = newTemp(Ity_F32);
   11846          IRTemp f32hi  = newTemp(Ity_F32);
   11847          Bool   r2zero = toBool(opc == 0x2C);
   11848 
   11849          do_MMX_preamble();
   11850          modrm = getUChar(delta);
   11851 
   11852          if (epartIsReg(modrm)) {
   11853             delta += 1;
   11854             assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
   11855             assign(f32hi, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 1));
   11856             DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
   11857                                       nameXMMReg(eregOfRexRM(pfx,modrm)),
   11858                                       nameMMXReg(gregLO3ofRM(modrm)));
   11859          } else {
   11860             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11861             assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
   11862             assign(f32hi, loadLE(Ity_F32, binop( Iop_Add64,
   11863                                                  mkexpr(addr),
   11864                                                  mkU64(4) )));
   11865             delta += alen;
   11866             DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
   11867                                       dis_buf,
   11868                                       nameMMXReg(gregLO3ofRM(modrm)));
   11869          }
   11870 
   11871          if (r2zero) {
   11872             assign(rmode, mkU32((UInt)Irrm_ZERO) );
   11873          } else {
   11874             assign( rmode, get_sse_roundingmode() );
   11875          }
   11876 
   11877          assign(
   11878             dst64,
   11879             binop( Iop_32HLto64,
   11880                    binop( Iop_F64toI32S,
   11881                           mkexpr(rmode),
   11882                           unop( Iop_F32toF64, mkexpr(f32hi) ) ),
   11883                    binop( Iop_F64toI32S,
   11884                           mkexpr(rmode),
   11885                           unop( Iop_F32toF64, mkexpr(f32lo) ) )
   11886                  )
   11887          );
   11888 
   11889          putMMXReg(gregLO3ofRM(modrm), mkexpr(dst64));
   11890          goto decode_success;
   11891       }
   11892       /* F3 0F 2D = CVTSS2SI
   11893          when sz==4 -- convert F32 in mem/low quarter xmm to I32 in ireg,
   11894                        according to prevailing SSE rounding mode
   11895          when sz==8 -- convert F32 in mem/low quarter xmm to I64 in ireg,
   11896                        according to prevailing SSE rounding mode
   11897       */
   11898       /* F3 0F 2C = CVTTSS2SI
   11899          when sz==4 -- convert F32 in mem/low quarter xmm to I32 in ireg,
   11900                        truncating towards zero
   11901          when sz==8 -- convert F32 in mem/low quarter xmm to I64 in ireg,
   11902                        truncating towards zero
   11903       */
   11904       if (haveF3no66noF2(pfx) && (sz == 4 || sz == 8)) {
   11905          delta = dis_CVTxSS2SI( vbi, pfx, delta, False/*!isAvx*/, opc, sz);
   11906          goto decode_success;
   11907       }
   11908       /* F2 0F 2D = CVTSD2SI
   11909          when sz==4 -- convert F64 in mem/low half xmm to I32 in ireg,
   11910                        according to prevailing SSE rounding mode
   11911          when sz==8 -- convert F64 in mem/low half xmm to I64 in ireg,
   11912                        according to prevailing SSE rounding mode
   11913       */
   11914       /* F2 0F 2C = CVTTSD2SI
   11915          when sz==4 -- convert F64 in mem/low half xmm to I32 in ireg,
   11916                        truncating towards zero
   11917          when sz==8 -- convert F64 in mem/low half xmm to I64 in ireg,
   11918                        truncating towards zero
   11919       */
   11920       if (haveF2no66noF3(pfx) && (sz == 4 || sz == 8)) {
   11921          delta = dis_CVTxSD2SI( vbi, pfx, delta, False/*!isAvx*/, opc, sz);
   11922          goto decode_success;
   11923       }
   11924       /* 66 0F 2D = CVTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
   11925          I32 in mmx, according to prevailing SSE rounding mode */
   11926       /* 66 0F 2C = CVTTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
   11927          I32 in mmx, rounding towards zero */
   11928       if (have66noF2noF3(pfx) && sz == 2) {
   11929          IRTemp dst64  = newTemp(Ity_I64);
   11930          IRTemp rmode  = newTemp(Ity_I32);
   11931          IRTemp f64lo  = newTemp(Ity_F64);
   11932          IRTemp f64hi  = newTemp(Ity_F64);
   11933          Bool   r2zero = toBool(opc == 0x2C);
   11934 
   11935          do_MMX_preamble();
   11936          modrm = getUChar(delta);
   11937 
   11938          if (epartIsReg(modrm)) {
   11939             delta += 1;
   11940             assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
   11941             assign(f64hi, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 1));
   11942             DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
   11943                                       nameXMMReg(eregOfRexRM(pfx,modrm)),
   11944                                       nameMMXReg(gregLO3ofRM(modrm)));
   11945          } else {
   11946             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11947             assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
   11948             assign(f64hi, loadLE(Ity_F64, binop( Iop_Add64,
   11949                                                  mkexpr(addr),
   11950                                                  mkU64(8) )));
   11951             delta += alen;
   11952             DIP("cvt%spf2pi %s,%s\n", r2zero ? "t" : "",
   11953                                       dis_buf,
   11954                                       nameMMXReg(gregLO3ofRM(modrm)));
   11955          }
   11956 
   11957          if (r2zero) {
   11958             assign(rmode, mkU32((UInt)Irrm_ZERO) );
   11959          } else {
   11960             assign( rmode, get_sse_roundingmode() );
   11961          }
   11962 
   11963          assign(
   11964             dst64,
   11965             binop( Iop_32HLto64,
   11966                    binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64hi) ),
   11967                    binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo) )
   11968                  )
   11969          );
   11970 
   11971          putMMXReg(gregLO3ofRM(modrm), mkexpr(dst64));
   11972          goto decode_success;
   11973       }
   11974       break;
   11975 
   case 0x2E:
   case 0x2F:
      /* 66 0F 2E = UCOMISD -- 64F0x2 comparison G,E, and set ZCP */
      /* 66 0F 2F = COMISD  -- 64F0x2 comparison G,E, and set ZCP */
      /* 2E vs 2F (unordered vs ordered) is distinguished inside the
         helper via opc. */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_COMISD( vbi, pfx, delta, False/*!isAvx*/, opc );
         goto decode_success;
      }
      /* 0F 2E = UCOMISS -- 32F0x4 comparison G,E, and set ZCP */
      /* 0F 2F = COMISS  -- 32F0x4 comparison G,E, and set ZCP */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_COMISS( vbi, pfx, delta, False/*!isAvx*/, opc );
         goto decode_success;
      }
      break;
   11991 
   case 0x50:
      /* 0F 50 = MOVMSKPS - move 4 sign bits from 4 x F32 in xmm(E)
         to 4 lowest bits of ireg(G) */
      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
          && epartIsReg(getUChar(delta))) {
         /* sz == 8 is a kludge to handle insns with REX.W redundantly
            set to 1, which has been known to happen:

            4c 0f 50 d9             rex64X movmskps %xmm1,%r11d

            20071106: Intel docs say that REX.W isn't redundant: when
            present, a 64-bit register is written; when not present, only
            the 32-bit half is written.  However, testing on a Core2
            machine suggests the entire 64 bit register is written
            irrespective of the status of REX.W.  That could be because
            of the default rule that says "if the lower half of a 32-bit
            register is written, the upper half is zeroed".  By using
            putIReg32 here we inadvertantly produce the same behaviour as
            the Core2, for the same reason -- putIReg32 implements said
            rule.

            AMD docs give no indication that REX.W is even valid for this
            insn. */
         delta = dis_MOVMSKPS_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      /* 66 0F 50 = MOVMSKPD - move 2 sign bits from 2 x F64 in xmm(E) to
         2 lowest bits of ireg(G) */
      if (have66noF2noF3(pfx) && (sz == 2 || sz == 8)) {
         /* sz == 8 is a kludge to handle insns with REX.W redundantly
            set to 1, which has been known to happen:
            66 4c 0f 50 d9          rex64X movmskpd %xmm1,%r11d
            20071106: see further comments on MOVMSKPS implementation above.
         */
         delta = dis_MOVMSKPD_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;
   12030 
   case 0x51:
      /* F3 0F 51 = SQRTSS -- approx sqrt 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
                                            "sqrtss", Iop_Sqrt32F0x4 );
         goto decode_success;
      }
      /* 0F 51 = SQRTPS -- approx sqrt 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
                                           "sqrtps", Iop_Sqrt32Fx4 );
         goto decode_success;
      }
      /* F2 0F 51 = SQRTSD -- approx sqrt 64F0x2 from R/M to R */
      /* sz stays 4 here: F2 is a repne prefix, not an operand-size
         override, so it does not change sz. */
      if (haveF2no66noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_lo64( vbi, pfx, delta,
                                            "sqrtsd", Iop_Sqrt64F0x2 );
         goto decode_success;
      }
      /* 66 0F 51 = SQRTPD -- approx sqrt 64Fx2 from R/M to R */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
                                           "sqrtpd", Iop_Sqrt64Fx2 );
         goto decode_success;
      }
      break;
   12057 
   case 0x52:
      /* F3 0F 52 = RSQRTSS -- approx reciprocal sqrt 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
                                            "rsqrtss", Iop_RSqrt32F0x4 );
         goto decode_success;
      }
      /* 0F 52 = RSQRTPS -- approx reciprocal sqrt 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
                                           "rsqrtps", Iop_RSqrt32Fx4 );
         goto decode_success;
      }
      /* No 66- or F2-prefixed forms exist for 0F 52. */
      break;
   12072 
   case 0x53:
      /* F3 0F 53 = RCPSS -- approx reciprocal 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
                                            "rcpss", Iop_Recip32F0x4 );
         goto decode_success;
      }
      /* 0F 53 = RCPPS -- approx reciprocal 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
                                           "rcpps", Iop_Recip32Fx4 );
         goto decode_success;
      }
      /* No 66- or F2-prefixed forms exist for 0F 53. */
      break;
   12087 
   case 0x54:
      /* 0F 54 = ANDPS -- G = G and E */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "andps", Iop_AndV128 );
         goto decode_success;
      }
      /* 66 0F 54 = ANDPD -- G = G and E */
      /* Bitwise AND is type-agnostic, so both PS and PD variants map to
         the same Iop_AndV128. */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "andpd", Iop_AndV128 );
         goto decode_success;
      }
      break;
   12100 
   case 0x55:
      /* 0F 55 = ANDNPS -- G = (not G) and E */
      /* The "invG" helper variant complements G before ANDing with E. */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "andnps",
                                                           Iop_AndV128 );
         goto decode_success;
      }
      /* 66 0F 55 = ANDNPD -- G = (not G) and E */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "andnpd",
                                                           Iop_AndV128 );
         goto decode_success;
      }
      break;
   12115 
   case 0x56:
      /* 0F 56 = ORPS -- G = G or E */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "orps", Iop_OrV128 );
         goto decode_success;
      }
      /* 66 0F 56 = ORPD -- G = G or E */
      /* Bitwise OR is type-agnostic, so both variants use Iop_OrV128. */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "orpd", Iop_OrV128 );
         goto decode_success;
      }
      break;
   12128 
   case 0x57:
      /* 66 0F 57 = XORPD -- G = G xor E */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "xorpd", Iop_XorV128 );
         goto decode_success;
      }
      /* 0F 57 = XORPS -- G = G xor E */
      /* Bitwise XOR is type-agnostic, so both variants use Iop_XorV128. */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "xorps", Iop_XorV128 );
         goto decode_success;
      }
      break;
   12141 
   case 0x58:
      /* 0F 58 = ADDPS -- add 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "addps", Iop_Add32Fx4 );
         goto decode_success;
      }
      /* F3 0F 58 = ADDSS -- add 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "addss", Iop_Add32F0x4 );
         goto decode_success;
      }
      /* F2 0F 58 = ADDSD -- add 64F0x2 from R/M to R */
      if (haveF2no66noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "addsd", Iop_Add64F0x2 );
         goto decode_success;
      }
      /* 66 0F 58 = ADDPD -- add 64Fx2 from R/M to R */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "addpd", Iop_Add64Fx2 );
         goto decode_success;
      }
      break;
   12166 
   case 0x59:
      /* F2 0F 59 = MULSD -- mul 64F0x2 from R/M to R */
      if (haveF2no66noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "mulsd", Iop_Mul64F0x2 );
         goto decode_success;
      }
      /* F3 0F 59 = MULSS -- mul 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "mulss", Iop_Mul32F0x4 );
         goto decode_success;
      }
      /* 0F 59 = MULPS -- mul 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "mulps", Iop_Mul32Fx4 );
         goto decode_success;
      }
      /* 66 0F 59 = MULPD -- mul 64Fx2 from R/M to R */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "mulpd", Iop_Mul64Fx2 );
         goto decode_success;
      }
      break;
   12191 
   case 0x5A:
      /* 0F 5A = CVTPS2PD -- convert 2 x F32 in low half mem/xmm to 2 x
         F64 in xmm(G). */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_CVTPS2PD_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      /* F3 0F 5A = CVTSS2SD -- convert F32 in mem/low 1/4 xmm to F64 in
         low half xmm(G) */
      if (haveF3no66noF2(pfx) && sz == 4) {
         IRTemp f32lo = newTemp(Ity_F32);

         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta += 1;
            assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
            DIP("cvtss2sd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
            delta += alen;
            DIP("cvtss2sd %s,%s\n", dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         }

         /* Widening F32->F64 is exact, hence no rounding-mode operand
            on the IR op. */
         putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
                           unop( Iop_F32toF64, mkexpr(f32lo) ) );

         goto decode_success;
      }
      /* F2 0F 5A = CVTSD2SS -- convert F64 in mem/low half xmm to F32 in
         low 1/4 xmm(G), according to prevailing SSE rounding mode */
      if (haveF2no66noF3(pfx) && sz == 4) {
         IRTemp rmode = newTemp(Ity_I32);
         IRTemp f64lo = newTemp(Ity_F64);

         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta += 1;
            assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
            DIP("cvtsd2ss %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
            delta += alen;
            DIP("cvtsd2ss %s,%s\n", dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         }

         /* Narrowing F64->F32 can round, so pass the current SSE
            rounding mode to the IR op. */
         assign( rmode, get_sse_roundingmode() );
         putXMMRegLane32F(
            gregOfRexRM(pfx,modrm), 0,
            binop( Iop_F64toF32, mkexpr(rmode), mkexpr(f64lo) )
         );

         goto decode_success;
      }
      /* 66 0F 5A = CVTPD2PS -- convert 2 x F64 in mem/xmm to 2 x F32 in
         lo half xmm(G), rounding according to prevailing SSE rounding
         mode, and zero upper half */
      /* Note, this is practically identical to CVTPD2DQ.  It would have
         been nice to merge them together. */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_CVTPD2PS_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;
   12261 
   case 0x5B:
      /* F3 0F 5B = CVTTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
         xmm(G), rounding towards zero */
      /* 66 0F 5B = CVTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
         xmm(G), as per the prevailing rounding mode */
      if ( (have66noF2noF3(pfx) && sz == 2)
           || (haveF3no66noF2(pfx) && sz == 4) ) {
         /* sz==4 here can only have come from the F3-prefixed (truncating)
            form, since the 66-prefixed form forces sz==2. */
         Bool r2zero = toBool(sz == 4); // FIXME -- unreliable (???)
         delta = dis_CVTxPS2DQ_128( vbi, pfx, delta, False/*!isAvx*/, r2zero );
         goto decode_success;
      }
      /* 0F 5B = CVTDQ2PS -- convert 4 x I32 in mem/xmm to 4 x F32 in
         xmm(G) */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_CVTDQ2PS_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;
   12280 
   12281    case 0x5C:
   12282       /* F3 0F 5C = SUBSS -- sub 32F0x4 from R/M to R */
   12283       if (haveF3no66noF2(pfx) && sz == 4) {
   12284          delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "subss", Iop_Sub32F0x4 );
   12285          goto decode_success;
   12286       }
   12287       /* F2 0F 5C = SUBSD -- sub 64F0x2 from R/M to R */
   12288       if (haveF2no66noF3(pfx)
   12289           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12290          delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "subsd", Iop_Sub64F0x2 );
   12291          goto decode_success;
   12292       }
   12293       /* 0F 5C = SUBPS -- sub 32Fx4 from R/M to R */
   12294       if (haveNo66noF2noF3(pfx) && sz == 4) {
   12295          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "subps", Iop_Sub32Fx4 );
   12296          goto decode_success;
   12297       }
   12298       /* 66 0F 5C = SUBPD -- sub 64Fx2 from R/M to R */
   12299       if (have66noF2noF3(pfx) && sz == 2) {
   12300          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "subpd", Iop_Sub64Fx2 );
   12301          goto decode_success;
   12302       }
   12303       break;
   12304 
   12305    case 0x5D:
   12306       /* 0F 5D = MINPS -- min 32Fx4 from R/M to R */
   12307       if (haveNo66noF2noF3(pfx) && sz == 4) {
   12308          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "minps", Iop_Min32Fx4 );
   12309          goto decode_success;
   12310       }
   12311       /* F3 0F 5D = MINSS -- min 32F0x4 from R/M to R */
   12312       if (haveF3no66noF2(pfx) && sz == 4) {
   12313          delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "minss", Iop_Min32F0x4 );
   12314          goto decode_success;
   12315       }
   12316       /* F2 0F 5D = MINSD -- min 64F0x2 from R/M to R */
   12317       if (haveF2no66noF3(pfx) && sz == 4) {
   12318          delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "minsd", Iop_Min64F0x2 );
   12319          goto decode_success;
   12320       }
   12321       /* 66 0F 5D = MINPD -- min 64Fx2 from R/M to R */
   12322       if (have66noF2noF3(pfx) && sz == 2) {
   12323          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "minpd", Iop_Min64Fx2 );
   12324          goto decode_success;
   12325       }
   12326       break;
   12327 
   12328    case 0x5E:
   12329       /* F2 0F 5E = DIVSD -- div 64F0x2 from R/M to R */
   12330       if (haveF2no66noF3(pfx) && sz == 4) {
   12331          delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "divsd", Iop_Div64F0x2 );
   12332          goto decode_success;
   12333       }
   12334       /* 0F 5E = DIVPS -- div 32Fx4 from R/M to R */
   12335       if (haveNo66noF2noF3(pfx) && sz == 4) {
   12336          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "divps", Iop_Div32Fx4 );
   12337          goto decode_success;
   12338       }
   12339       /* F3 0F 5E = DIVSS -- div 32F0x4 from R/M to R */
   12340       if (haveF3no66noF2(pfx) && sz == 4) {
   12341          delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "divss", Iop_Div32F0x4 );
   12342          goto decode_success;
   12343       }
   12344       /* 66 0F 5E = DIVPD -- div 64Fx2 from R/M to R */
   12345       if (have66noF2noF3(pfx) && sz == 2) {
   12346          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "divpd", Iop_Div64Fx2 );
   12347          goto decode_success;
   12348       }
   12349       break;
   12350 
   12351    case 0x5F:
   12352       /* 0F 5F = MAXPS -- max 32Fx4 from R/M to R */
   12353       if (haveNo66noF2noF3(pfx) && sz == 4) {
   12354          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "maxps", Iop_Max32Fx4 );
   12355          goto decode_success;
   12356       }
   12357       /* F3 0F 5F = MAXSS -- max 32F0x4 from R/M to R */
   12358       if (haveF3no66noF2(pfx) && sz == 4) {
   12359          delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "maxss", Iop_Max32F0x4 );
   12360          goto decode_success;
   12361       }
   12362       /* F2 0F 5F = MAXSD -- max 64F0x2 from R/M to R */
   12363       if (haveF2no66noF3(pfx) && sz == 4) {
   12364          delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "maxsd", Iop_Max64F0x2 );
   12365          goto decode_success;
   12366       }
   12367       /* 66 0F 5F = MAXPD -- max 64Fx2 from R/M to R */
   12368       if (have66noF2noF3(pfx) && sz == 2) {
   12369          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "maxpd", Iop_Max64Fx2 );
   12370          goto decode_success;
   12371       }
   12372       break;
   12373 
   case 0x60:
      /* 66 0F 60 = PUNPCKLBW -- interleave low-order byte lanes
         (Iop_InterleaveLO8x16) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpcklbw",
                                    Iop_InterleaveLO8x16, True );
         goto decode_success;
      }
      break;

   case 0x61:
      /* 66 0F 61 = PUNPCKLWD -- interleave low-order 16-bit lanes */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpcklwd",
                                    Iop_InterleaveLO16x8, True );
         goto decode_success;
      }
      break;

   case 0x62:
      /* 66 0F 62 = PUNPCKLDQ -- interleave low-order 32-bit lanes */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpckldq",
                                    Iop_InterleaveLO32x4, True );
         goto decode_success;
      }
      break;

   case 0x63:
      /* 66 0F 63 = PACKSSWB -- narrow 16S lanes to 8S with signed
         saturation */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "packsswb",
                                    Iop_QNarrowBin16Sto8Sx16, True );
         goto decode_success;
      }
      break;

   case 0x64:
      /* 66 0F 64 = PCMPGTB -- signed greater-than on byte lanes */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpgtb", Iop_CmpGT8Sx16, False );
         goto decode_success;
      }
      break;

   case 0x65:
      /* 66 0F 65 = PCMPGTW -- signed greater-than on 16-bit lanes */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpgtw", Iop_CmpGT16Sx8, False );
         goto decode_success;
      }
      break;

   case 0x66:
      /* 66 0F 66 = PCMPGTD -- signed greater-than on 32-bit lanes */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpgtd", Iop_CmpGT32Sx4, False );
         goto decode_success;
      }
      break;

   case 0x67:
      /* 66 0F 67 = PACKUSWB -- narrow 16S lanes to 8U with unsigned
         saturation */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "packuswb",
                                    Iop_QNarrowBin16Sto8Ux16, True );
         goto decode_success;
      }
      break;
   12450 
   case 0x68:
      /* 66 0F 68 = PUNPCKHBW -- interleave high-order byte lanes
         (Iop_InterleaveHI8x16) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpckhbw",
                                    Iop_InterleaveHI8x16, True );
         goto decode_success;
      }
      break;

   case 0x69:
      /* 66 0F 69 = PUNPCKHWD -- interleave high-order 16-bit lanes */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpckhwd",
                                    Iop_InterleaveHI16x8, True );
         goto decode_success;
      }
      break;

   case 0x6A:
      /* 66 0F 6A = PUNPCKHDQ -- interleave high-order 32-bit lanes */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpckhdq",
                                    Iop_InterleaveHI32x4, True );
         goto decode_success;
      }
      break;

   case 0x6B:
      /* 66 0F 6B = PACKSSDW -- narrow 32S lanes to 16S with signed
         saturation */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "packssdw",
                                    Iop_QNarrowBin32Sto16Sx8, True );
         goto decode_success;
      }
      break;

   case 0x6C:
      /* 66 0F 6C = PUNPCKLQDQ -- interleave low 64-bit lanes */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpcklqdq",
                                    Iop_InterleaveLO64x2, True );
         goto decode_success;
      }
      break;

   case 0x6D:
      /* 66 0F 6D = PUNPCKHQDQ -- interleave high 64-bit lanes */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpckhqdq",
                                    Iop_InterleaveHI64x2, True );
         goto decode_success;
      }
      break;
   12510 
   case 0x6E:
      /* 66 0F 6E = MOVD from ireg32/m32 to xmm lo 1/4,
                    zeroing high 3/4 of xmm. */
      /*              or from ireg64/m64 to xmm lo 1/2,
                    zeroing high 1/2 of xmm. */
      if (have66noF2noF3(pfx)) {
         /* The 66 prefix gives sz==2; with REX.W we instead see sz==8.
            Normalise sz==2 to 4 so that below, sz==4 means the 32-bit
            (movd) form and sz==8 the 64-bit (movq) form. */
         vassert(sz == 2 || sz == 8);
         if (sz == 2) sz = 4;
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta += 1;
            if (sz == 4) {
               /* 32UtoV128 / 64UtoV128 zero-extend, giving the required
                  zeroing of the destination's upper lanes. */
               putXMMReg(
                  gregOfRexRM(pfx,modrm),
                  unop( Iop_32UtoV128, getIReg32(eregOfRexRM(pfx,modrm)) )
               );
               DIP("movd %s, %s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
            } else {
               putXMMReg(
                  gregOfRexRM(pfx,modrm),
                  unop( Iop_64UtoV128, getIReg64(eregOfRexRM(pfx,modrm)) )
               );
               DIP("movq %s, %s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
            }
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            putXMMReg(
               gregOfRexRM(pfx,modrm),
               sz == 4
                  ?  unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)) )
                  :  unop( Iop_64UtoV128,loadLE(Ity_I64, mkexpr(addr)) )
            );
            DIP("mov%c %s, %s\n", sz == 4 ? 'd' : 'q', dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
         }
         goto decode_success;
      }
      break;
   12552 
   case 0x6F:
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         /* 66 0F 6F = MOVDQA -- move from E (mem or xmm) to G (xmm). */
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            putXMMReg( gregOfRexRM(pfx,modrm),
                       getXMMReg( eregOfRexRM(pfx,modrm) ));
            DIP("movdqa %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            /* MOVDQA requires a 16-aligned memory operand; emit the
               #GP/SIGSEGV check. */
            gen_SEGV_if_not_16_aligned( addr );
            putXMMReg( gregOfRexRM(pfx,modrm),
                       loadLE(Ity_V128, mkexpr(addr)) );
            DIP("movdqa %s,%s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += alen;
         }
         goto decode_success;
      }
      if (haveF3no66noF2(pfx) && sz == 4) {
         /* F3 0F 6F = MOVDQU -- move from E (mem or xmm) to G (xmm).
            Unaligned form: no alignment check. */
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            putXMMReg( gregOfRexRM(pfx,modrm),
                       getXMMReg( eregOfRexRM(pfx,modrm) ));
            DIP("movdqu %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            putXMMReg( gregOfRexRM(pfx,modrm),
                       loadLE(Ity_V128, mkexpr(addr)) );
            DIP("movdqu %s,%s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += alen;
         }
         goto decode_success;
      }
      break;
   12595 
   case 0x70:
      /* 66 0F 70 = PSHUFD -- rearrange 4x32 from E(xmm or mem) to G(xmm) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PSHUFD_32x4( vbi, pfx, delta, False/*!writesYmm*/);
         goto decode_success;
      }
      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
      /* 0F 70 = PSHUFW -- rearrange 4x16 from E(mmx or mem) to G(mmx) */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         Int order;
         IRTemp sV, dV, s3, s2, s1, s0;
         s3 = s2 = s1 = s0 = IRTemp_INVALID;
         sV = newTemp(Ity_I64);
         dV = newTemp(Ity_I64);
         do_MMX_preamble();
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
            /* immediate selector byte follows the modrm byte */
            order = (Int)getUChar(delta+1);
            delta += 1+1;
            DIP("pshufw $%d,%s,%s\n", order,
                                      nameMMXReg(eregLO3ofRM(modrm)),
                                      nameMMXReg(gregLO3ofRM(modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
                              1/*extra byte after amode*/ );
            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
            /* immediate selector byte follows the amode */
            order = (Int)getUChar(delta+alen);
            delta += 1+alen;
            DIP("pshufw $%d,%s,%s\n", order,
                                      dis_buf,
                                      nameMMXReg(gregLO3ofRM(modrm)));
         }
         /* Split source into four 16-bit lanes, then reassemble them in
            the order selected by the 2-bit fields of the immediate. */
         breakup64to16s( sV, &s3, &s2, &s1, &s0 );
#        define SEL(n) \
                   ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
         assign(dV,
   	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
                             SEL((order>>2)&3), SEL((order>>0)&3) )
         );
         putMMXReg(gregLO3ofRM(modrm), mkexpr(dV));
#        undef SEL
         goto decode_success;
      }
      /* F2 0F 70 = PSHUFLW -- rearrange lower half 4x16 from E(xmm or
         mem) to G(xmm), and copy upper half */
      if (haveF2no66noF3(pfx) && sz == 4) {
         delta = dis_PSHUFxW_128( vbi, pfx, delta,
                                  False/*!isAvx*/, False/*!xIsH*/ );
         goto decode_success;
      }
      /* F3 0F 70 = PSHUFHW -- rearrange upper half 4x16 from E(xmm or
         mem) to G(xmm), and copy lower half */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_PSHUFxW_128( vbi, pfx, delta,
                                  False/*!isAvx*/, True/*xIsH*/ );
         goto decode_success;
      }
      break;
   12655 
   case 0x71:
      /* Shift-by-immediate group: the reg field of the modrm byte (the
         "/digit") selects the operation; only register operands are
         handled here. */
      /* 66 0F 71 /2 ib = PSRLW by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 2) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "psrlw", Iop_ShrN16x8 );
         goto decode_success;
      }
      /* 66 0F 71 /4 ib = PSRAW by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 4) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "psraw", Iop_SarN16x8 );
         goto decode_success;
      }
      /* 66 0F 71 /6 ib = PSLLW by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 6) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "psllw", Iop_ShlN16x8 );
         goto decode_success;
      }
      break;

   case 0x72:
      /* Same /digit dispatch as 0x71, but on 32-bit lanes. */
      /* 66 0F 72 /2 ib = PSRLD by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 2) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "psrld", Iop_ShrN32x4 );
         goto decode_success;
      }
      /* 66 0F 72 /4 ib = PSRAD by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 4) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "psrad", Iop_SarN32x4 );
         goto decode_success;
      }
      /* 66 0F 72 /6 ib = PSLLD by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 6) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "pslld", Iop_ShlN32x4 );
         goto decode_success;
      }
      break;
   12703 
   case 0x73:
      /* 66 0F 73 /3 ib = PSRLDQ by immediate -- whole-register shift
         right by imm *bytes* */
      /* note, if mem case ever filled in, 1 byte after amode */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 3) {
         Int imm = (Int)getUChar(delta+1);
         Int reg = eregOfRexRM(pfx,getUChar(delta));
         DIP("psrldq $%d,%s\n", imm, nameXMMReg(reg));
         delta += 2;               /* modrm byte + immediate byte */
         IRTemp sV = newTemp(Ity_V128);
         assign( sV, getXMMReg(reg) );
         putXMMReg(reg, mkexpr(math_PSRLDQ( sV, imm )));
         goto decode_success;
      }
      /* 66 0F 73 /7 ib = PSLLDQ by immediate -- whole-register shift
         left by imm *bytes* */
      /* note, if mem case ever filled in, 1 byte after amode */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 7) {
         Int imm = (Int)getUChar(delta+1);
         Int reg = eregOfRexRM(pfx,getUChar(delta));
         DIP("pslldq $%d,%s\n", imm, nameXMMReg(reg));
         vassert(imm >= 0 && imm <= 255);
         delta += 2;               /* modrm byte + immediate byte */
         IRTemp sV = newTemp(Ity_V128);
         assign( sV, getXMMReg(reg) );
         putXMMReg(reg, mkexpr(math_PSLLDQ( sV, imm )));
         goto decode_success;
      }
      /* 66 0F 73 /2 ib = PSRLQ by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 2) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "psrlq", Iop_ShrN64x2 );
         goto decode_success;
      }
      /* 66 0F 73 /6 ib = PSLLQ by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 6) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "psllq", Iop_ShlN64x2 );
         goto decode_success;
      }
      break;
   12749 
   case 0x74:
      /* 66 0F 74 = PCMPEQB -- lanewise equality compare on byte lanes */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpeqb", Iop_CmpEQ8x16, False );
         goto decode_success;
      }
      break;

   case 0x75:
      /* 66 0F 75 = PCMPEQW -- lanewise equality compare on 16-bit lanes */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpeqw", Iop_CmpEQ16x8, False );
         goto decode_success;
      }
      break;

   case 0x76:
      /* 66 0F 76 = PCMPEQD -- lanewise equality compare on 32-bit lanes */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpeqd", Iop_CmpEQ32x4, False );
         goto decode_success;
      }
      break;
   12776 
   12777    case 0x7E:
   12778       /* F3 0F 7E = MOVQ -- move 64 bits from E (mem or lo half xmm) to
   12779          G (lo half xmm).  Upper half of G is zeroed out. */
   12780       if (haveF3no66noF2(pfx)
   12781           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12782          modrm = getUChar(delta);
   12783          if (epartIsReg(modrm)) {
   12784             putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
   12785                              getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
   12786                /* zero bits 127:64 */
   12787                putXMMRegLane64( gregOfRexRM(pfx,modrm), 1, mkU64(0) );
   12788             DIP("movsd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12789                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   12790             delta += 1;
   12791          } else {
   12792             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12793             putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
   12794             putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
   12795                              loadLE(Ity_I64, mkexpr(addr)) );
   12796             DIP("movsd %s,%s\n", dis_buf,
   12797                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   12798             delta += alen;
   12799          }
   12800          goto decode_success;
   12801       }
   12802       /* 66 0F 7E = MOVD from xmm low 1/4 to ireg32 or m32. */
   12803       /*              or from xmm low 1/2 to ireg64 or m64. */
   12804          if (have66noF2noF3(pfx) && (sz == 2 || sz == 8)) {
   12805          if (sz == 2) sz = 4;
   12806          modrm = getUChar(delta);
   12807          if (epartIsReg(modrm)) {
   12808             delta += 1;
   12809             if (sz == 4) {
   12810                putIReg32( eregOfRexRM(pfx,modrm),
   12811                           getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) );
   12812                DIP("movd %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   12813                                     nameIReg32(eregOfRexRM(pfx,modrm)));
   12814    	 } else {
   12815                putIReg64( eregOfRexRM(pfx,modrm),
   12816                           getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) );
   12817                DIP("movq %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   12818                                     nameIReg64(eregOfRexRM(pfx,modrm)));
   12819    	 }
   12820          } else {
   12821             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   12822             delta += alen;
   12823             storeLE( mkexpr(addr),
   12824                      sz == 4
   12825                         ? getXMMRegLane32(gregOfRexRM(pfx,modrm),0)
   12826                         : getXMMRegLane64(gregOfRexRM(pfx,modrm),0) );
   12827             DIP("mov%c %s, %s\n", sz == 4 ? 'd' : 'q',
   12828                                   nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
   12829          }
   12830          goto decode_success;
   12831       }
   12832       break;
   12833 
   case 0x7F:
      /* F3 0F 7F = MOVDQU -- move from G (xmm) to E (mem or xmm). */
      if (haveF3no66noF2(pfx) && sz == 4) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            /* Reg-reg form deliberately disabled until a test case
               exercises it; the statements below it are unreachable. */
            goto decode_failure; /* awaiting test case */
            delta += 1;
            putXMMReg( eregOfRexRM(pfx,modrm),
                       getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movdqu %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                   nameXMMReg(eregOfRexRM(pfx,modrm)));
         } else {
            /* Unaligned 128-bit store: no alignment check, unlike MOVDQA. */
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movdqu %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
         }
         goto decode_success;
      }
      /* 66 0F 7F = MOVDQA -- move from G (xmm) to E (mem or xmm). */
      if (have66noF2noF3(pfx) && sz == 2) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta += 1;
            putXMMReg( eregOfRexRM(pfx,modrm),
                       getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movdqa %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                   nameXMMReg(eregOfRexRM(pfx,modrm)));
         } else {
            /* MOVDQA requires a 16-aligned effective address; generate
               a SEGV check before the store, mirroring hardware. */
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            delta += alen;
            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movdqa %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
         }
         goto decode_success;
      }
      break;
   12872 
   case 0xAE:
      /* 0F AE /7 = SFENCE -- flush pending operations to memory */
      /* Register-operand form (epartIsReg) with /7 encodes SFENCE;
         memory-operand /7 is CLFLUSH, handled further below. */
      if (haveNo66noF2noF3(pfx)
          && epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 7
          && sz == 4) {
         delta += 1;
         /* Insert a memory fence.  It's sometimes important that these
            are carried through to the generated code. */
         stmt( IRStmt_MBE(Imbe_Fence) );
         DIP("sfence\n");
         goto decode_success;
      }
      /* mindless duplication follows .. */
      /* 0F AE /5 = LFENCE -- flush pending operations to memory */
      /* 0F AE /6 = MFENCE -- flush pending operations to memory */
      /* All three fence flavours are modelled with the same full
         Imbe_Fence barrier; only the printed mnemonic differs. */
      if (haveNo66noF2noF3(pfx)
          && epartIsReg(getUChar(delta))
          && (gregLO3ofRM(getUChar(delta)) == 5
              || gregLO3ofRM(getUChar(delta)) == 6)
          && sz == 4) {
         delta += 1;
         /* Insert a memory fence.  It's sometimes important that these
            are carried through to the generated code. */
         stmt( IRStmt_MBE(Imbe_Fence) );
         /* delta has already been advanced past the modrm byte, hence
            delta-1 to re-read it when choosing "l" vs "m". */
         DIP("%sfence\n", gregLO3ofRM(getUChar(delta-1))==5 ? "l" : "m");
         goto decode_success;
      }
   12900 
      /* 0F AE /7 = CLFLUSH -- flush cache line */
      if (haveNo66noF2noF3(pfx)
          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 7
          && sz == 4) {

         /* This is something of a hack.  We need to know the size of
            the cache line containing addr.  Since we don't (easily),
            assume 256 on the basis that no real cache would have a
            line that big.  It's safe to invalidate more stuff than we
            need, just inefficient. */
         ULong lineszB = 256ULL;

         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;

         /* Round addr down to the start of the containing block. */
         /* lineszB is a power of two, so ~(lineszB-1) is the mask that
            clears the within-line offset bits. */
         stmt( IRStmt_Put(
                  OFFB_TISTART,
                  binop( Iop_And64,
                         mkexpr(addr),
                         mkU64( ~(lineszB-1) ))) );

         stmt( IRStmt_Put(OFFB_TILEN, mkU64(lineszB) ) );

         /* End the superblock with a TInval jump so the range
            [TISTART, TISTART+TILEN) of translations is discarded. */
         jmp_lit(dres, Ijk_TInval, (Addr64)(guest_RIP_bbstart+delta));

         DIP("clflush %s\n", dis_buf);
         goto decode_success;
      }
   12930 
      /* 0F AE /3 = STMXCSR m32 -- store %mxcsr */
      /* Both MXCSR ops are delegated to shared helpers also used by
         the AVX (VEX-encoded) decoders, hence the isAvx flag. */
      if (haveNo66noF2noF3(pfx)
          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 3
          && sz == 4) {
         delta = dis_STMXCSR(vbi, pfx, delta, False/*!isAvx*/);
         goto decode_success;
      }
      /* 0F AE /2 = LDMXCSR m32 -- load %mxcsr */
      if (haveNo66noF2noF3(pfx)
          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 2
          && sz == 4) {
         delta = dis_LDMXCSR(vbi, pfx, delta, False/*!isAvx*/);
         goto decode_success;
      }
      /* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory.
         Note that the presence or absence of REX.W slightly affects the
         written format: whether the saved FPU IP and DP pointers are 64
         or 32 bits.  But the helper function we call simply writes zero
         bits in the relevant fields (which are 64 bits regardless of
         what REX.W is) and so it's good enough (iow, equally broken) in
         both cases. */
      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
          && !epartIsReg(getUChar(delta))
          && gregOfRexRM(pfx,getUChar(delta)) == 0) {
         IRDirty* d;
         modrm = getUChar(delta);
         vassert(!epartIsReg(modrm));

         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         /* FXSAVE faults on a non-16-aligned destination. */
         gen_SEGV_if_not_16_aligned(addr);

         DIP("%sfxsave %s\n", sz==8 ? "rex64/" : "", dis_buf);

         /* Uses dirty helper:
               void amd64g_do_FXSAVE ( VexGuestAMD64State*, ULong ) */
         d = unsafeIRDirty_0_N (
                0/*regparms*/,
                "amd64g_dirtyhelper_FXSAVE",
                &amd64g_dirtyhelper_FXSAVE,
                mkIRExprVec_1( mkexpr(addr) )
             );
         /* Helper needs the guest state pointer as a hidden argument. */
         d->needsBBP = True;

         /* declare we're writing memory */
         d->mFx   = Ifx_Write;
         d->mAddr = mkexpr(addr);
         d->mSize = 464; /* according to recent Intel docs */

         /* declare we're reading guest state */
         d->nFxState = 7;
         vex_bzero(&d->fxState, sizeof(d->fxState));

         d->fxState[0].fx     = Ifx_Read;
         d->fxState[0].offset = OFFB_FTOP;
         d->fxState[0].size   = sizeof(UInt);

         d->fxState[1].fx     = Ifx_Read;
         d->fxState[1].offset = OFFB_FPREGS;
         d->fxState[1].size   = 8 * sizeof(ULong);

         d->fxState[2].fx     = Ifx_Read;
         d->fxState[2].offset = OFFB_FPTAGS;
         d->fxState[2].size   = 8 * sizeof(UChar);

         d->fxState[3].fx     = Ifx_Read;
         d->fxState[3].offset = OFFB_FPROUND;
         d->fxState[3].size   = sizeof(ULong);

         d->fxState[4].fx     = Ifx_Read;
         d->fxState[4].offset = OFFB_FC3210;
         d->fxState[4].size   = sizeof(ULong);

         /* Only the low 128 bits (the XMM part) of each YMM register
            are read; the descriptor is then repeated for YMM1..YMM15. */
         d->fxState[5].fx     = Ifx_Read;
         d->fxState[5].offset = OFFB_YMM0;
         d->fxState[5].size   = sizeof(U128);
         /* plus 15 more of the above, spaced out in YMM sized steps */
         d->fxState[5].nRepeats  = 15;
         d->fxState[5].repeatLen = sizeof(U256);

         d->fxState[6].fx     = Ifx_Read;
         d->fxState[6].offset = OFFB_SSEROUND;
         d->fxState[6].size   = sizeof(ULong);

         /* Be paranoid ... this assertion tries to ensure the 16 %ymm
            images are packed back-to-back.  If not, the settings for
            d->fxState[5] are wrong. */
         vassert(32 == sizeof(U256));
         vassert(OFFB_YMM15 == (OFFB_YMM0 + 15 * 32));

         stmt( IRStmt_Dirty(d) );

         goto decode_success;
      }
      /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory.
         As with FXSAVE above we ignore the value of REX.W since we're
         not bothering with the FPU DP and IP fields. */
      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
          && !epartIsReg(getUChar(delta))
          && gregOfRexRM(pfx,getUChar(delta)) == 1) {
         IRDirty* d;
         modrm = getUChar(delta);
         vassert(!epartIsReg(modrm));

         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         /* FXRSTOR faults on a non-16-aligned source. */
         gen_SEGV_if_not_16_aligned(addr);

         DIP("%sfxrstor %s\n", sz==8 ? "rex64/" : "", dis_buf);

         /* Uses dirty helper:
               VexEmWarn amd64g_do_FXRSTOR ( VexGuestAMD64State*, ULong )
            NOTE:
               the VexEmWarn value is simply ignored
         */
         d = unsafeIRDirty_0_N (
                0/*regparms*/,
                "amd64g_dirtyhelper_FXRSTOR",
                &amd64g_dirtyhelper_FXRSTOR,
                mkIRExprVec_1( mkexpr(addr) )
             );
         /* Helper needs the guest state pointer as a hidden argument. */
         d->needsBBP = True;

         /* declare we're reading memory */
         d->mFx   = Ifx_Read;
         d->mAddr = mkexpr(addr);
         d->mSize = 464; /* according to recent Intel docs */

         /* declare we're writing guest state -- the exact mirror image
            of the FXSAVE descriptors above, with Ifx_Write throughout. */
         d->nFxState = 7;
         vex_bzero(&d->fxState, sizeof(d->fxState));

         d->fxState[0].fx     = Ifx_Write;
         d->fxState[0].offset = OFFB_FTOP;
         d->fxState[0].size   = sizeof(UInt);

         d->fxState[1].fx     = Ifx_Write;
         d->fxState[1].offset = OFFB_FPREGS;
         d->fxState[1].size   = 8 * sizeof(ULong);

         d->fxState[2].fx     = Ifx_Write;
         d->fxState[2].offset = OFFB_FPTAGS;
         d->fxState[2].size   = 8 * sizeof(UChar);

         d->fxState[3].fx     = Ifx_Write;
         d->fxState[3].offset = OFFB_FPROUND;
         d->fxState[3].size   = sizeof(ULong);

         d->fxState[4].fx     = Ifx_Write;
         d->fxState[4].offset = OFFB_FC3210;
         d->fxState[4].size   = sizeof(ULong);

         /* Only the low 128 bits (the XMM part) of each YMM register
            are written; the descriptor repeats for YMM1..YMM15. */
         d->fxState[5].fx     = Ifx_Write;
         d->fxState[5].offset = OFFB_YMM0;
         d->fxState[5].size   = sizeof(U128);
         /* plus 15 more of the above, spaced out in YMM sized steps */
         d->fxState[5].nRepeats  = 15;
         d->fxState[5].repeatLen = sizeof(U256);

         d->fxState[6].fx     = Ifx_Write;
         d->fxState[6].offset = OFFB_SSEROUND;
         d->fxState[6].size   = sizeof(ULong);

         /* Be paranoid ... this assertion tries to ensure the 16 %ymm
            images are packed back-to-back.  If not, the settings for
            d->fxState[5] are wrong. */
         vassert(32 == sizeof(U256));
         vassert(OFFB_YMM15 == (OFFB_YMM0 + 15 * 32));

         stmt( IRStmt_Dirty(d) );

         goto decode_success;
      }
      break;
   13105 
   case 0xC2:
      /* The dis_SSE_cmp_E_to_G helper returns its delta argument
         unchanged when it fails to decode (e.g. an unknown immediate),
         so "delta > delta0" is the success test in each arm below. */
      /* 0F C2 = CMPPS -- 32Fx4 comparison from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         Long delta0 = delta;
         delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpps", True, 4 );
         if (delta > delta0) goto decode_success;
      }
      /* F3 0F C2 = CMPSS -- 32F0x4 comparison from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         Long delta0 = delta;
         delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpss", False, 4 );
         if (delta > delta0) goto decode_success;
      }
      /* F2 0F C2 = CMPSD -- 64F0x2 comparison from R/M to R */
      if (haveF2no66noF3(pfx) && sz == 4) {
         Long delta0 = delta;
         delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpsd", False, 8 );
         if (delta > delta0) goto decode_success;
      }
      /* 66 0F C2 = CMPPD -- 64Fx2 comparison from R/M to R */
      if (have66noF2noF3(pfx) && sz == 2) {
         Long delta0 = delta;
         delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmppd", True, 8 );
         if (delta > delta0) goto decode_success;
      }
      break;
   13132 
   case 0xC3:
      /* 0F C3 = MOVNTI -- for us, just a plain ireg store. */
      /* The non-temporal cache hint has no observable effect on guest
         state, so it is deliberately ignored.  Memory-destination
         form only; the reg-reg encoding falls through to failure. */
      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)) {
         modrm = getUChar(delta);
         if (!epartIsReg(modrm)) {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            storeLE( mkexpr(addr), getIRegG(sz, pfx, modrm) );
            DIP("movnti %s,%s\n", dis_buf,
                                  nameIRegG(sz, pfx, modrm));
            delta += alen;
            goto decode_success;
         }
         /* else fall through */
      }
      break;
   13148 
   case 0xC4:
      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
      /* 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
         put it into the specified lane of mmx(G). */
      if (haveNo66noF2noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         /* Use t0 .. t3 to hold the 4 original 16-bit lanes of the
            mmx reg.  t4 is the new lane value.  t5 is the original
            mmx value. t6 is the new mmx value. */
         Int lane;
         t4 = newTemp(Ity_I16);
         t5 = newTemp(Ity_I64);
         t6 = newTemp(Ity_I64);
         modrm = getUChar(delta);
         do_MMX_preamble();

         assign(t5, getMMXReg(gregLO3ofRM(modrm)));
         breakup64to16s( t5, &t3, &t2, &t1, &t0 );

         if (epartIsReg(modrm)) {
            assign(t4, getIReg16(eregOfRexRM(pfx,modrm)));
            /* Skip modrm byte plus the lane-selector immediate, then
               read the immediate back from delta-1. */
            delta += 1+1;
            lane = getUChar(delta-1);
            DIP("pinsrw $%d,%s,%s\n", (Int)lane,
                                      nameIReg16(eregOfRexRM(pfx,modrm)),
                                      nameMMXReg(gregLO3ofRM(modrm)));
         } else {
            /* Tell disAMode there is 1 immediate byte after the amode
               so rip-relative addressing is computed correctly. */
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += 1+alen;
            lane = getUChar(delta-1);
            assign(t4, loadLE(Ity_I16, mkexpr(addr)));
            DIP("pinsrw $%d,%s,%s\n", (Int)lane,
                                      dis_buf,
                                      nameMMXReg(gregLO3ofRM(modrm)));
         }

         /* Rebuild the 64-bit value with t4 substituted into the
            selected lane; only the low 2 bits of the immediate count. */
         switch (lane & 3) {
            case 0:  assign(t6, mk64from16s(t3,t2,t1,t4)); break;
            case 1:  assign(t6, mk64from16s(t3,t2,t4,t0)); break;
            case 2:  assign(t6, mk64from16s(t3,t4,t1,t0)); break;
            case 3:  assign(t6, mk64from16s(t4,t2,t1,t0)); break;
            default: vassert(0);
         }
         putMMXReg(gregLO3ofRM(modrm), mkexpr(t6));
         goto decode_success;
      }
      /* 66 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
         put it into the specified lane of xmm(G). */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         Int lane;
         t4 = newTemp(Ity_I16);
         modrm = getUChar(delta);
         UInt rG = gregOfRexRM(pfx,modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign(t4, getIReg16(rE));
            delta += 1+1;
            lane = getUChar(delta-1);
            DIP("pinsrw $%d,%s,%s\n",
                (Int)lane, nameIReg16(rE), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
                              1/*byte after the amode*/ );
            delta += 1+alen;
            lane = getUChar(delta-1);
            assign(t4, loadLE(Ity_I16, mkexpr(addr)));
            DIP("pinsrw $%d,%s,%s\n",
                (Int)lane, dis_buf, nameXMMReg(rG));
         }
         /* 8 lanes in the xmm case, hence the "& 7" mask. */
         IRTemp src_vec = newTemp(Ity_V128);
         assign(src_vec, getXMMReg(rG));
         IRTemp res_vec = math_PINSRW_128( src_vec, t4, lane & 7);
         putXMMReg(rG, mkexpr(res_vec));
         goto decode_success;
      }
      break;
   13226 
   case 0xC5:
      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
      /* 0F C5 = PEXTRW -- extract 16-bit field from mmx(E) and put
         zero-extend of it in ireg(G). */
      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            IRTemp sV = newTemp(Ity_I64);
            t5 = newTemp(Ity_I16);
            do_MMX_preamble();
            assign(sV, getMMXReg(eregLO3ofRM(modrm)));
            breakup64to16s( sV, &t3, &t2, &t1, &t0 );
            /* Immediate byte follows the modrm; low 2 bits select
               which of the four 16-bit lanes to extract. */
            switch (getUChar(delta+1) & 3) {
               case 0:  assign(t5, mkexpr(t0)); break;
               case 1:  assign(t5, mkexpr(t1)); break;
               case 2:  assign(t5, mkexpr(t2)); break;
               case 3:  assign(t5, mkexpr(t3)); break;
               default: vassert(0);
            }
            /* Zero-extend into the destination; 64-bit dest if REX.W. */
            if (sz == 8)
               putIReg64(gregOfRexRM(pfx,modrm), unop(Iop_16Uto64, mkexpr(t5)));
            else
               putIReg32(gregOfRexRM(pfx,modrm), unop(Iop_16Uto32, mkexpr(t5)));
            DIP("pextrw $%d,%s,%s\n",
                (Int)getUChar(delta+1),
                nameMMXReg(eregLO3ofRM(modrm)),
                sz==8 ? nameIReg64(gregOfRexRM(pfx,modrm))
                      : nameIReg32(gregOfRexRM(pfx,modrm))
            );
            delta += 2;
            goto decode_success;
         }
         /* else fall through */
         /* note, for anyone filling in the mem case: this insn has one
            byte after the amode and therefore you must pass 1 as the
            last arg to disAMode */
      }
      /* 66 0F C5 = PEXTRW -- extract 16-bit field from xmm(E) and put
         zero-extend of it in ireg(G). */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         Long delta0 = delta;
         delta = dis_PEXTRW_128_EregOnly_toG( vbi, pfx, delta,
                                              False/*!isAvx*/ );
         /* Helper returns delta unchanged on decode failure. */
         if (delta > delta0) goto decode_success;
         /* else fall through -- decoding has failed */
      }
      break;
   13275 
   case 0xC6:
      /* 0F C6 /r ib = SHUFPS -- shuffle packed F32s */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         Int    imm8 = 0;
         IRTemp sV   = newTemp(Ity_V128);
         IRTemp dV   = newTemp(Ity_V128);
         modrm = getUChar(delta);
         UInt rG = gregOfRexRM(pfx,modrm);
         assign( dV, getXMMReg(rG) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( sV, getXMMReg(rE) );
            /* Immediate byte follows the modrm. */
            imm8 = (Int)getUChar(delta+1);
            delta += 1+1;
            DIP("shufps $%d,%s,%s\n", imm8, nameXMMReg(rE), nameXMMReg(rG));
         } else {
            /* 1 trailing immediate byte after the amode. */
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            imm8 = (Int)getUChar(delta+alen);
            delta += 1+alen;
            DIP("shufps $%d,%s,%s\n", imm8, dis_buf, nameXMMReg(rG));
         }
         IRTemp res = math_SHUFPS_128( sV, dV, imm8 );
         putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
         goto decode_success;
      }
      /* 66 0F C6 /r ib = SHUFPD -- shuffle packed F64s */
      if (have66noF2noF3(pfx) && sz == 2) {
         Int    select;
         IRTemp sV = newTemp(Ity_V128);
         IRTemp dV = newTemp(Ity_V128);

         modrm = getUChar(delta);
         assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

         if (epartIsReg(modrm)) {
            assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
            select = (Int)getUChar(delta+1);
            delta += 1+1;
            DIP("shufpd $%d,%s,%s\n", select,
                                      nameXMMReg(eregOfRexRM(pfx,modrm)),
                                      nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            select = getUChar(delta+alen);
            delta += 1+alen;
            DIP("shufpd $%d,%s,%s\n", select,
                                      dis_buf,
                                      nameXMMReg(gregOfRexRM(pfx,modrm)));
         }

         IRTemp res = math_SHUFPD_128( sV, dV, select );
         putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
         goto decode_success;
      }
      break;
   13333 
   case 0xD1:
      /* 66 0F D1 = PSRLW by E */
      /* Shift-by-register forms all route through the common
         dis_SSE_shiftG_byE helper with the lane-appropriate IROp. */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrlw", Iop_ShrN16x8 );
         goto decode_success;
      }
      break;

   case 0xD2:
      /* 66 0F D2 = PSRLD by E */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrld", Iop_ShrN32x4 );
         goto decode_success;
      }
      break;

   case 0xD3:
      /* 66 0F D3 = PSRLQ by E */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrlq", Iop_ShrN64x2 );
         goto decode_success;
      }
      break;

   case 0xD4:
      /* 66 0F D4 = PADDQ */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "paddq", Iop_Add64x2, False );
         goto decode_success;
      }
      /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
      /* 0F D4 = PADDQ -- add 64x1 */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         do_MMX_preamble();
         delta = dis_MMXop_regmem_to_reg (
                   vbi, pfx, delta, opc, "paddq", False );
         goto decode_success;
      }
      break;

   case 0xD5:
      /* 66 0F D5 = PMULLW -- 16x8 multiply */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pmullw", Iop_Mul16x8, False );
         goto decode_success;
      }
      break;
   13383 
   case 0xD6:
      /* F3 0F D6 = MOVQ2DQ -- move from E (mmx) to G (lo half xmm, zero
         hi half). */
      if (haveF3no66noF2(pfx) && sz == 4) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            do_MMX_preamble();
            /* Iop_64UtoV128 widens with zeroes, clearing the hi half. */
            putXMMReg( gregOfRexRM(pfx,modrm),
                       unop(Iop_64UtoV128, getMMXReg( eregLO3ofRM(modrm) )) );
            DIP("movq2dq %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += 1;
            goto decode_success;
         }
         /* apparently no mem case for this insn */
      }
      /* 66 0F D6 = MOVQ -- move 64 bits from G (lo half xmm) to E (mem
         or lo half xmm).  */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            /* fall through, awaiting test case */
            /* dst: lo half copied, hi half zeroed */
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            storeLE( mkexpr(addr),
                     getXMMRegLane64( gregOfRexRM(pfx,modrm), 0 ));
            DIP("movq %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf );
            delta += alen;
            goto decode_success;
         }
      }
      /* F2 0F D6 = MOVDQ2Q -- move from E (lo half xmm, not mem) to G (mmx). */
      if (haveF2no66noF3(pfx) && sz == 4) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            do_MMX_preamble();
            putMMXReg( gregLO3ofRM(modrm),
                       getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
            DIP("movdq2q %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                   nameMMXReg(gregLO3ofRM(modrm)));
            delta += 1;
            goto decode_success;
         }
         /* apparently no mem case for this insn */
      }
      break;
   13432 
   case 0xD7:
      /* 66 0F D7 = PMOVMSKB -- extract sign bits from each of 16
         lanes in xmm(E), turn them into a byte, and put
         zero-extend of it in ireg(G).  Doing this directly is just
         too cumbersome; give up therefore and call a helper. */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
          && epartIsReg(getUChar(delta))) { /* no memory case, it seems */
         delta = dis_PMOVMSKB_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
      /* 0F D7 = PMOVMSKB -- extract sign bits from each of 8 lanes in
         mmx(G), turn them into a byte, and put zero-extend of it in
         ireg(G). */
      if (haveNo66noF2noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            do_MMX_preamble();
            t0 = newTemp(Ity_I64);
            t1 = newTemp(Ity_I64);
            assign(t0, getMMXReg(eregLO3ofRM(modrm)));
            /* Clean (side-effect-free) call: the helper computes the
               8-bit sign mask from the 64-bit MMX value. */
            assign(t1, mkIRExprCCall(
                          Ity_I64, 0/*regparms*/,
                          "amd64g_calculate_mmx_pmovmskb",
                          &amd64g_calculate_mmx_pmovmskb,
                          mkIRExprVec_1(mkexpr(t0))));
            putIReg32(gregOfRexRM(pfx,modrm), unop(Iop_64to32,mkexpr(t1)));
            DIP("pmovmskb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                    nameIReg32(gregOfRexRM(pfx,modrm)));
            delta += 1;
            goto decode_success;
         }
         /* else fall through */
      }
      break;
   13470 
   case 0xD8:
      /* 66 0F D8 = PSUBUSB */
      /* Saturating/unsaturating packed integer ops all delegate to
         dis_SSEint_E_to_G with the matching lane-wise IROp. */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "psubusb", Iop_QSub8Ux16, False );
         goto decode_success;
      }
      break;

   case 0xD9:
      /* 66 0F D9 = PSUBUSW */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "psubusw", Iop_QSub16Ux8, False );
         goto decode_success;
      }
      break;

   case 0xDA:
      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
      /* 0F DA = PMINUB -- 8x8 unsigned min */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         do_MMX_preamble();
         delta = dis_MMXop_regmem_to_reg (
                    vbi, pfx, delta, opc, "pminub", False );
         goto decode_success;
      }
      /* 66 0F DA = PMINUB -- 8x16 unsigned min */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pminub", Iop_Min8Ux16, False );
         goto decode_success;
      }
      break;

   case 0xDB:
      /* 66 0F DB = PAND */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "pand", Iop_AndV128 );
         goto decode_success;
      }
      break;

   case 0xDC:
      /* 66 0F DC = PADDUSB */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "paddusb", Iop_QAdd8Ux16, False );
         goto decode_success;
      }
      break;

   case 0xDD:
      /* 66 0F DD = PADDUSW */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "paddusw", Iop_QAdd16Ux8, False );
         goto decode_success;
      }
      break;

   case 0xDE:
      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
      /* 0F DE = PMAXUB -- 8x8 unsigned max */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         do_MMX_preamble();
         delta = dis_MMXop_regmem_to_reg (
                    vbi, pfx, delta, opc, "pmaxub", False );
         goto decode_success;
      }
      /* 66 0F DE = PMAXUB -- 8x16 unsigned max */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pmaxub", Iop_Max8Ux16, False );
         goto decode_success;
      }
      break;
   13548 
   13549    case 0xDF:
   13550       /* 66 0F DF = PANDN */
   13551       if (have66noF2noF3(pfx) && sz == 2) {
   13552          delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "pandn", Iop_AndV128 );
   13553          goto decode_success;
   13554       }
   13555       break;
   13556 
   13557    case 0xE0:
   13558       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   13559       /* 0F E0 = PAVGB -- 8x8 unsigned Packed Average, with rounding */
   13560       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13561          do_MMX_preamble();
   13562          delta = dis_MMXop_regmem_to_reg (
   13563                     vbi, pfx, delta, opc, "pavgb", False );
   13564          goto decode_success;
   13565       }
   13566       /* 66 0F E0 = PAVGB */
   13567       if (have66noF2noF3(pfx) && sz == 2) {
   13568          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13569                                     "pavgb", Iop_Avg8Ux16, False );
   13570          goto decode_success;
   13571       }
   13572       break;
   13573 
   13574    case 0xE1:
   13575       /* 66 0F E1 = PSRAW by E */
   13576       if (have66noF2noF3(pfx) && sz == 2) {
   13577          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psraw", Iop_SarN16x8 );
   13578          goto decode_success;
   13579       }
   13580       break;
   13581 
   13582    case 0xE2:
   13583       /* 66 0F E2 = PSRAD by E */
   13584       if (have66noF2noF3(pfx) && sz == 2) {
   13585          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrad", Iop_SarN32x4 );
   13586          goto decode_success;
   13587       }
   13588       break;
   13589 
   13590    case 0xE3:
   13591       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   13592       /* 0F E3 = PAVGW -- 16x4 unsigned Packed Average, with rounding */
   13593       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13594          do_MMX_preamble();
   13595          delta = dis_MMXop_regmem_to_reg (
   13596                     vbi, pfx, delta, opc, "pavgw", False );
   13597          goto decode_success;
   13598       }
   13599       /* 66 0F E3 = PAVGW */
   13600       if (have66noF2noF3(pfx) && sz == 2) {
   13601          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13602                                     "pavgw", Iop_Avg16Ux8, False );
   13603          goto decode_success;
   13604       }
   13605       break;
   13606 
   13607    case 0xE4:
   13608       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   13609       /* 0F E4 = PMULUH -- 16x4 hi-half of unsigned widening multiply */
   13610       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13611          do_MMX_preamble();
   13612          delta = dis_MMXop_regmem_to_reg (
   13613                     vbi, pfx, delta, opc, "pmuluh", False );
   13614          goto decode_success;
   13615       }
   13616       /* 66 0F E4 = PMULHUW -- 16x8 hi-half of unsigned widening multiply */
   13617       if (have66noF2noF3(pfx) && sz == 2) {
   13618          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13619                                     "pmulhuw", Iop_MulHi16Ux8, False );
   13620          goto decode_success;
   13621       }
   13622       break;
   13623 
   13624    case 0xE5:
   13625       /* 66 0F E5 = PMULHW -- 16x8 hi-half of signed widening multiply */
   13626       if (have66noF2noF3(pfx) && sz == 2) {
   13627          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13628                                     "pmulhw", Iop_MulHi16Sx8, False );
   13629          goto decode_success;
   13630       }
   13631       break;
   13632 
   13633    case 0xE6:
   13634       /* 66 0F E6 = CVTTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
   13635          lo half xmm(G), and zero upper half, rounding towards zero */
   13636       /* F2 0F E6 = CVTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
   13637          lo half xmm(G), according to prevailing rounding mode, and zero
   13638          upper half */
   13639       if ( (haveF2no66noF3(pfx) && sz == 4)
   13640            || (have66noF2noF3(pfx) && sz == 2) ) {
   13641          delta = dis_CVTxPD2DQ_128( vbi, pfx, delta, False/*!isAvx*/,
   13642                                     toBool(sz == 2)/*r2zero*/);
   13643          goto decode_success;
   13644       }
   13645       /* F3 0F E6 = CVTDQ2PD -- convert 2 x I32 in mem/lo half xmm to 2 x
   13646          F64 in xmm(G) */
   13647       if (haveF3no66noF2(pfx) && sz == 4) {
   13648          delta = dis_CVTDQ2PD_128(vbi, pfx, delta, False/*!isAvx*/);
   13649          goto decode_success;
   13650       }
   13651       break;
   13652 
   13653    case 0xE7:
   13654       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   13655       /* 0F E7 = MOVNTQ -- for us, just a plain MMX store.  Note, the
   13656          Intel manual does not say anything about the usual business of
   13657          the FP reg tags getting trashed whenever an MMX insn happens.
   13658          So we just leave them alone.
   13659       */
   13660       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13661          modrm = getUChar(delta);
   13662          if (!epartIsReg(modrm)) {
   13663             /* do_MMX_preamble(); Intel docs don't specify this */
   13664             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13665             storeLE( mkexpr(addr), getMMXReg(gregLO3ofRM(modrm)) );
   13666             DIP("movntq %s,%s\n", dis_buf,
   13667                                   nameMMXReg(gregLO3ofRM(modrm)));
   13668             delta += alen;
   13669             goto decode_success;
   13670          }
   13671          /* else fall through */
   13672       }
   13673       /* 66 0F E7 = MOVNTDQ -- for us, just a plain SSE store. */
   13674       if (have66noF2noF3(pfx) && sz == 2) {
   13675          modrm = getUChar(delta);
   13676          if (!epartIsReg(modrm)) {
   13677             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13678             gen_SEGV_if_not_16_aligned( addr );
   13679             storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   13680             DIP("movntdq %s,%s\n", dis_buf,
   13681                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
   13682             delta += alen;
   13683             goto decode_success;
   13684          }
   13685          /* else fall through */
   13686       }
   13687       break;
   13688 
   13689    case 0xE8:
   13690       /* 66 0F E8 = PSUBSB */
   13691       if (have66noF2noF3(pfx) && sz == 2) {
   13692          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13693                                     "psubsb", Iop_QSub8Sx16, False );
   13694          goto decode_success;
   13695       }
   13696       break;
   13697 
   13698    case 0xE9:
   13699       /* 66 0F E9 = PSUBSW */
   13700       if (have66noF2noF3(pfx) && sz == 2) {
   13701          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13702                                     "psubsw", Iop_QSub16Sx8, False );
   13703          goto decode_success;
   13704       }
   13705       break;
   13706 
   13707    case 0xEA:
   13708       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   13709       /* 0F EA = PMINSW -- 16x4 signed min */
   13710       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13711          do_MMX_preamble();
   13712          delta = dis_MMXop_regmem_to_reg (
   13713                     vbi, pfx, delta, opc, "pminsw", False );
   13714          goto decode_success;
   13715       }
   13716       /* 66 0F EA = PMINSW -- 16x8 signed min */
   13717       if (have66noF2noF3(pfx) && sz == 2) {
   13718          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13719                                     "pminsw", Iop_Min16Sx8, False );
   13720          goto decode_success;
   13721       }
   13722       break;
   13723 
   13724    case 0xEB:
   13725       /* 66 0F EB = POR */
   13726       if (have66noF2noF3(pfx) && sz == 2) {
   13727          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "por", Iop_OrV128 );
   13728          goto decode_success;
   13729       }
   13730       break;
   13731 
   13732    case 0xEC:
   13733       /* 66 0F EC = PADDSB */
   13734       if (have66noF2noF3(pfx) && sz == 2) {
   13735          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13736                                     "paddsb", Iop_QAdd8Sx16, False );
   13737          goto decode_success;
   13738       }
   13739       break;
   13740 
   13741    case 0xED:
   13742       /* 66 0F ED = PADDSW */
   13743       if (have66noF2noF3(pfx) && sz == 2) {
   13744          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13745                                     "paddsw", Iop_QAdd16Sx8, False );
   13746          goto decode_success;
   13747       }
   13748       break;
   13749 
   13750    case 0xEE:
   13751       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   13752       /* 0F EE = PMAXSW -- 16x4 signed max */
   13753       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13754          do_MMX_preamble();
   13755          delta = dis_MMXop_regmem_to_reg (
   13756                     vbi, pfx, delta, opc, "pmaxsw", False );
   13757          goto decode_success;
   13758       }
   13759       /* 66 0F EE = PMAXSW -- 16x8 signed max */
   13760       if (have66noF2noF3(pfx) && sz == 2) {
   13761          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13762                                     "pmaxsw", Iop_Max16Sx8, False );
   13763          goto decode_success;
   13764       }
   13765       break;
   13766 
   13767    case 0xEF:
   13768       /* 66 0F EF = PXOR */
   13769       if (have66noF2noF3(pfx) && sz == 2) {
   13770          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "pxor", Iop_XorV128 );
   13771          goto decode_success;
   13772       }
   13773       break;
   13774 
   13775    case 0xF1:
   13776       /* 66 0F F1 = PSLLW by E */
   13777       if (have66noF2noF3(pfx) && sz == 2) {
   13778          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psllw", Iop_ShlN16x8 );
   13779          goto decode_success;
   13780       }
   13781       break;
   13782 
   13783    case 0xF2:
   13784       /* 66 0F F2 = PSLLD by E */
   13785       if (have66noF2noF3(pfx) && sz == 2) {
   13786          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "pslld", Iop_ShlN32x4 );
   13787          goto decode_success;
   13788       }
   13789       break;
   13790 
   13791    case 0xF3:
   13792       /* 66 0F F3 = PSLLQ by E */
   13793       if (have66noF2noF3(pfx) && sz == 2) {
   13794          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psllq", Iop_ShlN64x2 );
   13795          goto decode_success;
   13796       }
   13797       break;
   13798 
   13799    case 0xF4:
   13800       /* 66 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
   13801          0 to form lower 64-bit half and lanes 2 x 2 to form upper 64-bit
   13802          half */
   13803       if (have66noF2noF3(pfx) && sz == 2) {
   13804          IRTemp sV = newTemp(Ity_V128);
   13805          IRTemp dV = newTemp(Ity_V128);
   13806          modrm = getUChar(delta);
   13807          UInt rG = gregOfRexRM(pfx,modrm);
   13808          assign( dV, getXMMReg(rG) );
   13809          if (epartIsReg(modrm)) {
   13810             UInt rE = eregOfRexRM(pfx,modrm);
   13811             assign( sV, getXMMReg(rE) );
   13812             delta += 1;
   13813             DIP("pmuludq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   13814          } else {
   13815             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13816             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   13817             delta += alen;
   13818             DIP("pmuludq %s,%s\n", dis_buf, nameXMMReg(rG));
   13819          }
   13820          putXMMReg( rG, mkexpr(math_PMULUDQ_128( sV, dV )) );
   13821          goto decode_success;
   13822       }
   13823       /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   13824       /* 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
   13825          0 to form 64-bit result */
   13826       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13827          IRTemp sV = newTemp(Ity_I64);
   13828          IRTemp dV = newTemp(Ity_I64);
   13829          t1 = newTemp(Ity_I32);
   13830          t0 = newTemp(Ity_I32);
   13831          modrm = getUChar(delta);
   13832 
   13833          do_MMX_preamble();
   13834          assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   13835 
   13836          if (epartIsReg(modrm)) {
   13837             assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   13838             delta += 1;
   13839             DIP("pmuludq %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
   13840                                    nameMMXReg(gregLO3ofRM(modrm)));
   13841          } else {
   13842             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13843             assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   13844             delta += alen;
   13845             DIP("pmuludq %s,%s\n", dis_buf,
   13846                                    nameMMXReg(gregLO3ofRM(modrm)));
   13847          }
   13848 
   13849          assign( t0, unop(Iop_64to32, mkexpr(dV)) );
   13850          assign( t1, unop(Iop_64to32, mkexpr(sV)) );
   13851          putMMXReg( gregLO3ofRM(modrm),
   13852                     binop( Iop_MullU32, mkexpr(t0), mkexpr(t1) ) );
   13853          goto decode_success;
   13854       }
   13855       break;
   13856 
   13857    case 0xF5:
   13858       /* 66 0F F5 = PMADDWD -- Multiply and add packed integers from
   13859          E(xmm or mem) to G(xmm) */
   13860       if (have66noF2noF3(pfx) && sz == 2) {
   13861          IRTemp sV = newTemp(Ity_V128);
   13862          IRTemp dV = newTemp(Ity_V128);
   13863          modrm     = getUChar(delta);
   13864          UInt   rG = gregOfRexRM(pfx,modrm);
   13865          if (epartIsReg(modrm)) {
   13866             UInt rE = eregOfRexRM(pfx,modrm);
   13867             assign( sV, getXMMReg(rE) );
   13868             delta += 1;
   13869             DIP("pmaddwd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   13870          } else {
   13871             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13872             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   13873             delta += alen;
   13874             DIP("pmaddwd %s,%s\n", dis_buf, nameXMMReg(rG));
   13875          }
   13876          assign( dV, getXMMReg(rG) );
   13877          putXMMReg( rG, mkexpr(math_PMADDWD_128(dV, sV)) );
   13878          goto decode_success;
   13879       }
   13880       break;
   13881 
   13882    case 0xF6:
   13883       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   13884       /* 0F F6 = PSADBW -- sum of 8Ux8 absolute differences */
   13885       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13886          do_MMX_preamble();
   13887          delta = dis_MMXop_regmem_to_reg (
   13888                     vbi, pfx, delta, opc, "psadbw", False );
   13889          goto decode_success;
   13890       }
   13891       /* 66 0F F6 = PSADBW -- 2 x (8x8 -> 48 zeroes ++ u16) Sum Abs Diffs
   13892          from E(xmm or mem) to G(xmm) */
   13893       if (have66noF2noF3(pfx) && sz == 2) {
   13894          IRTemp sV  = newTemp(Ity_V128);
   13895          IRTemp dV  = newTemp(Ity_V128);
   13896          modrm = getUChar(delta);
   13897          UInt   rG   = gregOfRexRM(pfx,modrm);
   13898          if (epartIsReg(modrm)) {
   13899             UInt rE = eregOfRexRM(pfx,modrm);
   13900             assign( sV, getXMMReg(rE) );
   13901             delta += 1;
   13902             DIP("psadbw %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   13903          } else {
   13904             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13905             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   13906             delta += alen;
   13907             DIP("psadbw %s,%s\n", dis_buf, nameXMMReg(rG));
   13908          }
   13909          assign( dV, getXMMReg(rG) );
   13910          putXMMReg( rG, mkexpr( math_PSADBW_128 ( dV, sV ) ) );
   13911 
   13912          goto decode_success;
   13913       }
   13914       break;
   13915 
   13916    case 0xF7:
   13917       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   13918       /* 0F F7 = MASKMOVQ -- 8x8 masked store */
   13919       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13920          Bool ok = False;
   13921          delta = dis_MMX( &ok, vbi, pfx, sz, delta-1 );
   13922          if (ok) goto decode_success;
   13923       }
   13924       /* 66 0F F7 = MASKMOVDQU -- store selected bytes of double quadword */
   13925       if (have66noF2noF3(pfx) && sz == 2 && epartIsReg(getUChar(delta))) {
   13926          delta = dis_MASKMOVDQU( vbi, pfx, delta, False/*!isAvx*/ );
   13927          goto decode_success;
   13928       }
   13929       break;
   13930 
   13931    case 0xF8:
   13932       /* 66 0F F8 = PSUBB */
   13933       if (have66noF2noF3(pfx) && sz == 2) {
   13934          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13935                                     "psubb", Iop_Sub8x16, False );
   13936          goto decode_success;
   13937       }
   13938       break;
   13939 
   13940    case 0xF9:
   13941       /* 66 0F F9 = PSUBW */
   13942       if (have66noF2noF3(pfx) && sz == 2) {
   13943          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13944                                     "psubw", Iop_Sub16x8, False );
   13945          goto decode_success;
   13946       }
   13947       break;
   13948 
   13949    case 0xFA:
   13950       /* 66 0F FA = PSUBD */
   13951       if (have66noF2noF3(pfx) && sz == 2) {
   13952          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13953                                     "psubd", Iop_Sub32x4, False );
   13954          goto decode_success;
   13955       }
   13956       break;
   13957 
   13958    case 0xFB:
   13959       /* 66 0F FB = PSUBQ */
   13960       if (have66noF2noF3(pfx) && sz == 2) {
   13961          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13962                                     "psubq", Iop_Sub64x2, False );
   13963          goto decode_success;
   13964       }
   13965       /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   13966       /* 0F FB = PSUBQ -- sub 64x1 */
   13967       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13968          do_MMX_preamble();
   13969          delta = dis_MMXop_regmem_to_reg (
   13970                    vbi, pfx, delta, opc, "psubq", False );
   13971          goto decode_success;
   13972       }
   13973       break;
   13974 
   13975    case 0xFC:
   13976       /* 66 0F FC = PADDB */
   13977       if (have66noF2noF3(pfx) && sz == 2) {
   13978          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13979                                     "paddb", Iop_Add8x16, False );
   13980          goto decode_success;
   13981       }
   13982       break;
   13983 
   13984    case 0xFD:
   13985       /* 66 0F FD = PADDW */
   13986       if (have66noF2noF3(pfx) && sz == 2) {
   13987          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13988                                     "paddw", Iop_Add16x8, False );
   13989          goto decode_success;
   13990       }
   13991       break;
   13992 
   13993    case 0xFE:
   13994       /* 66 0F FE = PADDD */
   13995       if (have66noF2noF3(pfx) && sz == 2) {
   13996          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13997                                     "paddd", Iop_Add32x4, False );
   13998          goto decode_success;
   13999       }
   14000       break;
   14001 
   14002    default:
   14003       goto decode_failure;
   14004 
   14005    }
   14006 
   14007   decode_failure:
   14008    *decode_OK = False;
   14009    return deltaIN;
   14010 
   14011   decode_success:
   14012    *decode_OK = True;
   14013    return delta;
   14014 }
   14015 
   14016 
   14017 /*------------------------------------------------------------*/
   14018 /*---                                                      ---*/
   14019 /*--- Top-level SSE3 (not SupSSE3): dis_ESC_0F__SSE3       ---*/
   14020 /*---                                                      ---*/
   14021 /*------------------------------------------------------------*/
   14022 
   14023 static Long dis_MOVDDUP_128 ( VexAbiInfo* vbi, Prefix pfx,
   14024                               Long delta, Bool isAvx )
   14025 {
   14026    IRTemp addr   = IRTemp_INVALID;
   14027    Int    alen   = 0;
   14028    HChar  dis_buf[50];
   14029    IRTemp sV    = newTemp(Ity_V128);
   14030    IRTemp d0    = newTemp(Ity_I64);
   14031    UChar  modrm = getUChar(delta);
   14032    UInt   rG    = gregOfRexRM(pfx,modrm);
   14033    if (epartIsReg(modrm)) {
   14034       UInt rE = eregOfRexRM(pfx,modrm);
   14035       assign( sV, getXMMReg(rE) );
   14036       DIP("%smovddup %s,%s\n",
   14037           isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
   14038       delta += 1;
   14039       assign ( d0, unop(Iop_V128to64, mkexpr(sV)) );
   14040    } else {
   14041       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14042       assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
   14043       DIP("%smovddup %s,%s\n",
   14044           isAvx ? "v" : "", dis_buf, nameXMMReg(rG));
   14045       delta += alen;
   14046    }
   14047    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   14048       ( rG, binop(Iop_64HLtoV128,mkexpr(d0),mkexpr(d0)) );
   14049    return delta;
   14050 }
   14051 
   14052 
   14053 static Long dis_MOVDDUP_256 ( VexAbiInfo* vbi, Prefix pfx,
   14054                               Long delta )
   14055 {
   14056    IRTemp addr   = IRTemp_INVALID;
   14057    Int    alen   = 0;
   14058    HChar  dis_buf[50];
   14059    IRTemp d0    = newTemp(Ity_I64);
   14060    IRTemp d1    = newTemp(Ity_I64);
   14061    UChar  modrm = getUChar(delta);
   14062    UInt   rG    = gregOfRexRM(pfx,modrm);
   14063    if (epartIsReg(modrm)) {
   14064       UInt rE = eregOfRexRM(pfx,modrm);
   14065       DIP("vmovddup %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
   14066       delta += 1;
   14067       assign ( d0, getYMMRegLane64(rE, 0) );
   14068       assign ( d1, getYMMRegLane64(rE, 2) );
   14069    } else {
   14070       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14071       assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
   14072       assign( d1, loadLE(Ity_I64, binop(Iop_Add64,
   14073                                         mkexpr(addr), mkU64(16))) );
   14074       DIP("vmovddup %s,%s\n", dis_buf, nameYMMReg(rG));
   14075       delta += alen;
   14076    }
   14077    putYMMRegLane64( rG, 0, mkexpr(d0) );
   14078    putYMMRegLane64( rG, 1, mkexpr(d0) );
   14079    putYMMRegLane64( rG, 2, mkexpr(d1) );
   14080    putYMMRegLane64( rG, 3, mkexpr(d1) );
   14081    return delta;
   14082 }
   14083 
   14084 
   14085 static Long dis_MOVSxDUP_128 ( VexAbiInfo* vbi, Prefix pfx,
   14086                                Long delta, Bool isAvx, Bool isL )
   14087 {
   14088    IRTemp addr  = IRTemp_INVALID;
   14089    Int    alen  = 0;
   14090    HChar  dis_buf[50];
   14091    IRTemp sV    = newTemp(Ity_V128);
   14092    UChar  modrm = getUChar(delta);
   14093    UInt   rG    = gregOfRexRM(pfx,modrm);
   14094    IRTemp s3, s2, s1, s0;
   14095    s3 = s2 = s1 = s0 = IRTemp_INVALID;
   14096    if (epartIsReg(modrm)) {
   14097       UInt rE = eregOfRexRM(pfx,modrm);
   14098       assign( sV, getXMMReg(rE) );
   14099       DIP("%smovs%cdup %s,%s\n",
   14100           isAvx ? "v" : "", isL ? 'l' : 'h', nameXMMReg(rE), nameXMMReg(rG));
   14101       delta += 1;
   14102    } else {
   14103       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14104       if (!isAvx)
   14105          gen_SEGV_if_not_16_aligned( addr );
   14106       assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   14107       DIP("%smovs%cdup %s,%s\n",
   14108           isAvx ? "v" : "", isL ? 'l' : 'h', dis_buf, nameXMMReg(rG));
   14109       delta += alen;
   14110    }
   14111    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   14112    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   14113       ( rG, isL ? mkV128from32s( s2, s2, s0, s0 )
   14114                 : mkV128from32s( s3, s3, s1, s1 ) );
   14115    return delta;
   14116 }
   14117 
   14118 
   14119 static Long dis_MOVSxDUP_256 ( VexAbiInfo* vbi, Prefix pfx,
   14120                                Long delta, Bool isL )
   14121 {
   14122    IRTemp addr  = IRTemp_INVALID;
   14123    Int    alen  = 0;
   14124    HChar  dis_buf[50];
   14125    IRTemp sV    = newTemp(Ity_V256);
   14126    UChar  modrm = getUChar(delta);
   14127    UInt   rG    = gregOfRexRM(pfx,modrm);
   14128    IRTemp s7, s6, s5, s4, s3, s2, s1, s0;
   14129    s7 = s6 = s5 = s4 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   14130    if (epartIsReg(modrm)) {
   14131       UInt rE = eregOfRexRM(pfx,modrm);
   14132       assign( sV, getYMMReg(rE) );
   14133       DIP("vmovs%cdup %s,%s\n",
   14134           isL ? 'l' : 'h', nameYMMReg(rE), nameYMMReg(rG));
   14135       delta += 1;
   14136    } else {
   14137       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14138       assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
   14139       DIP("vmovs%cdup %s,%s\n",
   14140           isL ? 'l' : 'h', dis_buf, nameYMMReg(rG));
   14141       delta += alen;
   14142    }
   14143    breakupV256to32s( sV, &s7, &s6, &s5, &s4, &s3, &s2, &s1, &s0 );
   14144    putYMMRegLane128( rG, 1, isL ? mkV128from32s( s6, s6, s4, s4 )
   14145                                 : mkV128from32s( s7, s7, s5, s5 ) );
   14146    putYMMRegLane128( rG, 0, isL ? mkV128from32s( s2, s2, s0, s0 )
   14147                                 : mkV128from32s( s3, s3, s1, s1 ) );
   14148    return delta;
   14149 }
   14150 
   14151 
   14152 static IRTemp math_HADDPS_128 ( IRTemp dV, IRTemp sV, Bool isAdd )
   14153 {
   14154    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   14155    IRTemp leftV  = newTemp(Ity_V128);
   14156    IRTemp rightV = newTemp(Ity_V128);
   14157    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   14158 
   14159    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   14160    breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   14161 
   14162    assign( leftV,  mkV128from32s( s2, s0, d2, d0 ) );
   14163    assign( rightV, mkV128from32s( s3, s1, d3, d1 ) );
   14164 
   14165    IRTemp res = newTemp(Ity_V128);
   14166    assign( res, binop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4,
   14167                               mkexpr(leftV), mkexpr(rightV) ) );
   14168    return res;
   14169 }
   14170 
   14171 
   14172 static IRTemp math_HADDPD_128 ( IRTemp dV, IRTemp sV, Bool isAdd )
   14173 {
   14174    IRTemp s1, s0, d1, d0;
   14175    IRTemp leftV  = newTemp(Ity_V128);
   14176    IRTemp rightV = newTemp(Ity_V128);
   14177    s1 = s0 = d1 = d0 = IRTemp_INVALID;
   14178 
   14179    breakupV128to64s( sV, &s1, &s0 );
   14180    breakupV128to64s( dV, &d1, &d0 );
   14181 
   14182    assign( leftV,  binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)) );
   14183    assign( rightV, binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1)) );
   14184 
   14185    IRTemp res = newTemp(Ity_V128);
   14186    assign( res, binop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2,
   14187                               mkexpr(leftV), mkexpr(rightV) ) );
   14188    return res;
   14189 }
   14190 
   14191 
   14192 __attribute__((noinline))
   14193 static
   14194 Long dis_ESC_0F__SSE3 ( Bool* decode_OK,
   14195                         VexAbiInfo* vbi,
   14196                         Prefix pfx, Int sz, Long deltaIN )
   14197 {
   14198    IRTemp addr  = IRTemp_INVALID;
   14199    UChar  modrm = 0;
   14200    Int    alen  = 0;
   14201    HChar  dis_buf[50];
   14202 
   14203    *decode_OK = False;
   14204 
   14205    Long   delta = deltaIN;
   14206    UChar  opc   = getUChar(delta);
   14207    delta++;
   14208    switch (opc) {
   14209 
   14210    case 0x12:
   14211       /* F3 0F 12 = MOVSLDUP -- move from E (mem or xmm) to G (xmm),
   14212          duplicating some lanes (2:2:0:0). */
   14213       if (haveF3no66noF2(pfx) && sz == 4) {
   14214          delta = dis_MOVSxDUP_128( vbi, pfx, delta, False/*!isAvx*/,
   14215                                    True/*isL*/ );
   14216          goto decode_success;
   14217       }
   14218       /* F2 0F 12 = MOVDDUP -- move from E (mem or xmm) to G (xmm),
   14219          duplicating some lanes (0:1:0:1). */
   14220       if (haveF2no66noF3(pfx)
   14221           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   14222          delta = dis_MOVDDUP_128( vbi, pfx, delta, False/*!isAvx*/ );
   14223          goto decode_success;
   14224       }
   14225       break;
   14226 
   14227    case 0x16:
   14228       /* F3 0F 16 = MOVSHDUP -- move from E (mem or xmm) to G (xmm),
   14229          duplicating some lanes (3:3:1:1). */
   14230       if (haveF3no66noF2(pfx) && sz == 4) {
   14231          delta = dis_MOVSxDUP_128( vbi, pfx, delta, False/*!isAvx*/,
   14232                                    False/*!isL*/ );
   14233          goto decode_success;
   14234       }
   14235       break;
   14236 
   14237    case 0x7C:
   14238    case 0x7D:
   14239       /* F2 0F 7C = HADDPS -- 32x4 add across from E (mem or xmm) to G (xmm). */
   14240       /* F2 0F 7D = HSUBPS -- 32x4 sub across from E (mem or xmm) to G (xmm). */
   14241       if (haveF2no66noF3(pfx) && sz == 4) {
   14242          IRTemp eV     = newTemp(Ity_V128);
   14243          IRTemp gV     = newTemp(Ity_V128);
   14244          Bool   isAdd  = opc == 0x7C;
   14245          HChar* str    = isAdd ? "add" : "sub";
   14246          modrm         = getUChar(delta);
   14247          UInt   rG     = gregOfRexRM(pfx,modrm);
   14248          if (epartIsReg(modrm)) {
   14249             UInt rE = eregOfRexRM(pfx,modrm);
   14250             assign( eV, getXMMReg(rE) );
   14251             DIP("h%sps %s,%s\n", str, nameXMMReg(rE), nameXMMReg(rG));
   14252             delta += 1;
   14253          } else {
   14254             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14255             assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   14256             DIP("h%sps %s,%s\n", str, dis_buf, nameXMMReg(rG));
   14257             delta += alen;
   14258          }
   14259 
   14260          assign( gV, getXMMReg(rG) );
   14261          putXMMReg( rG, mkexpr( math_HADDPS_128 ( gV, eV, isAdd ) ) );
   14262          goto decode_success;
   14263       }
   14264       /* 66 0F 7C = HADDPD -- 64x2 add across from E (mem or xmm) to G (xmm). */
   14265       /* 66 0F 7D = HSUBPD -- 64x2 sub across from E (mem or xmm) to G (xmm). */
   14266       if (have66noF2noF3(pfx) && sz == 2) {
   14267          IRTemp eV     = newTemp(Ity_V128);
   14268          IRTemp gV     = newTemp(Ity_V128);
   14269          Bool   isAdd  = opc == 0x7C;
   14270          HChar* str    = isAdd ? "add" : "sub";
   14271          modrm         = getUChar(delta);
   14272          UInt   rG     = gregOfRexRM(pfx,modrm);
   14273          if (epartIsReg(modrm)) {
   14274             UInt rE = eregOfRexRM(pfx,modrm);
   14275             assign( eV, getXMMReg(rE) );
   14276             DIP("h%spd %s,%s\n", str, nameXMMReg(rE), nameXMMReg(rG));
   14277             delta += 1;
   14278          } else {
   14279             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14280             assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   14281             DIP("h%spd %s,%s\n", str, dis_buf, nameXMMReg(rG));
   14282             delta += alen;
   14283          }
   14284 
   14285          assign( gV, getXMMReg(rG) );
   14286          putXMMReg( rG, mkexpr( math_HADDPD_128 ( gV, eV, isAdd ) ) );
   14287          goto decode_success;
   14288       }
   14289       break;
   14290 
   14291    case 0xD0:
   14292       /* 66 0F D0 = ADDSUBPD -- 64x4 +/- from E (mem or xmm) to G (xmm). */
   14293       if (have66noF2noF3(pfx) && sz == 2) {
   14294          IRTemp eV   = newTemp(Ity_V128);
   14295          IRTemp gV   = newTemp(Ity_V128);
   14296          modrm       = getUChar(delta);
   14297          UInt   rG   = gregOfRexRM(pfx,modrm);
   14298          if (epartIsReg(modrm)) {
   14299             UInt rE = eregOfRexRM(pfx,modrm);
   14300             assign( eV, getXMMReg(rE) );
   14301             DIP("addsubpd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   14302             delta += 1;
   14303          } else {
   14304             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14305             assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   14306             DIP("addsubpd %s,%s\n", dis_buf, nameXMMReg(rG));
   14307             delta += alen;
   14308          }
   14309 
   14310          assign( gV, getXMMReg(rG) );
   14311          putXMMReg( rG, mkexpr( math_ADDSUBPD_128 ( gV, eV ) ) );
   14312          goto decode_success;
   14313       }
   14314       /* F2 0F D0 = ADDSUBPS -- 32x4 +/-/+/- from E (mem or xmm) to G (xmm). */
   14315       if (haveF2no66noF3(pfx) && sz == 4) {
   14316          IRTemp eV   = newTemp(Ity_V128);
   14317          IRTemp gV   = newTemp(Ity_V128);
   14318          modrm       = getUChar(delta);
   14319          UInt   rG   = gregOfRexRM(pfx,modrm);
   14320 
   14321          modrm = getUChar(delta);
   14322          if (epartIsReg(modrm)) {
   14323             UInt rE = eregOfRexRM(pfx,modrm);
   14324             assign( eV, getXMMReg(rE) );
   14325             DIP("addsubps %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   14326             delta += 1;
   14327          } else {
   14328             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14329             assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   14330             DIP("addsubps %s,%s\n", dis_buf, nameXMMReg(rG));
   14331             delta += alen;
   14332          }
   14333 
   14334          assign( gV, getXMMReg(rG) );
   14335          putXMMReg( rG, mkexpr( math_ADDSUBPS_128 ( gV, eV ) ) );
   14336          goto decode_success;
   14337       }
   14338       break;
   14339 
   14340    case 0xF0:
   14341       /* F2 0F F0 = LDDQU -- move from E (mem or xmm) to G (xmm). */
   14342       if (haveF2no66noF3(pfx) && sz == 4) {
   14343          modrm = getUChar(delta);
   14344          if (epartIsReg(modrm)) {
   14345             goto decode_failure;
   14346          } else {
   14347             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14348             putXMMReg( gregOfRexRM(pfx,modrm),
   14349                        loadLE(Ity_V128, mkexpr(addr)) );
   14350             DIP("lddqu %s,%s\n", dis_buf,
   14351                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   14352             delta += alen;
   14353          }
   14354          goto decode_success;
   14355       }
   14356       break;
   14357 
   14358    default:
   14359       goto decode_failure;
   14360 
   14361    }
   14362 
   14363   decode_failure:
   14364    *decode_OK = False;
   14365    return deltaIN;
   14366 
   14367   decode_success:
   14368    *decode_OK = True;
   14369    return delta;
   14370 }
   14371 
   14372 
   14373 /*------------------------------------------------------------*/
   14374 /*---                                                      ---*/
   14375 /*--- Top-level SSSE3: dis_ESC_0F38__SupSSE3               ---*/
   14376 /*---                                                      ---*/
   14377 /*------------------------------------------------------------*/
   14378 
/* Compute the result of 128-bit PSHUFB: permute the 16 bytes of dV
   using the bottom 4 bits of each control byte in sV as a source-lane
   index, and force to zero any result lane whose control byte has bit
   7 set.  Since the available IROps work on 64-bit halves (Perm8x8
   only selects within one 64-bit value), each half of the result is
   built by permuting BOTH halves of dV and merging with a mask derived
   from bit 3 of the index, which selects hi-vs-lo source half.
   Returns a fresh V128 temp holding the result. */
static
IRTemp math_PSHUFB_XMM ( IRTemp dV/*data to perm*/, IRTemp sV/*perm*/ )
{
   IRTemp sHi        = newTemp(Ity_I64);
   IRTemp sLo        = newTemp(Ity_I64);
   IRTemp dHi        = newTemp(Ity_I64);
   IRTemp dLo        = newTemp(Ity_I64);
   IRTemp rHi        = newTemp(Ity_I64);
   IRTemp rLo        = newTemp(Ity_I64);
   IRTemp sevens     = newTemp(Ity_I64);
   IRTemp mask0x80hi = newTemp(Ity_I64);
   IRTemp mask0x80lo = newTemp(Ity_I64);
   IRTemp maskBit3hi = newTemp(Ity_I64);
   IRTemp maskBit3lo = newTemp(Ity_I64);
   IRTemp sAnd7hi    = newTemp(Ity_I64);
   IRTemp sAnd7lo    = newTemp(Ity_I64);
   IRTemp permdHi    = newTemp(Ity_I64);
   IRTemp permdLo    = newTemp(Ity_I64);
   IRTemp res        = newTemp(Ity_V128);

   /* Split both operands into 64-bit halves. */
   assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

   /* 0x07 in every byte lane, for masking indices down to 0..7. */
   assign( sevens, mkU64(0x0707070707070707ULL) );

   /* mask0x80hi = Not(SarN8x8(sHi,7))
      maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7)
      sAnd7hi    = And(sHi,sevens)
      permdHi    = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi),
      And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) )
      rHi        = And(permdHi,mask0x80hi)
   */
   /* Per byte: all-zeroes if bit 7 of the control byte is set (lane is
      to be zeroed), else all-ones. */
   assign(
      mask0x80hi,
      unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7))));

   /* Per byte: all-ones if bit 3 of the control byte is set (select
      from the high half of dV), else all-zeroes. */
   assign(
      maskBit3hi,
      binop(Iop_SarN8x8,
            binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)),
            mkU8(7)));

   assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens)));

   /* Permute both source halves by the 3-bit index, then merge using
      the bit-3 mask to pick hi-half vs lo-half results per lane. */
   assign(
      permdHi,
      binop(
         Iop_Or64,
         binop(Iop_And64,
               binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)),
               mkexpr(maskBit3hi)),
         binop(Iop_And64,
               binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)),
               unop(Iop_Not64,mkexpr(maskBit3hi))) ));

   /* Finally zero out lanes whose control byte had bit 7 set. */
   assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) );

   /* And the same for the lower half of the result.  What fun. */

   assign(
      mask0x80lo,
      unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7))));

   assign(
      maskBit3lo,
      binop(Iop_SarN8x8,
            binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)),
            mkU8(7)));

   assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens)));

   assign(
      permdLo,
      binop(
         Iop_Or64,
         binop(Iop_And64,
               binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)),
               mkexpr(maskBit3lo)),
         binop(Iop_And64,
               binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)),
               unop(Iop_Not64,mkexpr(maskBit3lo))) ));

   assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) );

   /* Reassemble the two halves into the V128 result. */
   assign(res, binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo)));
   return res;
}
   14468 
   14469 
/* Decode and emit IR for the 128-bit horizontal add/sub family
   PHADDW/PHADDD/PHADDSW/PHSUBW/PHSUBD/PHSUBSW (opc 0x01..0x03,
   0x05..0x07), in both the SSSE3 (isAvx == False, dst is also 1st src)
   and AVX (isAvx == True, 1st src is the VEX.vvvv register) forms.
   'delta' points at the modrm byte; returns the updated delta.  The
   non-AVX memory form gets a 16-alignment SEGV check; the AVX form
   zeroes the upper lanes of the destination YMM register. */
static Long dis_PHADD_128 ( VexAbiInfo* vbi, Prefix pfx, Long delta,
                            Bool isAvx, UChar opc )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   HChar* str    = "???";
   IROp   opV64  = Iop_INVALID;
   /* Default lane-pairing ops for the 16-bit variants; overridden
      below for the 32-bit variants. */
   IROp   opCatO = Iop_CatOddLanes16x4;
   IROp   opCatE = Iop_CatEvenLanes16x4;
   IRTemp sV     = newTemp(Ity_V128);
   IRTemp dV     = newTemp(Ity_V128);
   IRTemp sHi    = newTemp(Ity_I64);
   IRTemp sLo    = newTemp(Ity_I64);
   IRTemp dHi    = newTemp(Ity_I64);
   IRTemp dLo    = newTemp(Ity_I64);
   UChar  modrm  = getUChar(delta);
   UInt   rG     = gregOfRexRM(pfx,modrm);
   /* For AVX, the first source is VEX.vvvv; for SSSE3 it is rG itself. */
   UInt   rV     = isAvx ? getVexNvvvv(pfx) : rG;

   /* Select the 64-bit vector op and mnemonic suffix per opcode. */
   switch (opc) {
      case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
      case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
      case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
      case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
      case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
      case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
      default: vassert(0);
   }
   /* 32-bit variants (PHADDD/PHSUBD) pair lanes via interleave ops. */
   if (opc == 0x02 || opc == 0x06) {
      opCatO = Iop_InterleaveHI32x2;
      opCatE = Iop_InterleaveLO32x2;
   }

   assign( dV, getXMMReg(rV) );

   /* Fetch the E operand: register, or memory (aligned for non-AVX). */
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( sV, getXMMReg(rE) );
      DIP("ph%s %s,%s\n", str, nameXMMReg(rE), nameXMMReg(rG));
      delta += 1;
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      if (!isAvx)
         gen_SEGV_if_not_16_aligned( addr );
      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
      DIP("ph%s %s,%s\n", str, dis_buf, nameXMMReg(rG));
      delta += alen;
   }

   assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

   /* This isn't a particularly efficient way to compute the
      result, but at least it avoids a proliferation of IROps,
      hence avoids complication all the backends. */

   /* Result: hi half = op over s's lane pairs, lo half = op over d's
      lane pairs, each formed by combining even-lanes with odd-lanes. */
   (isAvx ? putYMMRegLoAndZU : putXMMReg)
      ( rG,
        binop(Iop_64HLtoV128,
              binop(opV64,
                    binop(opCatE,mkexpr(sHi),mkexpr(sLo)),
                    binop(opCatO,mkexpr(sHi),mkexpr(sLo)) ),
              binop(opV64,
                    binop(opCatE,mkexpr(dHi),mkexpr(dLo)),
                    binop(opCatO,mkexpr(dHi),mkexpr(dLo)) ) ) );
   return delta;
}
   14540 
   14541 
   14542 static IRTemp math_PMADDUBSW_128 ( IRTemp dV, IRTemp sV )
   14543 {
   14544    IRTemp sVoddsSX  = newTemp(Ity_V128);
   14545    IRTemp sVevensSX = newTemp(Ity_V128);
   14546    IRTemp dVoddsZX  = newTemp(Ity_V128);
   14547    IRTemp dVevensZX = newTemp(Ity_V128);
   14548    /* compute dV unsigned x sV signed */
   14549    assign( sVoddsSX, binop(Iop_SarN16x8, mkexpr(sV), mkU8(8)) );
   14550    assign( sVevensSX, binop(Iop_SarN16x8,
   14551                             binop(Iop_ShlN16x8, mkexpr(sV), mkU8(8)),
   14552                             mkU8(8)) );
   14553    assign( dVoddsZX, binop(Iop_ShrN16x8, mkexpr(dV), mkU8(8)) );
   14554    assign( dVevensZX, binop(Iop_ShrN16x8,
   14555                             binop(Iop_ShlN16x8, mkexpr(dV), mkU8(8)),
   14556                             mkU8(8)) );
   14557 
   14558    IRTemp res = newTemp(Ity_V128);
   14559    assign( res, binop(Iop_QAdd16Sx8,
   14560                       binop(Iop_Mul16x8, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
   14561                       binop(Iop_Mul16x8, mkexpr(sVevensSX), mkexpr(dVevensZX))
   14562                      )
   14563          );
   14564    return res;
   14565 }
   14566 
   14567 
   14568 __attribute__((noinline))
   14569 static
   14570 Long dis_ESC_0F38__SupSSE3 ( Bool* decode_OK,
   14571                              VexAbiInfo* vbi,
   14572                              Prefix pfx, Int sz, Long deltaIN )
   14573 {
   14574    IRTemp addr  = IRTemp_INVALID;
   14575    UChar  modrm = 0;
   14576    Int    alen  = 0;
   14577    HChar  dis_buf[50];
   14578 
   14579    *decode_OK = False;
   14580 
   14581    Long   delta = deltaIN;
   14582    UChar  opc   = getUChar(delta);
   14583    delta++;
   14584    switch (opc) {
   14585 
   14586    case 0x00:
   14587       /* 66 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x16 (XMM) */
   14588       if (have66noF2noF3(pfx)
   14589           && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
   14590          IRTemp sV = newTemp(Ity_V128);
   14591          IRTemp dV = newTemp(Ity_V128);
   14592 
   14593          modrm = getUChar(delta);
   14594          assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   14595 
   14596          if (epartIsReg(modrm)) {
   14597             assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   14598             delta += 1;
   14599             DIP("pshufb %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   14600                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   14601          } else {
   14602             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14603             gen_SEGV_if_not_16_aligned( addr );
   14604             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   14605             delta += alen;
   14606             DIP("pshufb %s,%s\n", dis_buf,
   14607                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   14608          }
   14609 
   14610          IRTemp res = math_PSHUFB_XMM( dV, sV );
   14611          putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(res));
   14612          goto decode_success;
   14613       }
   14614       /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */
   14615       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14616          IRTemp sV      = newTemp(Ity_I64);
   14617          IRTemp dV      = newTemp(Ity_I64);
   14618 
   14619          modrm = getUChar(delta);
   14620          do_MMX_preamble();
   14621          assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   14622 
   14623          if (epartIsReg(modrm)) {
   14624             assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   14625             delta += 1;
   14626             DIP("pshufb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
   14627                                   nameMMXReg(gregLO3ofRM(modrm)));
   14628          } else {
   14629             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14630             assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   14631             delta += alen;
   14632             DIP("pshufb %s,%s\n", dis_buf,
   14633                                   nameMMXReg(gregLO3ofRM(modrm)));
   14634          }
   14635 
   14636          putMMXReg(
   14637             gregLO3ofRM(modrm),
   14638             binop(
   14639                Iop_And64,
   14640                /* permute the lanes */
   14641                binop(
   14642                   Iop_Perm8x8,
   14643                   mkexpr(dV),
   14644                   binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL))
   14645                ),
   14646                /* mask off lanes which have (index & 0x80) == 0x80 */
   14647                unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7)))
   14648             )
   14649          );
   14650          goto decode_success;
   14651       }
   14652       break;
   14653 
   14654    case 0x01:
   14655    case 0x02:
   14656    case 0x03:
   14657    case 0x05:
   14658    case 0x06:
   14659    case 0x07:
   14660       /* 66 0F 38 01 = PHADDW -- 16x8 add across from E (mem or xmm) and
   14661          G to G (xmm). */
   14662       /* 66 0F 38 02 = PHADDD -- 32x4 add across from E (mem or xmm) and
   14663          G to G (xmm). */
   14664       /* 66 0F 38 03 = PHADDSW -- 16x8 signed qadd across from E (mem or
   14665          xmm) and G to G (xmm). */
   14666       /* 66 0F 38 05 = PHSUBW -- 16x8 sub across from E (mem or xmm) and
   14667          G to G (xmm). */
   14668       /* 66 0F 38 06 = PHSUBD -- 32x4 sub across from E (mem or xmm) and
   14669          G to G (xmm). */
   14670       /* 66 0F 38 07 = PHSUBSW -- 16x8 signed qsub across from E (mem or
   14671          xmm) and G to G (xmm). */
   14672       if (have66noF2noF3(pfx)
   14673           && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
   14674          delta = dis_PHADD_128( vbi, pfx, delta, False/*isAvx*/, opc );
   14675          goto decode_success;
   14676       }
   14677       /* ***--- these are MMX class insns introduced in SSSE3 ---*** */
   14678       /* 0F 38 01 = PHADDW -- 16x4 add across from E (mem or mmx) and G
   14679          to G (mmx). */
   14680       /* 0F 38 02 = PHADDD -- 32x2 add across from E (mem or mmx) and G
   14681          to G (mmx). */
   14682       /* 0F 38 03 = PHADDSW -- 16x4 signed qadd across from E (mem or
   14683          mmx) and G to G (mmx). */
   14684       /* 0F 38 05 = PHSUBW -- 16x4 sub across from E (mem or mmx) and G
   14685          to G (mmx). */
   14686       /* 0F 38 06 = PHSUBD -- 32x2 sub across from E (mem or mmx) and G
   14687          to G (mmx). */
   14688       /* 0F 38 07 = PHSUBSW -- 16x4 signed qsub across from E (mem or
   14689          mmx) and G to G (mmx). */
   14690       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14691          HChar* str    = "???";
   14692          IROp   opV64  = Iop_INVALID;
   14693          IROp   opCatO = Iop_CatOddLanes16x4;
   14694          IROp   opCatE = Iop_CatEvenLanes16x4;
   14695          IRTemp sV     = newTemp(Ity_I64);
   14696          IRTemp dV     = newTemp(Ity_I64);
   14697 
   14698          modrm = getUChar(delta);
   14699 
   14700          switch (opc) {
   14701             case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
   14702             case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
   14703             case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
   14704             case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
   14705             case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
   14706             case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
   14707             default: vassert(0);
   14708          }
   14709          if (opc == 0x02 || opc == 0x06) {
   14710             opCatO = Iop_InterleaveHI32x2;
   14711             opCatE = Iop_InterleaveLO32x2;
   14712          }
   14713 
   14714          do_MMX_preamble();
   14715          assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   14716 
   14717          if (epartIsReg(modrm)) {
   14718             assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   14719             delta += 1;
   14720             DIP("ph%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
   14721                                      nameMMXReg(gregLO3ofRM(modrm)));
   14722          } else {
   14723             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14724             assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   14725             delta += alen;
   14726             DIP("ph%s %s,%s\n", str, dis_buf,
   14727                                      nameMMXReg(gregLO3ofRM(modrm)));
   14728          }
   14729 
   14730          putMMXReg(
   14731             gregLO3ofRM(modrm),
   14732             binop(opV64,
   14733                   binop(opCatE,mkexpr(sV),mkexpr(dV)),
   14734                   binop(opCatO,mkexpr(sV),mkexpr(dV))
   14735             )
   14736          );
   14737          goto decode_success;
   14738       }
   14739       break;
   14740 
   14741    case 0x04:
   14742       /* 66 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
   14743          Unsigned Bytes (XMM) */
   14744       if (have66noF2noF3(pfx)
   14745           && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
   14746          IRTemp sV = newTemp(Ity_V128);
   14747          IRTemp dV = newTemp(Ity_V128);
   14748          modrm     = getUChar(delta);
   14749          UInt   rG = gregOfRexRM(pfx,modrm);
   14750 
   14751          assign( dV, getXMMReg(rG) );
   14752 
   14753          if (epartIsReg(modrm)) {
   14754             UInt rE = eregOfRexRM(pfx,modrm);
   14755             assign( sV, getXMMReg(rE) );
   14756             delta += 1;
   14757             DIP("pmaddubsw %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   14758          } else {
   14759             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14760             gen_SEGV_if_not_16_aligned( addr );
   14761             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   14762             delta += alen;
   14763             DIP("pmaddubsw %s,%s\n", dis_buf, nameXMMReg(rG));
   14764          }
   14765 
   14766          putXMMReg( rG, mkexpr( math_PMADDUBSW_128( dV, sV ) ) );
   14767          goto decode_success;
   14768       }
   14769       /* 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
   14770          Unsigned Bytes (MMX) */
   14771       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14772          IRTemp sV        = newTemp(Ity_I64);
   14773          IRTemp dV        = newTemp(Ity_I64);
   14774          IRTemp sVoddsSX  = newTemp(Ity_I64);
   14775          IRTemp sVevensSX = newTemp(Ity_I64);
   14776          IRTemp dVoddsZX  = newTemp(Ity_I64);
   14777          IRTemp dVevensZX = newTemp(Ity_I64);
   14778 
   14779          modrm = getUChar(delta);
   14780          do_MMX_preamble();
   14781          assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   14782 
   14783          if (epartIsReg(modrm)) {
   14784             assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   14785             delta += 1;
   14786             DIP("pmaddubsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
   14787                                      nameMMXReg(gregLO3ofRM(modrm)));
   14788          } else {
   14789             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14790             assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   14791             delta += alen;
   14792             DIP("pmaddubsw %s,%s\n", dis_buf,
   14793                                      nameMMXReg(gregLO3ofRM(modrm)));
   14794          }
   14795 
   14796          /* compute dV unsigned x sV signed */
   14797          assign( sVoddsSX,
   14798                  binop(Iop_SarN16x4, mkexpr(sV), mkU8(8)) );
   14799          assign( sVevensSX,
   14800                  binop(Iop_SarN16x4,
   14801                        binop(Iop_ShlN16x4, mkexpr(sV), mkU8(8)),
   14802                        mkU8(8)) );
   14803          assign( dVoddsZX,
   14804                  binop(Iop_ShrN16x4, mkexpr(dV), mkU8(8)) );
   14805          assign( dVevensZX,
   14806                  binop(Iop_ShrN16x4,
   14807                        binop(Iop_ShlN16x4, mkexpr(dV), mkU8(8)),
   14808                        mkU8(8)) );
   14809 
   14810          putMMXReg(
   14811             gregLO3ofRM(modrm),
   14812             binop(Iop_QAdd16Sx4,
   14813                   binop(Iop_Mul16x4, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
   14814                   binop(Iop_Mul16x4, mkexpr(sVevensSX), mkexpr(dVevensZX))
   14815             )
   14816          );
   14817          goto decode_success;
   14818       }
   14819       break;
   14820 
   14821    case 0x08:
   14822    case 0x09:
   14823    case 0x0A:
   14824       /* 66 0F 38 08 = PSIGNB -- Packed Sign 8x16 (XMM) */
   14825       /* 66 0F 38 09 = PSIGNW -- Packed Sign 16x8 (XMM) */
   14826       /* 66 0F 38 0A = PSIGND -- Packed Sign 32x4 (XMM) */
   14827       if (have66noF2noF3(pfx)
   14828           && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
   14829          IRTemp sV      = newTemp(Ity_V128);
   14830          IRTemp dV      = newTemp(Ity_V128);
   14831          IRTemp sHi     = newTemp(Ity_I64);
   14832          IRTemp sLo     = newTemp(Ity_I64);
   14833          IRTemp dHi     = newTemp(Ity_I64);
   14834          IRTemp dLo     = newTemp(Ity_I64);
   14835          HChar* str     = "???";
   14836          Int    laneszB = 0;
   14837 
   14838          switch (opc) {
   14839             case 0x08: laneszB = 1; str = "b"; break;
   14840             case 0x09: laneszB = 2; str = "w"; break;
   14841             case 0x0A: laneszB = 4; str = "d"; break;
   14842             default: vassert(0);
   14843          }
   14844 
   14845          modrm = getUChar(delta);
   14846          assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   14847 
   14848          if (epartIsReg(modrm)) {
   14849             assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   14850             delta += 1;
   14851             DIP("psign%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
   14852                                         nameXMMReg(gregOfRexRM(pfx,modrm)));
   14853          } else {
   14854             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14855             gen_SEGV_if_not_16_aligned( addr );
   14856             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   14857             delta += alen;
   14858             DIP("psign%s %s,%s\n", str, dis_buf,
   14859                                         nameXMMReg(gregOfRexRM(pfx,modrm)));
   14860          }
   14861 
   14862          assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   14863          assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   14864          assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   14865          assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   14866 
   14867          putXMMReg(
   14868             gregOfRexRM(pfx,modrm),
   14869             binop(Iop_64HLtoV128,
   14870                   dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
   14871                   dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
   14872             )
   14873          );
   14874          goto decode_success;
   14875       }
   14876       /* 0F 38 08 = PSIGNB -- Packed Sign 8x8  (MMX) */
   14877       /* 0F 38 09 = PSIGNW -- Packed Sign 16x4 (MMX) */
   14878       /* 0F 38 0A = PSIGND -- Packed Sign 32x2 (MMX) */
   14879       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14880          IRTemp sV      = newTemp(Ity_I64);
   14881          IRTemp dV      = newTemp(Ity_I64);
   14882          HChar* str     = "???";
   14883          Int    laneszB = 0;
   14884 
   14885          switch (opc) {
   14886             case 0x08: laneszB = 1; str = "b"; break;
   14887             case 0x09: laneszB = 2; str = "w"; break;
   14888             case 0x0A: laneszB = 4; str = "d"; break;
   14889             default: vassert(0);
   14890          }
   14891 
   14892          modrm = getUChar(delta);
   14893          do_MMX_preamble();
   14894          assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   14895 
   14896          if (epartIsReg(modrm)) {
   14897             assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   14898             delta += 1;
   14899             DIP("psign%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
   14900                                         nameMMXReg(gregLO3ofRM(modrm)));
   14901          } else {
   14902             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14903             assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   14904             delta += alen;
   14905             DIP("psign%s %s,%s\n", str, dis_buf,
   14906                                         nameMMXReg(gregLO3ofRM(modrm)));
   14907          }
   14908 
   14909          putMMXReg(
   14910             gregLO3ofRM(modrm),
   14911             dis_PSIGN_helper( mkexpr(sV), mkexpr(dV), laneszB )
   14912          );
   14913          goto decode_success;
   14914       }
   14915       break;
   14916 
   14917    case 0x0B:
   14918       /* 66 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and
   14919          Scale (XMM) */
   14920       if (have66noF2noF3(pfx)
   14921           && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
   14922          IRTemp sV  = newTemp(Ity_V128);
   14923          IRTemp dV  = newTemp(Ity_V128);
   14924          IRTemp sHi = newTemp(Ity_I64);
   14925          IRTemp sLo = newTemp(Ity_I64);
   14926          IRTemp dHi = newTemp(Ity_I64);
   14927          IRTemp dLo = newTemp(Ity_I64);
   14928 
   14929          modrm = getUChar(delta);
   14930          assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   14931 
   14932          if (epartIsReg(modrm)) {
   14933             assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   14934             delta += 1;
   14935             DIP("pmulhrsw %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   14936                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   14937          } else {
   14938             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14939             gen_SEGV_if_not_16_aligned( addr );
   14940             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   14941             delta += alen;
   14942             DIP("pmulhrsw %s,%s\n", dis_buf,
   14943                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   14944          }
   14945 
   14946          assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   14947          assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   14948          assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   14949          assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   14950 
   14951          putXMMReg(
   14952             gregOfRexRM(pfx,modrm),
   14953             binop(Iop_64HLtoV128,
   14954                   dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
   14955                   dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
   14956             )
   14957          );
   14958          goto decode_success;
   14959       }
   14960       /* 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and Scale
   14961          (MMX) */
   14962       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14963          IRTemp sV = newTemp(Ity_I64);
   14964          IRTemp dV = newTemp(Ity_I64);
   14965 
   14966          modrm = getUChar(delta);
   14967          do_MMX_preamble();
   14968          assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   14969 
   14970          if (epartIsReg(modrm)) {
   14971             assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   14972             delta += 1;
   14973             DIP("pmulhrsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
   14974                                     nameMMXReg(gregLO3ofRM(modrm)));
   14975          } else {
   14976             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14977             assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   14978             delta += alen;
   14979             DIP("pmulhrsw %s,%s\n", dis_buf,
   14980                                     nameMMXReg(gregLO3ofRM(modrm)));
   14981          }
   14982 
   14983          putMMXReg(
   14984             gregLO3ofRM(modrm),
   14985             dis_PMULHRSW_helper( mkexpr(sV), mkexpr(dV) )
   14986          );
   14987          goto decode_success;
   14988       }
   14989       break;
   14990 
   14991    case 0x1C:
   14992    case 0x1D:
   14993    case 0x1E:
   14994       /* 66 0F 38 1C = PABSB -- Packed Absolute Value 8x16 (XMM) */
   14995       /* 66 0F 38 1D = PABSW -- Packed Absolute Value 16x8 (XMM) */
   14996       /* 66 0F 38 1E = PABSD -- Packed Absolute Value 32x4 (XMM) */
   14997       if (have66noF2noF3(pfx)
   14998           && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
   14999          IRTemp sV  = newTemp(Ity_V128);
   15000          HChar* str = "???";
   15001          Int    laneszB = 0;
   15002 
   15003          switch (opc) {
   15004             case 0x1C: laneszB = 1; str = "b"; break;
   15005             case 0x1D: laneszB = 2; str = "w"; break;
   15006             case 0x1E: laneszB = 4; str = "d"; break;
   15007             default: vassert(0);
   15008          }
   15009 
   15010          modrm = getUChar(delta);
   15011          if (epartIsReg(modrm)) {
   15012             assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   15013             delta += 1;
   15014             DIP("pabs%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
   15015                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
   15016          } else {
   15017             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15018             gen_SEGV_if_not_16_aligned( addr );
   15019             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   15020             delta += alen;
   15021             DIP("pabs%s %s,%s\n", str, dis_buf,
   15022                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
   15023          }
   15024 
   15025          putXMMReg( gregOfRexRM(pfx,modrm),
   15026                     mkexpr(math_PABS_XMM(sV, laneszB)) );
   15027          goto decode_success;
   15028       }
   15029       /* 0F 38 1C = PABSB -- Packed Absolute Value 8x8  (MMX) */
   15030       /* 0F 38 1D = PABSW -- Packed Absolute Value 16x4 (MMX) */
   15031       /* 0F 38 1E = PABSD -- Packed Absolute Value 32x2 (MMX) */
   15032       if (haveNo66noF2noF3(pfx) && sz == 4) {
   15033          IRTemp sV      = newTemp(Ity_I64);
   15034          HChar* str     = "???";
   15035          Int    laneszB = 0;
   15036 
   15037          switch (opc) {
   15038             case 0x1C: laneszB = 1; str = "b"; break;
   15039             case 0x1D: laneszB = 2; str = "w"; break;
   15040             case 0x1E: laneszB = 4; str = "d"; break;
   15041             default: vassert(0);
   15042          }
   15043 
   15044          modrm = getUChar(delta);
   15045          do_MMX_preamble();
   15046 
   15047          if (epartIsReg(modrm)) {
   15048             assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   15049             delta += 1;
   15050             DIP("pabs%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
   15051                                        nameMMXReg(gregLO3ofRM(modrm)));
   15052          } else {
   15053             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15054             assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   15055             delta += alen;
   15056             DIP("pabs%s %s,%s\n", str, dis_buf,
   15057                                        nameMMXReg(gregLO3ofRM(modrm)));
   15058          }
   15059 
   15060          putMMXReg( gregLO3ofRM(modrm),
   15061                     mkexpr(math_PABS_MMX( sV, laneszB )) );
   15062          goto decode_success;
   15063       }
   15064       break;
   15065 
   15066    default:
   15067       break;
   15068 
   15069    }
   15070 
   15071   //decode_failure:
   15072    *decode_OK = False;
   15073    return deltaIN;
   15074 
   15075   decode_success:
   15076    *decode_OK = True;
   15077    return delta;
   15078 }
   15079 
   15080 
   15081 /*------------------------------------------------------------*/
   15082 /*---                                                      ---*/
   15083 /*--- Top-level SSSE3: dis_ESC_0F3A__SupSSE3               ---*/
   15084 /*---                                                      ---*/
   15085 /*------------------------------------------------------------*/
   15086 
   15087 __attribute__((noinline))
   15088 static
   15089 Long dis_ESC_0F3A__SupSSE3 ( Bool* decode_OK,
   15090                              VexAbiInfo* vbi,
   15091                              Prefix pfx, Int sz, Long deltaIN )
   15092 {
   15093    Long   d64   = 0;
   15094    IRTemp addr  = IRTemp_INVALID;
   15095    UChar  modrm = 0;
   15096    Int    alen  = 0;
   15097    HChar  dis_buf[50];
   15098 
   15099    *decode_OK = False;
   15100 
   15101    Long   delta = deltaIN;
   15102    UChar  opc   = getUChar(delta);
   15103    delta++;
   15104    switch (opc) {
   15105 
   15106    case 0x0F:
   15107       /* 66 0F 3A 0F = PALIGNR -- Packed Align Right (XMM) */
   15108       if (have66noF2noF3(pfx)
   15109           && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
   15110          IRTemp sV  = newTemp(Ity_V128);
   15111          IRTemp dV  = newTemp(Ity_V128);
   15112 
   15113          modrm = getUChar(delta);
   15114          assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   15115 
   15116          if (epartIsReg(modrm)) {
   15117             assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   15118             d64 = (Long)getUChar(delta+1);
   15119             delta += 1+1;
   15120             DIP("palignr $%d,%s,%s\n", (Int)d64,
   15121                                        nameXMMReg(eregOfRexRM(pfx,modrm)),
   15122                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
   15123          } else {
   15124             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
   15125             gen_SEGV_if_not_16_aligned( addr );
   15126             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   15127             d64 = (Long)getUChar(delta+alen);
   15128             delta += alen+1;
   15129             DIP("palignr $%d,%s,%s\n", (Int)d64,
   15130                                        dis_buf,
   15131                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
   15132          }
   15133 
   15134          IRTemp res = math_PALIGNR_XMM( sV, dV, d64 );
   15135          putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
   15136          goto decode_success;
   15137       }
   15138       /* 0F 3A 0F = PALIGNR -- Packed Align Right (MMX) */
   15139       if (haveNo66noF2noF3(pfx) && sz == 4) {
   15140          IRTemp sV  = newTemp(Ity_I64);
   15141          IRTemp dV  = newTemp(Ity_I64);
   15142          IRTemp res = newTemp(Ity_I64);
   15143 
   15144          modrm = getUChar(delta);
   15145          do_MMX_preamble();
   15146          assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   15147 
   15148          if (epartIsReg(modrm)) {
   15149             assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   15150             d64 = (Long)getUChar(delta+1);
   15151             delta += 1+1;
   15152             DIP("palignr $%d,%s,%s\n",  (Int)d64,
   15153                                         nameMMXReg(eregLO3ofRM(modrm)),
   15154                                         nameMMXReg(gregLO3ofRM(modrm)));
   15155          } else {
   15156             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
   15157             assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   15158             d64 = (Long)getUChar(delta+alen);
   15159             delta += alen+1;
   15160             DIP("palignr $%d%s,%s\n", (Int)d64,
   15161                                       dis_buf,
   15162                                       nameMMXReg(gregLO3ofRM(modrm)));
   15163          }
   15164 
   15165          if (d64 == 0) {
   15166             assign( res, mkexpr(sV) );
   15167          }
   15168          else if (d64 >= 1 && d64 <= 7) {
   15169             assign(res,
   15170                    binop(Iop_Or64,
   15171                          binop(Iop_Shr64, mkexpr(sV), mkU8(8*d64)),
   15172                          binop(Iop_Shl64, mkexpr(dV), mkU8(8*(8-d64))
   15173                         )));
   15174          }
   15175          else if (d64 == 8) {
   15176            assign( res, mkexpr(dV) );
   15177          }
   15178          else if (d64 >= 9 && d64 <= 15) {
   15179             assign( res, binop(Iop_Shr64, mkexpr(dV), mkU8(8*(d64-8))) );
   15180          }
   15181          else if (d64 >= 16 && d64 <= 255) {
   15182             assign( res, mkU64(0) );
   15183          }
   15184          else
   15185             vassert(0);
   15186 
   15187          putMMXReg( gregLO3ofRM(modrm), mkexpr(res) );
   15188          goto decode_success;
   15189       }
   15190       break;
   15191 
   15192    default:
   15193       break;
   15194 
   15195    }
   15196 
   15197   //decode_failure:
   15198    *decode_OK = False;
   15199    return deltaIN;
   15200 
   15201   decode_success:
   15202    *decode_OK = True;
   15203    return delta;
   15204 }
   15205 
   15206 
   15207 /*------------------------------------------------------------*/
   15208 /*---                                                      ---*/
   15209 /*--- Top-level SSE4: dis_ESC_0F__SSE4                     ---*/
   15210 /*---                                                      ---*/
   15211 /*------------------------------------------------------------*/
   15212 
/* Decode SSE4-era instructions in the plain 0F escape space:
   POPCNT (F3 0F B8) and LZCNT (F3 0F BD, AMD extension, gated on
   hwcaps).  On success sets *decode_OK and returns the updated delta;
   on failure returns deltaIN with *decode_OK == False. */
__attribute__((noinline))
static
Long dis_ESC_0F__SSE4 ( Bool* decode_OK,
                        VexArchInfo* archinfo,
                        VexAbiInfo* vbi,
                        Prefix pfx, Int sz, Long deltaIN )
{
   IRTemp addr  = IRTemp_INVALID;
   IRType ty    = Ity_INVALID;
   UChar  modrm = 0;
   Int    alen  = 0;
   HChar  dis_buf[50];

   *decode_OK = False;

   Long   delta = deltaIN;
   UChar  opc   = getUChar(delta);
   delta++;
   switch (opc) {

   case 0xB8:
      /* F3 0F B8  = POPCNT{W,L,Q}
         Count the number of 1 bits in a register
      */
      if (haveF3noF2(pfx) /* so both 66 and REX.W are possibilities */
          && (sz == 2 || sz == 4 || sz == 8)) {
         /*IRType*/ ty  = szToITy(sz);
         IRTemp     src = newTemp(ty);
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            assign(src, getIRegE(sz, pfx, modrm));
            delta += 1;
            DIP("popcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
                nameIRegG(sz, pfx, modrm));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0);
            assign(src, loadLE(ty, mkexpr(addr)));
            delta += alen;
            DIP("popcnt%c %s, %s\n", nameISize(sz), dis_buf,
                nameIRegG(sz, pfx, modrm));
         }

         IRTemp result = gen_POPCOUNT(ty, src);
         putIRegG(sz, pfx, modrm, mkexpr(result));

         // Update flags.  This is pretty lame .. perhaps can do better
         // if this turns out to be performance critical.
         // O S A C P are cleared.  Z is set if SRC == 0.
         // Done via the COPY thunk: DEP1 holds the literal flag bits.
         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
         stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
         stmt( IRStmt_Put( OFFB_CC_DEP1,
               binop(Iop_Shl64,
                     unop(Iop_1Uto64,
                          binop(Iop_CmpEQ64,
                                widenUto64(mkexpr(src)),
                                mkU64(0))),
                     mkU8(AMD64G_CC_SHIFT_Z))));

         goto decode_success;
      }
      break;

   case 0xBD:
      /* F3 0F BD -- LZCNT (count leading zeroes.  An AMD extension,
         which we can only decode if we're sure this is an AMD cpu
         that supports LZCNT, since otherwise it's BSR, which behaves
         differently.  Bizarrely, my Sandy Bridge also accepts these
         instructions but produces different results. */
      if (haveF3noF2(pfx) /* so both 66 and 48 are possibilities */
          && (sz == 2 || sz == 4 || sz == 8)
          && 0 != (archinfo->hwcaps & VEX_HWCAPS_AMD64_LZCNT)) {
         /*IRType*/ ty  = szToITy(sz);
         IRTemp     src = newTemp(ty);
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            assign(src, getIRegE(sz, pfx, modrm));
            delta += 1;
            DIP("lzcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
                nameIRegG(sz, pfx, modrm));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0);
            assign(src, loadLE(ty, mkexpr(addr)));
            delta += alen;
            DIP("lzcnt%c %s, %s\n", nameISize(sz), dis_buf,
                nameIRegG(sz, pfx, modrm));
         }

         IRTemp res = gen_LZCNT(ty, src);
         putIRegG(sz, pfx, modrm, mkexpr(res));

         // Update flags.  This is pretty lame .. perhaps can do better
         // if this turns out to be performance critical.
         // O S A P are cleared.  Z is set if RESULT == 0.
         // C is set if SRC is zero.
         // Widen src and result to 64 bits so the zero tests below can
         // use a single CmpEQ64 regardless of operand size.
         IRTemp src64 = newTemp(Ity_I64);
         IRTemp res64 = newTemp(Ity_I64);
         assign(src64, widenUto64(mkexpr(src)));
         assign(res64, widenUto64(mkexpr(res)));

         IRTemp oszacp = newTemp(Ity_I64);
         assign(
            oszacp,
            binop(Iop_Or64,
                  binop(Iop_Shl64,
                        unop(Iop_1Uto64,
                             binop(Iop_CmpEQ64, mkexpr(res64), mkU64(0))),
                        mkU8(AMD64G_CC_SHIFT_Z)),
                  binop(Iop_Shl64,
                        unop(Iop_1Uto64,
                             binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0))),
                        mkU8(AMD64G_CC_SHIFT_C))
            )
         );

         // COPY thunk: DEP1 carries the literal O/S/Z/A/C/P bits.
         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
         stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
         stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));

         goto decode_success;
      }
      break;

   default:
      break;

   }

  //decode_failure:
   *decode_OK = False;
   return deltaIN;

  decode_success:
   *decode_OK = True;
   return delta;
}
   15350 
   15351 
   15352 /*------------------------------------------------------------*/
   15353 /*---                                                      ---*/
   15354 /*--- Top-level SSE4: dis_ESC_0F38__SSE4                   ---*/
   15355 /*---                                                      ---*/
   15356 /*------------------------------------------------------------*/
   15357 
   15358 static IRTemp math_PBLENDVB_128 ( IRTemp vecE, IRTemp vecG,
   15359                                   IRTemp vec0/*controlling mask*/,
   15360                                   UInt gran, IROp opSAR )
   15361 {
   15362    /* The tricky bit is to convert vec0 into a suitable mask, by
   15363       copying the most significant bit of each lane into all positions
   15364       in the lane. */
   15365    IRTemp sh = newTemp(Ity_I8);
   15366    assign(sh, mkU8(8 * gran - 1));
   15367 
   15368    IRTemp mask = newTemp(Ity_V128);
   15369    assign(mask, binop(opSAR, mkexpr(vec0), mkexpr(sh)));
   15370 
   15371    IRTemp notmask = newTemp(Ity_V128);
   15372    assign(notmask, unop(Iop_NotV128, mkexpr(mask)));
   15373 
   15374    IRTemp res = newTemp(Ity_V128);
   15375    assign(res,  binop(Iop_OrV128,
   15376                       binop(Iop_AndV128, mkexpr(vecE), mkexpr(mask)),
   15377                       binop(Iop_AndV128, mkexpr(vecG), mkexpr(notmask))));
   15378    return res;
   15379 }
   15380 
   15381 static IRTemp math_PBLENDVB_256 ( IRTemp vecE, IRTemp vecG,
   15382                                   IRTemp vec0/*controlling mask*/,
   15383                                   UInt gran, IROp opSAR128 )
   15384 {
   15385    /* The tricky bit is to convert vec0 into a suitable mask, by
   15386       copying the most significant bit of each lane into all positions
   15387       in the lane. */
   15388    IRTemp sh = newTemp(Ity_I8);
   15389    assign(sh, mkU8(8 * gran - 1));
   15390 
   15391    IRTemp vec0Hi = IRTemp_INVALID;
   15392    IRTemp vec0Lo = IRTemp_INVALID;
   15393    breakupV256toV128s( vec0, &vec0Hi, &vec0Lo );
   15394 
   15395    IRTemp mask = newTemp(Ity_V256);
   15396    assign(mask, binop(Iop_V128HLtoV256,
   15397                       binop(opSAR128, mkexpr(vec0Hi), mkexpr(sh)),
   15398                       binop(opSAR128, mkexpr(vec0Lo), mkexpr(sh))));
   15399 
   15400    IRTemp notmask = newTemp(Ity_V256);
   15401    assign(notmask, unop(Iop_NotV256, mkexpr(mask)));
   15402 
   15403    IRTemp res = newTemp(Ity_V256);
   15404    assign(res,  binop(Iop_OrV256,
   15405                       binop(Iop_AndV256, mkexpr(vecE), mkexpr(mask)),
   15406                       binop(Iop_AndV256, mkexpr(vecG), mkexpr(notmask))));
   15407    return res;
   15408 }
   15409 
   15410 static Long dis_VBLENDV_128 ( VexAbiInfo* vbi, Prefix pfx, Long delta,
   15411                               const HChar *name, UInt gran, IROp opSAR )
   15412 {
   15413    IRTemp addr   = IRTemp_INVALID;
   15414    Int    alen   = 0;
   15415    HChar  dis_buf[50];
   15416    UChar  modrm  = getUChar(delta);
   15417    UInt   rG     = gregOfRexRM(pfx, modrm);
   15418    UInt   rV     = getVexNvvvv(pfx);
   15419    UInt   rIS4   = 0xFF; /* invalid */
   15420    IRTemp vecE   = newTemp(Ity_V128);
   15421    IRTemp vecV   = newTemp(Ity_V128);
   15422    IRTemp vecIS4 = newTemp(Ity_V128);
   15423    if (epartIsReg(modrm)) {
   15424       delta++;
   15425       UInt rE = eregOfRexRM(pfx, modrm);
   15426       assign(vecE, getXMMReg(rE));
   15427       UChar ib = getUChar(delta);
   15428       rIS4 = (ib >> 4) & 0xF;
   15429       DIP("%s %s,%s,%s,%s\n",
   15430           name, nameXMMReg(rIS4), nameXMMReg(rE),
   15431           nameXMMReg(rV), nameXMMReg(rG));
   15432    } else {
   15433       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   15434       delta += alen;
   15435       assign(vecE, loadLE(Ity_V128, mkexpr(addr)));
   15436       UChar ib = getUChar(delta);
   15437       rIS4 = (ib >> 4) & 0xF;
   15438       DIP("%s %s,%s,%s,%s\n",
   15439           name, nameXMMReg(rIS4), dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   15440    }
   15441    delta++;
   15442    assign(vecV,   getXMMReg(rV));
   15443    assign(vecIS4, getXMMReg(rIS4));
   15444    IRTemp res = math_PBLENDVB_128( vecE, vecV, vecIS4, gran, opSAR );
   15445    putYMMRegLoAndZU( rG, mkexpr(res) );
   15446    return delta;
   15447 }
   15448 
/* Decode a 256-bit VBLENDV{B,PS,PD}.  Same shape as dis_VBLENDV_128:
   the fourth (mask) register is encoded in the top 4 bits of the
   trailing immediate byte.  Returns the updated delta. */
static Long dis_VBLENDV_256 ( VexAbiInfo* vbi, Prefix pfx, Long delta,
                              const HChar *name, UInt gran, IROp opSAR128 )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   UChar  modrm  = getUChar(delta);
   UInt   rG     = gregOfRexRM(pfx, modrm);
   UInt   rV     = getVexNvvvv(pfx);
   UInt   rIS4   = 0xFF; /* invalid */
   IRTemp vecE   = newTemp(Ity_V256);
   IRTemp vecV   = newTemp(Ity_V256);
   IRTemp vecIS4 = newTemp(Ity_V256);
   if (epartIsReg(modrm)) {
      delta++;
      UInt rE = eregOfRexRM(pfx, modrm);
      assign(vecE, getYMMReg(rE));
      /* is4 register number lives in the top nibble of the imm byte */
      UChar ib = getUChar(delta);
      rIS4 = (ib >> 4) & 0xF;
      DIP("%s %s,%s,%s,%s\n",
          name, nameYMMReg(rIS4), nameYMMReg(rE),
          nameYMMReg(rV), nameYMMReg(rG));
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
      delta += alen;
      assign(vecE, loadLE(Ity_V256, mkexpr(addr)));
      UChar ib = getUChar(delta);
      rIS4 = (ib >> 4) & 0xF;
      DIP("%s %s,%s,%s,%s\n",
          name, nameYMMReg(rIS4), dis_buf, nameYMMReg(rV), nameYMMReg(rG));
   }
   delta++;  /* step over the immediate byte */
   assign(vecV,   getYMMReg(rV));
   assign(vecIS4, getYMMReg(rIS4));
   IRTemp res = math_PBLENDVB_256( vecE, vecV, vecIS4, gran, opSAR128 );
   putYMMReg( rG, mkexpr(res) );
   return delta;
}
   15487 
/* Common flag-setting tail for PTEST/VTESTPS/VTESTPD.  Takes the two
   precomputed 128-bit values andV = vecE & vecG and
   andnV = vecE & ~vecG, and 'sign' which selects the variant:
   0 for PTEST (whole-vector test), 32 for VTESTPS (test bit 31 of
   each lane) and 64 for VTESTPD (test bit 63 of each lane). */
static void finish_xTESTy ( IRTemp andV, IRTemp andnV, Int sign )
{
   /* Set Z=1 iff (vecE & vecG) == 0
      Set C=1 iff (vecE & not vecG) == 0
   */

   /* andV, andnV:  vecE & vecG,  vecE and not(vecG) */

   /* andV resp. andnV, reduced to 64-bit values, by or-ing the top
      and bottom 64-bits together.  It relies on this trick:

      InterleaveLO64x2([a,b],[c,d]) == [b,d]    hence

      InterleaveLO64x2([a,b],[a,b]) == [b,b]    and similarly
      InterleaveHI64x2([a,b],[a,b]) == [a,a]

      and so the OR of the above 2 exprs produces
      [a OR b, a OR b], from which we simply take the lower half.
   */
   IRTemp and64  = newTemp(Ity_I64);
   IRTemp andn64 = newTemp(Ity_I64);

   assign(and64,
          unop(Iop_V128to64,
               binop(Iop_OrV128,
                     binop(Iop_InterleaveLO64x2,
                           mkexpr(andV), mkexpr(andV)),
                     binop(Iop_InterleaveHI64x2,
                           mkexpr(andV), mkexpr(andV)))));

   assign(andn64,
          unop(Iop_V128to64,
               binop(Iop_OrV128,
                     binop(Iop_InterleaveLO64x2,
                           mkexpr(andnV), mkexpr(andnV)),
                     binop(Iop_InterleaveHI64x2,
                           mkexpr(andnV), mkexpr(andnV)))));

   /* z64/c64 end up as all-zeroes (flag should be 1) or all-ones
      (flag should be 0); they are inverted because the Not64 below
      folds into the final masking step. */
   IRTemp z64 = newTemp(Ity_I64);
   IRTemp c64 = newTemp(Ity_I64);
   if (sign == 64) {
      /* When only interested in the most significant bit, just shift
         arithmetically right and negate.  */
      assign(z64,
             unop(Iop_Not64,
                  binop(Iop_Sar64, mkexpr(and64), mkU8(63))));

      assign(c64,
             unop(Iop_Not64,
                  binop(Iop_Sar64, mkexpr(andn64), mkU8(63))));
   } else {
      if (sign == 32) {
         /* When interested in bit 31 and bit 63, mask those bits and
            fallthrough into the PTEST handling.  */
         IRTemp t0 = newTemp(Ity_I64);
         IRTemp t1 = newTemp(Ity_I64);
         IRTemp t2 = newTemp(Ity_I64);
         assign(t0, mkU64(0x8000000080000000ULL));
         assign(t1, binop(Iop_And64, mkexpr(and64), mkexpr(t0)));
         assign(t2, binop(Iop_And64, mkexpr(andn64), mkexpr(t0)));
         and64 = t1;
         andn64 = t2;
      }
      /* Now convert and64, andn64 to all-zeroes or all-1s, so we can
         slice out the Z and C bits conveniently.  We use the standard
         trick all-zeroes -> all-zeroes, anything-else -> all-ones
         done by "(x | -x) >>s (word-size - 1)".
      */
      assign(z64,
             unop(Iop_Not64,
                  binop(Iop_Sar64,
                        binop(Iop_Or64,
                              binop(Iop_Sub64, mkU64(0), mkexpr(and64)),
                                    mkexpr(and64)), mkU8(63))));

      assign(c64,
             unop(Iop_Not64,
                  binop(Iop_Sar64,
                        binop(Iop_Or64,
                              binop(Iop_Sub64, mkU64(0), mkexpr(andn64)),
                                    mkexpr(andn64)), mkU8(63))));
   }

   /* And finally, slice out the Z and C flags and set the flags
      thunk to COPY for them.  OSAP are set to zero. */
   IRTemp newOSZACP = newTemp(Ity_I64);
   assign(newOSZACP,
          binop(Iop_Or64,
                binop(Iop_And64, mkexpr(z64), mkU64(AMD64G_CC_MASK_Z)),
                binop(Iop_And64, mkexpr(c64), mkU64(AMD64G_CC_MASK_C))));

   stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(newOSZACP)));
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
}
   15584 
   15585 
/* Handles 128 bit versions of PTEST, VTESTPS or VTESTPD.
   sign is 0 for PTEST insn, 32 for VTESTPS and 64 for VTESTPD.
   Computes vecE & vecG and vecE & ~vecG and hands them to
   finish_xTESTy to set the Z/C flags.  Returns the updated delta. */
static Long dis_xTESTy_128 ( VexAbiInfo* vbi, Prefix pfx,
                             Long delta, Bool isAvx, Int sign )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   UChar  modrm  = getUChar(delta);
   UInt   rG     = gregOfRexRM(pfx, modrm);
   IRTemp vecE = newTemp(Ity_V128);
   IRTemp vecG = newTemp(Ity_V128);

   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign(vecE, getXMMReg(rE));
      delta += 1;
      DIP( "%s%stest%s %s,%s\n",
           isAvx ? "v" : "", sign == 0 ? "p" : "",
           sign == 0 ? "" : sign == 32 ? "ps" : "pd",
           nameXMMReg(rE), nameXMMReg(rG) );
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      /* Only the legacy-SSE form requires 16-byte alignment. */
      if (!isAvx)
         gen_SEGV_if_not_16_aligned( addr );
      assign(vecE, loadLE( Ity_V128, mkexpr(addr) ));
      delta += alen;
      DIP( "%s%stest%s %s,%s\n",
           isAvx ? "v" : "", sign == 0 ? "p" : "",
           sign == 0 ? "" : sign == 32 ? "ps" : "pd",
           dis_buf, nameXMMReg(rG) );
   }

   assign(vecG, getXMMReg(rG));

   /* Set Z=1 iff (vecE & vecG) == 0
      Set C=1 iff (vecE & not vecG) == 0
   */

   /* andV, andnV:  vecE & vecG,  vecE and not(vecG) */
   IRTemp andV  = newTemp(Ity_V128);
   IRTemp andnV = newTemp(Ity_V128);
   assign(andV,  binop(Iop_AndV128, mkexpr(vecE), mkexpr(vecG)));
   /* XOR with all-1s (mkV128(0xFFFF) = 16 bytes of 0xFF) == NOT */
   assign(andnV, binop(Iop_AndV128,
                       mkexpr(vecE),
                       binop(Iop_XorV128, mkexpr(vecG),
                                          mkV128(0xFFFF))));

   finish_xTESTy ( andV, andnV, sign );
   return delta;
}
   15637 
   15638 
/* Handles 256 bit versions of PTEST, VTESTPS or VTESTPD.
   sign is 0 for PTEST insn, 32 for VTESTPS and 64 for VTESTPD.
   Computes vecE & vecG and vecE & ~vecG, folds each 256-bit result
   down to 128 bits by OR-ing its halves (safe, since only "is it all
   zeroes" information is needed), then defers to finish_xTESTy. */
static Long dis_xTESTy_256 ( VexAbiInfo* vbi, Prefix pfx,
                             Long delta, Int sign )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   UChar  modrm  = getUChar(delta);
   UInt   rG     = gregOfRexRM(pfx, modrm);
   IRTemp vecE   = newTemp(Ity_V256);
   IRTemp vecG   = newTemp(Ity_V256);

   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign(vecE, getYMMReg(rE));
      delta += 1;
      DIP( "v%stest%s %s,%s\n", sign == 0 ? "p" : "",
           sign == 0 ? "" : sign == 32 ? "ps" : "pd",
           nameYMMReg(rE), nameYMMReg(rG) );
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign(vecE, loadLE( Ity_V256, mkexpr(addr) ));
      delta += alen;
      DIP( "v%stest%s %s,%s\n", sign == 0 ? "p" : "",
           sign == 0 ? "" : sign == 32 ? "ps" : "pd",
           dis_buf, nameYMMReg(rG) );
   }

   assign(vecG, getYMMReg(rG));

   /* Set Z=1 iff (vecE & vecG) == 0
      Set C=1 iff (vecE & not vecG) == 0
   */

   /* andV, andnV:  vecE & vecG,  vecE and not(vecG) */
   IRTemp andV  = newTemp(Ity_V256);
   IRTemp andnV = newTemp(Ity_V256);
   assign(andV,  binop(Iop_AndV256, mkexpr(vecE), mkexpr(vecG)));
   assign(andnV, binop(Iop_AndV256,
                       mkexpr(vecE), unop(Iop_NotV256, mkexpr(vecG))));

   /* Reduce each 256-bit value to 128 bits by OR-ing the halves. */
   IRTemp andVhi  = IRTemp_INVALID;
   IRTemp andVlo  = IRTemp_INVALID;
   IRTemp andnVhi = IRTemp_INVALID;
   IRTemp andnVlo = IRTemp_INVALID;
   breakupV256toV128s( andV, &andVhi, &andVlo );
   breakupV256toV128s( andnV, &andnVhi, &andnVlo );

   IRTemp andV128  = newTemp(Ity_V128);
   IRTemp andnV128 = newTemp(Ity_V128);
   assign( andV128, binop( Iop_OrV128, mkexpr(andVhi), mkexpr(andVlo) ) );
   assign( andnV128, binop( Iop_OrV128, mkexpr(andnVhi), mkexpr(andnVlo) ) );

   finish_xTESTy ( andV128, andnV128, sign );
   return delta;
}
   15696 
   15697 
/* Handles 128 bit versions of PMOVZXBW and PMOVSXBW: widen the low
   8 bytes of the source (xmm reg or m64) into 8 words of the
   destination xmm register, zero-extending if xIsZ, else
   sign-extending.  Returns the updated instruction-stream offset
   (delta). */
static Long dis_PMOVxXBW_128 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx, Bool xIsZ )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   IRTemp srcVec = newTemp(Ity_V128);
   UChar  modrm  = getUChar(delta);
   UChar* mbV    = isAvx ? "v" : "";   /* mnemonic prefix for AVX form */
   UChar  how    = xIsZ ? 'z' : 's';   /* 'z'ero or 's'ign extend, for DIP */
   UInt   rG     = gregOfRexRM(pfx, modrm);
   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcVec, getXMMReg(rE) );
      delta += 1;
      DIP( "%spmov%cxbw %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
   } else {
      /* Memory source: only the low 64 bits are consumed. */
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcVec,
              unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
      delta += alen;
      DIP( "%spmov%cxbw %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
   }

   /* Interleaving with zero places each source byte in the low half
      of a 16-bit lane (zero extension).  For sign extension, do the
      same and then shift each lane left and arithmetically right by
      8, replicating each byte's sign bit into the upper half. */
   IRExpr* res
      = xIsZ /* do math for either zero or sign extend */
        ? binop( Iop_InterleaveLO8x16,
                 IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) )
        : binop( Iop_SarN16x8,
                 binop( Iop_ShlN16x8,
                        binop( Iop_InterleaveLO8x16,
                               IRExpr_Const( IRConst_V128(0) ),
                               mkexpr(srcVec) ),
                        mkU8(8) ),
                 mkU8(8) );

   /* For AVX, writing the low 128 bits also zeroes the upper YMM lane. */
   (isAvx ? putYMMRegLoAndZU : putXMMReg) ( rG, res );

   return delta;
}
   15739 
   15740 
   15741 static Long dis_PMOVxXWD_128 ( VexAbiInfo* vbi, Prefix pfx,
   15742                                Long delta, Bool isAvx, Bool xIsZ )
   15743 {
   15744    IRTemp addr   = IRTemp_INVALID;
   15745    Int    alen   = 0;
   15746    HChar  dis_buf[50];
   15747    IRTemp srcVec = newTemp(Ity_V128);
   15748    UChar  modrm  = getUChar(delta);
   15749    UChar* mbV    = isAvx ? "v" : "";
   15750    UChar  how    = xIsZ ? 'z' : 's';
   15751    UInt   rG     = gregOfRexRM(pfx, modrm);
   15752 
   15753    if ( epartIsReg(modrm) ) {
   15754       UInt rE = eregOfRexRM(pfx, modrm);
   15755       assign( srcVec, getXMMReg(rE) );
   15756       delta += 1;
   15757       DIP( "%spmov%cxwd %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
   15758    } else {
   15759       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   15760       assign( srcVec,
   15761               unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
   15762       delta += alen;
   15763       DIP( "%spmov%cxwd %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
   15764    }
   15765 
   15766    IRExpr* res
   15767       = binop( Iop_InterleaveLO16x8,
   15768                IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) );
   15769    if (!xIsZ)
   15770       res = binop(Iop_SarN32x4,
   15771                   binop(Iop_ShlN32x4, res, mkU8(16)), mkU8(16));
   15772 
   15773    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   15774       ( gregOfRexRM(pfx, modrm), res );
   15775 
   15776    return delta;
   15777 }
   15778 
   15779 
   15780 static Long dis_PMOVSXWQ_128 ( VexAbiInfo* vbi, Prefix pfx,
   15781                                Long delta, Bool isAvx )
   15782 {
   15783    IRTemp addr     = IRTemp_INVALID;
   15784    Int    alen     = 0;
   15785    HChar  dis_buf[50];
   15786    IRTemp srcBytes = newTemp(Ity_I32);
   15787    UChar  modrm    = getUChar(delta);
   15788    UChar* mbV      = isAvx ? "v" : "";
   15789    UInt   rG       = gregOfRexRM(pfx, modrm);
   15790 
   15791    if ( epartIsReg( modrm ) ) {
   15792       UInt rE = eregOfRexRM(pfx, modrm);
   15793       assign( srcBytes, getXMMRegLane32( rE, 0 ) );
   15794       delta += 1;
   15795       DIP( "%spmovsxwq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
   15796    } else {
   15797       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   15798       assign( srcBytes, loadLE( Ity_I32, mkexpr(addr) ) );
   15799       delta += alen;
   15800       DIP( "%spmovsxwq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
   15801    }
   15802 
   15803    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   15804       ( rG, binop( Iop_64HLtoV128,
   15805                    unop( Iop_16Sto64,
   15806                          unop( Iop_32HIto16, mkexpr(srcBytes) ) ),
   15807                    unop( Iop_16Sto64,
   15808                          unop( Iop_32to16, mkexpr(srcBytes) ) ) ) );
   15809    return delta;
   15810 }
   15811 
   15812 
/* PMOVZXWQ xmm,xmm/m32: zero-extend the two low 16-bit words of the
   source to the two 64-bit lanes of the destination xmm register.
   Returns the updated delta. */
static Long dis_PMOVZXWQ_128 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx )
{
   IRTemp addr     = IRTemp_INVALID;
   Int    alen     = 0;
   HChar  dis_buf[50];
   IRTemp srcVec = newTemp(Ity_V128);
   UChar  modrm    = getUChar(delta);
   UChar* mbV      = isAvx ? "v" : "";
   UInt   rG       = gregOfRexRM(pfx, modrm);

   if ( epartIsReg( modrm ) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcVec, getXMMReg(rE) );
      delta += 1;
      DIP( "%spmovzxwq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
   } else {
      /* Memory source: only 32 bits (two words) are read. */
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcVec,
              unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) ) ) );
      delta += alen;
      DIP( "%spmovzxwq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
   }

   IRTemp zeroVec = newTemp( Ity_V128 );
   assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );

   /* Two rounds of interleaving with zero widen each of the two low
      source words to a zero-extended 64-bit lane.  For AVX, the
      write also zeroes the upper YMM lane. */
   (isAvx ? putYMMRegLoAndZU : putXMMReg)
      ( rG, binop( Iop_InterleaveLO16x8,
                   mkexpr(zeroVec),
                   binop( Iop_InterleaveLO16x8,
                          mkexpr(zeroVec), mkexpr(srcVec) ) ) );
   return delta;
}
   15847 
   15848 
/* Handles 128 bit versions of PMOVZXDQ and PMOVSXDQ: widen the two
   low 32-bit lanes of the source (xmm reg or m64) to the two 64-bit
   lanes of the destination xmm register, zero-extending if xIsZ,
   else sign-extending.  Returns the updated delta. */
static Long dis_PMOVxXDQ_128 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx, Bool xIsZ )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   IRTemp srcI64 = newTemp(Ity_I64);
   IRTemp srcVec = newTemp(Ity_V128);
   UChar  modrm  = getUChar(delta);
   UChar* mbV    = isAvx ? "v" : "";   /* mnemonic prefix for AVX form */
   UChar  how    = xIsZ ? 'z' : 's';   /* 'z'ero or 's'ign extend, for DIP */
   UInt   rG     = gregOfRexRM(pfx, modrm);
   /* Compute both srcI64 -- the value to expand -- and srcVec -- same
      thing in a V128, with arbitrary junk in the top 64 bits.  Use
      one or both of them and let iropt clean up afterwards (as
      usual). */
   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcVec, getXMMReg(rE) );
      assign( srcI64, unop(Iop_V128to64, mkexpr(srcVec)) );
      delta += 1;
      DIP( "%spmov%cxdq %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcI64, loadLE(Ity_I64, mkexpr(addr)) );
      assign( srcVec, unop( Iop_64UtoV128, mkexpr(srcI64)) );
      delta += alen;
      DIP( "%spmov%cxdq %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
   }

   /* Zero extension: interleave the low dwords with zero.  Sign
      extension: widen each 32-bit half of srcI64 individually and
      reassemble into a V128. */
   IRExpr* res
      = xIsZ /* do math for either zero or sign extend */
        ? binop( Iop_InterleaveLO32x4,
                 IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) )
        : binop( Iop_64HLtoV128,
                 unop( Iop_32Sto64,
                       unop( Iop_64HIto32, mkexpr(srcI64) ) ),
                 unop( Iop_32Sto64,
                       unop( Iop_64to32, mkexpr(srcI64) ) ) );

   /* For AVX, writing the low 128 bits also zeroes the upper YMM lane. */
   (isAvx ? putYMMRegLoAndZU : putXMMReg) ( rG, res );

   return delta;
}
   15894 
   15895 
/* Handles 128 bit versions of PMOVZXBD and PMOVSXBD: widen the four
   low bytes of the source (xmm reg or m32) to the four 32-bit lanes
   of the destination xmm register, zero-extending if xIsZ, else
   sign-extending.  Returns the updated delta. */
static Long dis_PMOVxXBD_128 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx, Bool xIsZ )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   IRTemp srcVec = newTemp(Ity_V128);
   UChar  modrm  = getUChar(delta);
   UChar* mbV    = isAvx ? "v" : "";   /* mnemonic prefix for AVX form */
   UChar  how    = xIsZ ? 'z' : 's';   /* 'z'ero or 's'ign extend, for DIP */
   UInt   rG     = gregOfRexRM(pfx, modrm);
   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcVec, getXMMReg(rE) );
      delta += 1;
      DIP( "%spmov%cxbd %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
   } else {
      /* Memory source: only 32 bits (four bytes) are read. */
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcVec,
              unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) ) ) );
      delta += alen;
      DIP( "%spmov%cxbd %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
   }

   IRTemp zeroVec = newTemp(Ity_V128);
   assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );

   /* Two rounds of interleaving with zero give the zero extension of
      each low byte to 32 bits; for the signed case, shift each lane
      left and arithmetically right by 24 to propagate the sign. */
   IRExpr* res
      = binop(Iop_InterleaveLO8x16,
              mkexpr(zeroVec),
              binop(Iop_InterleaveLO8x16,
                    mkexpr(zeroVec), mkexpr(srcVec)));
   if (!xIsZ)
      res = binop(Iop_SarN32x4,
                  binop(Iop_ShlN32x4, res, mkU8(24)), mkU8(24));

   /* For AVX, writing the low 128 bits also zeroes the upper YMM lane. */
   (isAvx ? putYMMRegLoAndZU : putXMMReg) ( rG, res );

   return delta;
}
   15937 
   15938 
   15939 /* Handles 128 bit versions of PMOVSXBQ. */
   15940 static Long dis_PMOVSXBQ_128 ( VexAbiInfo* vbi, Prefix pfx,
   15941                                Long delta, Bool isAvx )
   15942 {
   15943    IRTemp addr     = IRTemp_INVALID;
   15944    Int    alen     = 0;
   15945    HChar  dis_buf[50];
   15946    IRTemp srcBytes = newTemp(Ity_I16);
   15947    UChar  modrm    = getUChar(delta);
   15948    UChar* mbV      = isAvx ? "v" : "";
   15949    UInt   rG       = gregOfRexRM(pfx, modrm);
   15950    if ( epartIsReg(modrm) ) {
   15951       UInt rE = eregOfRexRM(pfx, modrm);
   15952       assign( srcBytes, getXMMRegLane16( rE, 0 ) );
   15953       delta += 1;
   15954       DIP( "%spmovsxbq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
   15955    } else {
   15956       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   15957       assign( srcBytes, loadLE( Ity_I16, mkexpr(addr) ) );
   15958       delta += alen;
   15959       DIP( "%spmovsxbq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
   15960    }
   15961 
   15962    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   15963       ( rG, binop( Iop_64HLtoV128,
   15964                    unop( Iop_8Sto64,
   15965                          unop( Iop_16HIto8, mkexpr(srcBytes) ) ),
   15966                    unop( Iop_8Sto64,
   15967                          unop( Iop_16to8, mkexpr(srcBytes) ) ) ) );
   15968    return delta;
   15969 }
   15970 
   15971 
/* Handles 128 bit versions of PMOVZXBQ: zero-extend the two low
   bytes of the source to the two 64-bit lanes of the destination
   xmm register.  Returns the updated delta. */
static Long dis_PMOVZXBQ_128 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx )
{
   IRTemp addr     = IRTemp_INVALID;
   Int    alen     = 0;
   HChar  dis_buf[50];
   IRTemp srcVec   = newTemp(Ity_V128);
   UChar  modrm    = getUChar(delta);
   UChar* mbV      = isAvx ? "v" : "";
   UInt   rG       = gregOfRexRM(pfx, modrm);
   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcVec, getXMMReg(rE) );
      delta += 1;
      DIP( "%spmovzxbq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
   } else {
      /* Memory source: only 16 bits (two bytes) are read. */
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcVec,
              unop( Iop_32UtoV128,
                    unop( Iop_16Uto32, loadLE( Ity_I16, mkexpr(addr) ))));
      delta += alen;
      DIP( "%spmovzxbq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
   }

   IRTemp zeroVec = newTemp(Ity_V128);
   assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );

   /* Three rounds of interleaving with zero widen each of the two
      low source bytes to a zero-extended 64-bit lane.  For AVX, the
      write also zeroes the upper YMM lane. */
   (isAvx ? putYMMRegLoAndZU : putXMMReg)
      ( rG, binop( Iop_InterleaveLO8x16,
                   mkexpr(zeroVec),
                   binop( Iop_InterleaveLO8x16,
                          mkexpr(zeroVec),
                          binop( Iop_InterleaveLO8x16,
                                 mkexpr(zeroVec), mkexpr(srcVec) ) ) ) );
   return delta;
}
   16009 
   16010 
/* Handles 128 bit versions of PHMINPOSUW.  The actual computation is
   farmed out to the clean helper amd64g_calculate_sse_phminposuw,
   which takes the source vector as two 64-bit halves and returns the
   low 64 bits of the result vector.  Returns the updated delta. */
static Long dis_PHMINPOSUW_128 ( VexAbiInfo* vbi, Prefix pfx,
                                 Long delta, Bool isAvx )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   UChar  modrm  = getUChar(delta);
   UChar* mbV    = isAvx ? "v" : "";
   IRTemp sV     = newTemp(Ity_V128);
   IRTemp sHi    = newTemp(Ity_I64);
   IRTemp sLo    = newTemp(Ity_I64);
   IRTemp dLo    = newTemp(Ity_I64);
   UInt   rG     = gregOfRexRM(pfx,modrm);
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( sV, getXMMReg(rE) );
      delta += 1;
      DIP("%sphminposuw %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      /* The SSE (non-AVX) form requires a 16-aligned memory operand. */
      if (!isAvx)
         gen_SEGV_if_not_16_aligned(addr);
      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      DIP("%sphminposuw %s,%s\n", mbV, dis_buf, nameXMMReg(rG));
   }
   /* Split the source vector into halves for the helper call. */
   assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   assign( dLo, mkIRExprCCall(
                   Ity_I64, 0/*regparms*/,
                   "amd64g_calculate_sse_phminposuw",
                   &amd64g_calculate_sse_phminposuw,
                   mkIRExprVec_2( mkexpr(sLo), mkexpr(sHi) )
         ));
   /* The helper's result forms the low 64 bits of the destination;
      the upper 64 bits (and, for AVX, the upper YMM lane) are
      zeroed. */
   (isAvx ? putYMMRegLoAndZU : putXMMReg)
      (rG, unop(Iop_64UtoV128, mkexpr(dLo)));
   return delta;
}
   16049 
   16050 
   16051 static Long dis_AESx ( VexAbiInfo* vbi, Prefix pfx,
   16052                        Long delta, Bool isAvx, UChar opc )
   16053 {
   16054    IRTemp addr   = IRTemp_INVALID;
   16055    Int    alen   = 0;
   16056    HChar  dis_buf[50];
   16057    UChar  modrm  = getUChar(delta);
   16058    UInt   rG     = gregOfRexRM(pfx, modrm);
   16059    UInt   regNoL = 0;
   16060    UInt   regNoR = (isAvx && opc != 0xDB) ? getVexNvvvv(pfx) : rG;
   16061 
   16062    /* This is a nasty kludge.  We need to pass 2 x V128 to the
   16063       helper.  Since we can't do that, use a dirty
   16064       helper to compute the results directly from the XMM regs in
   16065       the guest state.  That means for the memory case, we need to
   16066       move the left operand into a pseudo-register (XMM16, let's
   16067       call it). */
   16068    if (epartIsReg(modrm)) {
   16069       regNoL = eregOfRexRM(pfx, modrm);
   16070       delta += 1;
   16071    } else {
   16072       regNoL = 16; /* use XMM16 as an intermediary */
   16073       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   16074       /* alignment check needed ???? */
   16075       stmt( IRStmt_Put( OFFB_YMM16, loadLE(Ity_V128, mkexpr(addr)) ));
   16076       delta += alen;
   16077    }
   16078 
   16079    void*  fn = &amd64g_dirtyhelper_AES;
   16080    HChar* nm = "amd64g_dirtyhelper_AES";
   16081 
   16082    /* Round up the arguments.  Note that this is a kludge -- the
   16083       use of mkU64 rather than mkIRExpr_HWord implies the
   16084       assumption that the host's word size is 64-bit. */
   16085    UInt gstOffD = ymmGuestRegOffset(rG);
   16086    UInt gstOffL = regNoL == 16 ? OFFB_YMM16 : ymmGuestRegOffset(regNoL);
   16087    UInt gstOffR = ymmGuestRegOffset(regNoR);
   16088    IRExpr*  opc4         = mkU64(opc);
   16089    IRExpr*  gstOffDe     = mkU64(gstOffD);
   16090    IRExpr*  gstOffLe     = mkU64(gstOffL);
   16091    IRExpr*  gstOffRe     = mkU64(gstOffR);
   16092    IRExpr** args
   16093       = mkIRExprVec_4( opc4, gstOffDe, gstOffLe, gstOffRe );
   16094 
   16095    IRDirty* d    = unsafeIRDirty_0_N( 0/*regparms*/, nm, fn, args );
   16096    /* It's not really a dirty call, but we can't use the clean
   16097       helper mechanism here for the very lame reason that we can't
   16098       pass 2 x V128s by value to a helper, nor get one back.  Hence
   16099       this roundabout scheme. */
   16100    d->needsBBP = True;
   16101    d->nFxState = 2;
   16102    vex_bzero(&d->fxState, sizeof(d->fxState));
   16103    /* AES{ENC,ENCLAST,DEC,DECLAST} read both registers, and writes
   16104       the second for !isAvx or the third for isAvx.
   16105       AESIMC (0xDB) reads the first register, and writes the second. */
   16106    d->fxState[0].fx     = Ifx_Read;
   16107    d->fxState[0].offset = gstOffL;
   16108    d->fxState[0].size   = sizeof(U128);
   16109    d->fxState[1].offset = gstOffR;
   16110    d->fxState[1].size   = sizeof(U128);
   16111    if (opc == 0xDB)
   16112       d->fxState[1].fx   = Ifx_Write;
   16113    else if (!isAvx || rG == regNoR)
   16114       d->fxState[1].fx   = Ifx_Modify;
   16115    else {
   16116       d->fxState[1].fx     = Ifx_Read;
   16117       d->nFxState++;
   16118       d->fxState[2].fx     = Ifx_Write;
   16119       d->fxState[2].offset = gstOffD;
   16120       d->fxState[2].size   = sizeof(U128);
   16121    }
   16122 
   16123    stmt( IRStmt_Dirty(d) );
   16124    {
   16125       HChar* opsuf;
   16126       switch (opc) {
   16127          case 0xDC: opsuf = "enc"; break;
   16128          case 0XDD: opsuf = "enclast"; break;
   16129          case 0xDE: opsuf = "dec"; break;
   16130          case 0xDF: opsuf = "declast"; break;
   16131          case 0xDB: opsuf = "imc"; break;
   16132          default: vassert(0);
   16133       }
   16134       DIP("%saes%s %s,%s%s%s\n", isAvx ? "v" : "", opsuf,
   16135           (regNoL == 16 ? dis_buf : nameXMMReg(regNoL)),
   16136           nameXMMReg(regNoR),
   16137           (isAvx && opc != 0xDB) ? "," : "",
   16138           (isAvx && opc != 0xDB) ? nameXMMReg(rG) : "");
   16139    }
   16140    if (isAvx)
   16141       putYMMRegLane128( rG, 1, mkV128(0) );
   16142    return delta;
   16143 }
   16144 
   16145 static Long dis_AESKEYGENASSIST ( VexAbiInfo* vbi, Prefix pfx,
   16146                                   Long delta, Bool isAvx )
   16147 {
   16148    IRTemp addr   = IRTemp_INVALID;
   16149    Int    alen   = 0;
   16150    HChar  dis_buf[50];
   16151    UChar  modrm  = getUChar(delta);
   16152    UInt   regNoL = 0;
   16153    UInt   regNoR = gregOfRexRM(pfx, modrm);
   16154    UChar  imm    = 0;
   16155 
   16156    /* This is a nasty kludge.  See AESENC et al. instructions. */
   16157    modrm = getUChar(delta);
   16158    if (epartIsReg(modrm)) {
   16159       regNoL = eregOfRexRM(pfx, modrm);
   16160       imm = getUChar(delta+1);
   16161       delta += 1+1;
   16162    } else {
   16163       regNoL = 16; /* use XMM16 as an intermediary */
   16164       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   16165       /* alignment check ???? . */
   16166       stmt( IRStmt_Put( OFFB_YMM16, loadLE(Ity_V128, mkexpr(addr)) ));
   16167       imm = getUChar(delta+alen);
   16168       delta += alen+1;
   16169    }
   16170 
   16171    /* Who ya gonna call?  Presumably not Ghostbusters. */
   16172    void*  fn = &amd64g_dirtyhelper_AESKEYGENASSIST;
   16173    HChar* nm = "amd64g_dirtyhelper_AESKEYGENASSIST";
   16174 
   16175    /* Round up the arguments.  Note that this is a kludge -- the
   16176       use of mkU64 rather than mkIRExpr_HWord implies the
   16177       assumption that the host's word size is 64-bit. */
   16178    UInt gstOffL = regNoL == 16 ? OFFB_YMM16 : ymmGuestRegOffset(regNoL);
   16179    UInt gstOffR = ymmGuestRegOffset(regNoR);
   16180 
   16181    IRExpr*  imme          = mkU64(imm & 0xFF);
   16182    IRExpr*  gstOffLe     = mkU64(gstOffL);
   16183    IRExpr*  gstOffRe     = mkU64(gstOffR);
   16184    IRExpr** args
   16185       = mkIRExprVec_3( imme, gstOffLe, gstOffRe );
   16186 
   16187    IRDirty* d    = unsafeIRDirty_0_N( 0/*regparms*/, nm, fn, args );
   16188    /* It's not really a dirty call, but we can't use the clean
   16189       helper mechanism here for the very lame reason that we can't
   16190       pass 2 x V128s by value to a helper, nor get one back.  Hence
   16191       this roundabout scheme. */
   16192    d->needsBBP = True;
   16193    d->nFxState = 2;
   16194    vex_bzero(&d->fxState, sizeof(d->fxState));
   16195    d->fxState[0].fx     = Ifx_Read;
   16196    d->fxState[0].offset = gstOffL;
   16197    d->fxState[0].size   = sizeof(U128);
   16198    d->fxState[1].fx     = Ifx_Write;
   16199    d->fxState[1].offset = gstOffR;
   16200    d->fxState[1].size   = sizeof(U128);
   16201    stmt( IRStmt_Dirty(d) );
   16202 
   16203    DIP("%saeskeygenassist $%x,%s,%s\n", isAvx ? "v" : "", (UInt)imm,
   16204        (regNoL == 16 ? dis_buf : nameXMMReg(regNoL)),
   16205        nameXMMReg(regNoR));
   16206    if (isAvx)
   16207       putYMMRegLane128( regNoR, 1, mkV128(0) );
   16208    return delta;
   16209 }
   16210 
   16211 
   16212 __attribute__((noinline))
   16213 static
   16214 Long dis_ESC_0F38__SSE4 ( Bool* decode_OK,
   16215                           VexAbiInfo* vbi,
   16216                           Prefix pfx, Int sz, Long deltaIN )
   16217 {
   16218    IRTemp addr  = IRTemp_INVALID;
   16219    UChar  modrm = 0;
   16220    Int    alen  = 0;
   16221    HChar  dis_buf[50];
   16222 
   16223    *decode_OK = False;
   16224 
   16225    Long   delta = deltaIN;
   16226    UChar  opc   = getUChar(delta);
   16227    delta++;
   16228    switch (opc) {
   16229 
   16230    case 0x10:
   16231    case 0x14:
   16232    case 0x15:
   16233       /* 66 0F 38 10 /r = PBLENDVB xmm1, xmm2/m128  (byte gran)
   16234          66 0F 38 14 /r = BLENDVPS xmm1, xmm2/m128  (float gran)
   16235          66 0F 38 15 /r = BLENDVPD xmm1, xmm2/m128  (double gran)
   16236          Blend at various granularities, with XMM0 (implicit operand)
   16237          providing the controlling mask.
   16238       */
   16239       if (have66noF2noF3(pfx) && sz == 2) {
   16240          modrm = getUChar(delta);
   16241 
   16242          HChar* nm    = NULL;
   16243          UInt   gran  = 0;
   16244          IROp   opSAR = Iop_INVALID;
   16245          switch (opc) {
   16246             case 0x10:
   16247                nm = "pblendvb"; gran = 1; opSAR = Iop_SarN8x16;
   16248                break;
   16249             case 0x14:
   16250                nm = "blendvps"; gran = 4; opSAR = Iop_SarN32x4;
   16251                break;
   16252             case 0x15:
   16253                nm = "blendvpd"; gran = 8; opSAR = Iop_SarN64x2;
   16254                break;
   16255          }
   16256          vassert(nm);
   16257 
   16258          IRTemp vecE = newTemp(Ity_V128);
   16259          IRTemp vecG = newTemp(Ity_V128);
   16260          IRTemp vec0 = newTemp(Ity_V128);
   16261 
   16262          if ( epartIsReg(modrm) ) {
   16263             assign(vecE, getXMMReg(eregOfRexRM(pfx, modrm)));
   16264             delta += 1;
   16265             DIP( "%s %s,%s\n", nm,
   16266                  nameXMMReg( eregOfRexRM(pfx, modrm) ),
   16267                  nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   16268          } else {
   16269             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   16270             gen_SEGV_if_not_16_aligned( addr );
   16271             assign(vecE, loadLE( Ity_V128, mkexpr(addr) ));
   16272             delta += alen;
   16273             DIP( "%s %s,%s\n", nm,
   16274                  dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   16275          }
   16276 
   16277          assign(vecG, getXMMReg(gregOfRexRM(pfx, modrm)));
   16278          assign(vec0, getXMMReg(0));
   16279 
   16280          IRTemp res = math_PBLENDVB_128( vecE, vecG, vec0, gran, opSAR );
   16281          putXMMReg(gregOfRexRM(pfx, modrm), mkexpr(res));
   16282 
   16283          goto decode_success;
   16284       }
   16285       break;
   16286 
   16287    case 0x17:
   16288       /* 66 0F 38 17 /r = PTEST xmm1, xmm2/m128
   16289          Logical compare (set ZF and CF from AND/ANDN of the operands) */
   16290       if (have66noF2noF3(pfx)
   16291           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   16292          delta = dis_xTESTy_128( vbi, pfx, delta, False/*!isAvx*/, 0 );
   16293          goto decode_success;
   16294       }
   16295       break;
   16296 
   16297    case 0x20:
   16298       /* 66 0F 38 20 /r = PMOVSXBW xmm1, xmm2/m64
   16299          Packed Move with Sign Extend from Byte to Word (XMM) */
   16300       if (have66noF2noF3(pfx) && sz == 2) {
   16301          delta = dis_PMOVxXBW_128( vbi, pfx, delta,
   16302                                    False/*!isAvx*/, False/*!xIsZ*/ );
   16303          goto decode_success;
   16304       }
   16305       break;
   16306 
   16307    case 0x21:
   16308       /* 66 0F 38 21 /r = PMOVSXBD xmm1, xmm2/m32
   16309          Packed Move with Sign Extend from Byte to DWord (XMM) */
   16310       if (have66noF2noF3(pfx) && sz == 2) {
   16311          delta = dis_PMOVxXBD_128( vbi, pfx, delta,
   16312                                    False/*!isAvx*/, False/*!xIsZ*/ );
   16313          goto decode_success;
   16314       }
   16315       break;
   16316 
   16317    case 0x22:
   16318       /* 66 0F 38 22 /r = PMOVSXBQ xmm1, xmm2/m16
   16319          Packed Move with Sign Extend from Byte to QWord (XMM) */
   16320       if (have66noF2noF3(pfx) && sz == 2) {
   16321          delta = dis_PMOVSXBQ_128( vbi, pfx, delta, False/*!isAvx*/ );
   16322          goto decode_success;
   16323       }
   16324       break;
   16325 
   16326    case 0x23:
   16327       /* 66 0F 38 23 /r = PMOVSXWD xmm1, xmm2/m64
   16328          Packed Move with Sign Extend from Word to DWord (XMM) */
   16329       if (have66noF2noF3(pfx) && sz == 2) {
   16330          delta = dis_PMOVxXWD_128(vbi, pfx, delta,
   16331                                   False/*!isAvx*/, False/*!xIsZ*/);
   16332          goto decode_success;
   16333       }
   16334       break;
   16335 
   16336    case 0x24:
   16337       /* 66 0F 38 24 /r = PMOVSXWQ xmm1, xmm2/m32
   16338          Packed Move with Sign Extend from Word to QWord (XMM) */
   16339       if (have66noF2noF3(pfx) && sz == 2) {
   16340          delta = dis_PMOVSXWQ_128( vbi, pfx, delta, False/*!isAvx*/ );
   16341          goto decode_success;
   16342       }
   16343       break;
   16344 
   16345    case 0x25:
   16346       /* 66 0F 38 25 /r = PMOVSXDQ xmm1, xmm2/m64
   16347          Packed Move with Sign Extend from Double Word to Quad Word (XMM) */
   16348       if (have66noF2noF3(pfx) && sz == 2) {
   16349          delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
   16350                                    False/*!isAvx*/, False/*!xIsZ*/ );
   16351          goto decode_success;
   16352       }
   16353       break;
   16354 
   16355    case 0x28:
   16356       /* 66 0F 38 28 = PMULDQ -- signed widening multiply of 32-lanes
   16357          0 x 0 to form lower 64-bit half and lanes 2 x 2 to form upper
   16358          64-bit half */
   16359       /* This is a really poor translation -- could be improved if
   16360          performance critical.  It's a copy-paste of PMULUDQ, too. */
   16361       if (have66noF2noF3(pfx) && sz == 2) {
   16362          IRTemp sV = newTemp(Ity_V128);
   16363          IRTemp dV = newTemp(Ity_V128);
   16364          modrm = getUChar(delta);
   16365          UInt rG = gregOfRexRM(pfx,modrm);
   16366          assign( dV, getXMMReg(rG) );
   16367          if (epartIsReg(modrm)) {
   16368             UInt rE = eregOfRexRM(pfx,modrm);
   16369             assign( sV, getXMMReg(rE) );
   16370             delta += 1;
   16371             DIP("pmuldq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   16372          } else {
   16373             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   16374             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   16375             delta += alen;
   16376             DIP("pmuldq %s,%s\n", dis_buf, nameXMMReg(rG));
   16377          }
   16378 
   16379          putXMMReg( rG, mkexpr(math_PMULDQ_128( dV, sV )) );
   16380          goto decode_success;
   16381       }
   16382       break;
   16383 
   16384    case 0x29:
   16385       /* 66 0F 38 29 = PCMPEQQ
   16386          64x2 equality comparison */
   16387       if (have66noF2noF3(pfx) && sz == 2) {
   16388          /* FIXME: this needs an alignment check */
   16389          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   16390                                     "pcmpeqq", Iop_CmpEQ64x2, False );
   16391          goto decode_success;
   16392       }
   16393       break;
   16394 
   16395    case 0x2B:
   16396       /* 66 0f 38 2B /r = PACKUSDW xmm1, xmm2/m128
   16397          2x 32x4 S->U saturating narrow from xmm2/m128 to xmm1 */
   16398       if (have66noF2noF3(pfx) && sz == 2) {
   16399 
   16400          modrm = getUChar(delta);
   16401 
   16402          IRTemp argL = newTemp(Ity_V128);
   16403          IRTemp argR = newTemp(Ity_V128);
   16404 
   16405          if ( epartIsReg(modrm) ) {
   16406             assign( argL, getXMMReg( eregOfRexRM(pfx, modrm) ) );
   16407             delta += 1;
   16408             DIP( "packusdw %s,%s\n",
   16409                  nameXMMReg( eregOfRexRM(pfx, modrm) ),
   16410                  nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   16411          } else {
   16412             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   16413             gen_SEGV_if_not_16_aligned( addr );
   16414             assign( argL, loadLE( Ity_V128, mkexpr(addr) ));
   16415             delta += alen;
   16416             DIP( "packusdw %s,%s\n",
   16417                  dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   16418          }
   16419 
   16420          assign(argR, getXMMReg( gregOfRexRM(pfx, modrm) ));
   16421 
   16422          putXMMReg( gregOfRexRM(pfx, modrm),
   16423                     binop( Iop_QNarrowBin32Sto16Ux8,
   16424                            mkexpr(argL), mkexpr(argR)) );
   16425 
   16426          goto decode_success;
   16427       }
   16428       break;
   16429 
   16430    case 0x30:
   16431       /* 66 0F 38 30 /r = PMOVZXBW xmm1, xmm2/m64
   16432          Packed Move with Zero Extend from Byte to Word (XMM) */
   16433       if (have66noF2noF3(pfx) && sz == 2) {
   16434          delta = dis_PMOVxXBW_128( vbi, pfx, delta,
   16435                                    False/*!isAvx*/, True/*xIsZ*/ );
   16436          goto decode_success;
   16437       }
   16438       break;
   16439 
   16440    case 0x31:
   16441       /* 66 0F 38 31 /r = PMOVZXBD xmm1, xmm2/m32
   16442          Packed Move with Zero Extend from Byte to DWord (XMM) */
   16443       if (have66noF2noF3(pfx) && sz == 2) {
   16444          delta = dis_PMOVxXBD_128( vbi, pfx, delta,
   16445                                    False/*!isAvx*/, True/*xIsZ*/ );
   16446          goto decode_success;
   16447       }
   16448       break;
   16449 
   16450    case 0x32:
   16451       /* 66 0F 38 32 /r = PMOVZXBQ xmm1, xmm2/m16
   16452          Packed Move with Zero Extend from Byte to QWord (XMM) */
   16453       if (have66noF2noF3(pfx) && sz == 2) {
   16454          delta = dis_PMOVZXBQ_128( vbi, pfx, delta, False/*!isAvx*/ );
   16455          goto decode_success;
   16456       }
   16457       break;
   16458 
   16459    case 0x33:
   16460       /* 66 0F 38 33 /r = PMOVZXWD xmm1, xmm2/m64
   16461          Packed Move with Zero Extend from Word to DWord (XMM) */
   16462       if (have66noF2noF3(pfx) && sz == 2) {
   16463          delta = dis_PMOVxXWD_128( vbi, pfx, delta,
   16464                                    False/*!isAvx*/, True/*xIsZ*/ );
   16465          goto decode_success;
   16466       }
   16467       break;
   16468 
   16469    case 0x34:
   16470       /* 66 0F 38 34 /r = PMOVZXWQ xmm1, xmm2/m32
   16471          Packed Move with Zero Extend from Word to QWord (XMM) */
   16472       if (have66noF2noF3(pfx) && sz == 2) {
   16473          delta = dis_PMOVZXWQ_128( vbi, pfx, delta, False/*!isAvx*/ );
   16474          goto decode_success;
   16475       }
   16476       break;
   16477 
   16478    case 0x35:
   16479       /* 66 0F 38 35 /r = PMOVZXDQ xmm1, xmm2/m64
   16480          Packed Move with Zero Extend from DWord to QWord (XMM) */
   16481       if (have66noF2noF3(pfx) && sz == 2) {
   16482          delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
   16483                                    False/*!isAvx*/, True/*xIsZ*/ );
   16484          goto decode_success;
   16485       }
   16486       break;
   16487 
   16488    case 0x37:
   16489       /* 66 0F 38 37 = PCMPGTQ
   16490          64x2 comparison (signed, presumably; the Intel docs don't say :-)
   16491       */
   16492       if (have66noF2noF3(pfx) && sz == 2) {
   16493          /* FIXME: this needs an alignment check */
   16494          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   16495                                     "pcmpgtq", Iop_CmpGT64Sx2, False );
   16496          goto decode_success;
   16497       }
   16498       break;
   16499 
   16500    case 0x38:
   16501    case 0x3C:
   16502       /* 66 0F 38 38 /r = PMINSB xmm1, xmm2/m128    8Sx16 (signed) min
   16503          66 0F 38 3C /r = PMAXSB xmm1, xmm2/m128    8Sx16 (signed) max
   16504       */
   16505       if (have66noF2noF3(pfx) && sz == 2) {
   16506          /* FIXME: this needs an alignment check */
   16507          Bool isMAX = opc == 0x3C;
   16508          delta = dis_SSEint_E_to_G(
   16509                     vbi, pfx, delta,
   16510                     isMAX ? "pmaxsb" : "pminsb",
   16511                     isMAX ? Iop_Max8Sx16 : Iop_Min8Sx16,
   16512                     False
   16513                  );
   16514          goto decode_success;
   16515       }
   16516       break;
   16517 
   16518    case 0x39:
   16519    case 0x3D:
   16520       /* 66 0F 38 39 /r = PMINSD xmm1, xmm2/m128
   16521          Minimum of Packed Signed Double Word Integers (XMM)
   16522          66 0F 38 3D /r = PMAXSD xmm1, xmm2/m128
   16523          Maximum of Packed Signed Double Word Integers (XMM)
   16524       */
   16525       if (have66noF2noF3(pfx) && sz == 2) {
   16526          /* FIXME: this needs an alignment check */
   16527          Bool isMAX = opc == 0x3D;
   16528          delta = dis_SSEint_E_to_G(
   16529                     vbi, pfx, delta,
   16530                     isMAX ? "pmaxsd" : "pminsd",
   16531                     isMAX ? Iop_Max32Sx4 : Iop_Min32Sx4,
   16532                     False
   16533                  );
   16534          goto decode_success;
   16535       }
   16536       break;
   16537 
   16538    case 0x3A:
   16539    case 0x3E:
   16540       /* 66 0F 38 3A /r = PMINUW xmm1, xmm2/m128
   16541          Minimum of Packed Unsigned Word Integers (XMM)
   16542          66 0F 38 3E /r = PMAXUW xmm1, xmm2/m128
   16543          Maximum of Packed Unsigned Word Integers (XMM)
   16544       */
   16545       if (have66noF2noF3(pfx) && sz == 2) {
   16546          /* FIXME: this needs an alignment check */
   16547          Bool isMAX = opc == 0x3E;
   16548          delta = dis_SSEint_E_to_G(
   16549                     vbi, pfx, delta,
   16550                     isMAX ? "pmaxuw" : "pminuw",
   16551                     isMAX ? Iop_Max16Ux8 : Iop_Min16Ux8,
   16552                     False
   16553                  );
   16554          goto decode_success;
   16555       }
   16556       break;
   16557 
   16558    case 0x3B:
   16559    case 0x3F:
   16560       /* 66 0F 38 3B /r = PMINUD xmm1, xmm2/m128
   16561          Minimum of Packed Unsigned Doubleword Integers (XMM)
   16562          66 0F 38 3F /r = PMAXUD xmm1, xmm2/m128
   16563          Maximum of Packed Unsigned Doubleword Integers (XMM)
   16564       */
   16565       if (have66noF2noF3(pfx) && sz == 2) {
   16566          /* FIXME: this needs an alignment check */
   16567          Bool isMAX = opc == 0x3F;
   16568          delta = dis_SSEint_E_to_G(
   16569                     vbi, pfx, delta,
   16570                     isMAX ? "pmaxud" : "pminud",
   16571                     isMAX ? Iop_Max32Ux4 : Iop_Min32Ux4,
   16572                     False
   16573                  );
   16574          goto decode_success;
   16575       }
   16576       break;
   16577 
   16578    case 0x40:
   16579       /* 66 0F 38 40 /r = PMULLD xmm1, xmm2/m128
   16580          32x4 integer multiply from xmm2/m128 to xmm1 */
   16581       if (have66noF2noF3(pfx) && sz == 2) {
   16582 
   16583          modrm = getUChar(delta);
   16584 
   16585          IRTemp argL = newTemp(Ity_V128);
   16586          IRTemp argR = newTemp(Ity_V128);
   16587 
   16588          if ( epartIsReg(modrm) ) {
   16589             assign( argL, getXMMReg( eregOfRexRM(pfx, modrm) ) );
   16590             delta += 1;
   16591             DIP( "pmulld %s,%s\n",
   16592                  nameXMMReg( eregOfRexRM(pfx, modrm) ),
   16593                  nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   16594          } else {
   16595             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   16596             gen_SEGV_if_not_16_aligned( addr );
   16597             assign( argL, loadLE( Ity_V128, mkexpr(addr) ));
   16598             delta += alen;
   16599             DIP( "pmulld %s,%s\n",
   16600                  dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   16601          }
   16602 
   16603          assign(argR, getXMMReg( gregOfRexRM(pfx, modrm) ));
   16604 
   16605          putXMMReg( gregOfRexRM(pfx, modrm),
   16606                     binop( Iop_Mul32x4, mkexpr(argL), mkexpr(argR)) );
   16607 
   16608          goto decode_success;
   16609       }
   16610       break;
   16611 
   16612    case 0x41:
   16613       /* 66 0F 38 41 /r = PHMINPOSUW xmm1, xmm2/m128
   16614          Packed Horizontal Word Minimum from xmm2/m128 to xmm1 */
   16615       if (have66noF2noF3(pfx) && sz == 2) {
   16616          delta = dis_PHMINPOSUW_128( vbi, pfx, delta, False/*!isAvx*/ );
   16617          goto decode_success;
   16618       }
   16619       break;
   16620 
   16621    case 0xDC:
   16622    case 0xDD:
   16623    case 0xDE:
   16624    case 0xDF:
   16625    case 0xDB:
   16626       /* 66 0F 38 DC /r = AESENC xmm1, xmm2/m128
   16627                   DD /r = AESENCLAST xmm1, xmm2/m128
   16628                   DE /r = AESDEC xmm1, xmm2/m128
   16629                   DF /r = AESDECLAST xmm1, xmm2/m128
   16630 
   16631                   DB /r = AESIMC xmm1, xmm2/m128 */
   16632       if (have66noF2noF3(pfx) && sz == 2) {
   16633          delta = dis_AESx( vbi, pfx, delta, False/*!isAvx*/, opc );
   16634          goto decode_success;
   16635       }
   16636       break;
   16637 
   16638    case 0xF0:
   16639    case 0xF1:
   16640       /* F2 0F 38 F0 /r = CRC32 r/m8, r32 (REX.W ok, 66 not ok)
   16641          F2 0F 38 F1 /r = CRC32 r/m{16,32,64}, r32
   16642          The decoding on this is a bit unusual.
   16643       */
   16644       if (haveF2noF3(pfx)
   16645           && (opc == 0xF1 || (opc == 0xF0 && !have66(pfx)))) {
   16646          modrm = getUChar(delta);
   16647 
   16648          if (opc == 0xF0)
   16649             sz = 1;
   16650          else
   16651             vassert(sz == 2 || sz == 4 || sz == 8);
   16652 
   16653          IRType tyE = szToITy(sz);
   16654          IRTemp valE = newTemp(tyE);
   16655 
   16656          if (epartIsReg(modrm)) {
   16657             assign(valE, getIRegE(sz, pfx, modrm));
   16658             delta += 1;
   16659             DIP("crc32b %s,%s\n", nameIRegE(sz, pfx, modrm),
   16660                 nameIRegG(1==getRexW(pfx) ? 8 : 4, pfx, modrm));
   16661          } else {
   16662             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   16663             assign(valE, loadLE(tyE, mkexpr(addr)));
   16664             delta += alen;
   16665             DIP("crc32b %s,%s\n", dis_buf,
   16666                 nameIRegG(1==getRexW(pfx) ? 8 : 4, pfx, modrm));
   16667          }
   16668 
   16669          /* Somewhat funny getting/putting of the crc32 value, in order
   16670             to ensure that it turns into 64-bit gets and puts.  However,
   16671             mask off the upper 32 bits so as to not get memcheck false
   16672             +ves around the helper call. */
   16673          IRTemp valG0 = newTemp(Ity_I64);
   16674          assign(valG0, binop(Iop_And64, getIRegG(8, pfx, modrm),
   16675                              mkU64(0xFFFFFFFF)));
   16676 
   16677          HChar* nm = NULL;
   16678          void*  fn = NULL;
   16679          switch (sz) {
   16680             case 1: nm = "amd64g_calc_crc32b";
   16681                     fn = &amd64g_calc_crc32b; break;
   16682             case 2: nm = "amd64g_calc_crc32w";
   16683                     fn = &amd64g_calc_crc32w; break;
   16684             case 4: nm = "amd64g_calc_crc32l";
   16685                     fn = &amd64g_calc_crc32l; break;
   16686             case 8: nm = "amd64g_calc_crc32q";
   16687                     fn = &amd64g_calc_crc32q; break;
   16688          }
   16689          vassert(nm && fn);
   16690          IRTemp valG1 = newTemp(Ity_I64);
   16691          assign(valG1,
   16692                 mkIRExprCCall(Ity_I64, 0/*regparm*/, nm, fn,
   16693                               mkIRExprVec_2(mkexpr(valG0),
   16694                                             widenUto64(mkexpr(valE)))));
   16695 
   16696          putIRegG(4, pfx, modrm, unop(Iop_64to32, mkexpr(valG1)));
   16697          goto decode_success;
   16698       }
   16699       break;
   16700 
   16701    default:
   16702       break;
   16703 
   16704    }
   16705 
   16706   //decode_failure:
   16707    *decode_OK = False;
   16708    return deltaIN;
   16709 
   16710   decode_success:
   16711    *decode_OK = True;
   16712    return delta;
   16713 }
   16714 
   16715 
   16716 /*------------------------------------------------------------*/
   16717 /*---                                                      ---*/
   16718 /*--- Top-level SSE4: dis_ESC_0F3A__SSE4                   ---*/
   16719 /*---                                                      ---*/
   16720 /*------------------------------------------------------------*/
   16721 
   16722 static Long dis_PEXTRW ( VexAbiInfo* vbi, Prefix pfx,
   16723                          Long delta, Bool isAvx )
   16724 {
   16725    IRTemp addr  = IRTemp_INVALID;
   16726    IRTemp t0    = IRTemp_INVALID;
   16727    IRTemp t1    = IRTemp_INVALID;
   16728    IRTemp t2    = IRTemp_INVALID;
   16729    IRTemp t3    = IRTemp_INVALID;
   16730    UChar  modrm = getUChar(delta);
   16731    Int    alen  = 0;
   16732    HChar  dis_buf[50];
   16733    UInt   rG    = gregOfRexRM(pfx,modrm);
   16734    Int    imm8_20;
   16735    IRTemp xmm_vec = newTemp(Ity_V128);
   16736    IRTemp d16   = newTemp(Ity_I16);
   16737    HChar* mbV   = isAvx ? "v" : "";
   16738 
   16739    vassert(0==getRexW(pfx)); /* ensured by caller */
   16740    assign( xmm_vec, getXMMReg(rG) );
   16741    breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
   16742 
   16743    if ( epartIsReg( modrm ) ) {
   16744       imm8_20 = (Int)(getUChar(delta+1) & 7);
   16745    } else {
   16746       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   16747       imm8_20 = (Int)(getUChar(delta+alen) & 7);
   16748    }
   16749 
   16750    switch (imm8_20) {
   16751       case 0:  assign(d16, unop(Iop_32to16,   mkexpr(t0))); break;
   16752       case 1:  assign(d16, unop(Iop_32HIto16, mkexpr(t0))); break;
   16753       case 2:  assign(d16, unop(Iop_32to16,   mkexpr(t1))); break;
   16754       case 3:  assign(d16, unop(Iop_32HIto16, mkexpr(t1))); break;
   16755       case 4:  assign(d16, unop(Iop_32to16,   mkexpr(t2))); break;
   16756       case 5:  assign(d16, unop(Iop_32HIto16, mkexpr(t2))); break;
   16757       case 6:  assign(d16, unop(Iop_32to16,   mkexpr(t3))); break;
   16758       case 7:  assign(d16, unop(Iop_32HIto16, mkexpr(t3))); break;
   16759       default: vassert(0);
   16760    }
   16761 
   16762    if ( epartIsReg( modrm ) ) {
   16763       UInt rE = eregOfRexRM(pfx,modrm);
   16764       putIReg32( rE, unop(Iop_16Uto32, mkexpr(d16)) );
   16765       delta += 1+1;
   16766       DIP( "%spextrw $%d, %s,%s\n", mbV, imm8_20,
   16767            nameXMMReg( rG ), nameIReg32( rE ) );
   16768    } else {
   16769       storeLE( mkexpr(addr), mkexpr(d16) );
   16770       delta += alen+1;
   16771       DIP( "%spextrw $%d, %s,%s\n", mbV, imm8_20, nameXMMReg( rG ), dis_buf );
   16772    }
   16773    return delta;
   16774 }
   16775 
   16776 
   16777 static Long dis_PEXTRD ( VexAbiInfo* vbi, Prefix pfx,
   16778                          Long delta, Bool isAvx )
   16779 {
   16780    IRTemp addr  = IRTemp_INVALID;
   16781    IRTemp t0    = IRTemp_INVALID;
   16782    IRTemp t1    = IRTemp_INVALID;
   16783    IRTemp t2    = IRTemp_INVALID;
   16784    IRTemp t3    = IRTemp_INVALID;
   16785    UChar  modrm = 0;
   16786    Int    alen  = 0;
   16787    HChar  dis_buf[50];
   16788 
   16789    Int    imm8_10;
   16790    IRTemp xmm_vec   = newTemp(Ity_V128);
   16791    IRTemp src_dword = newTemp(Ity_I32);
   16792    HChar* mbV = isAvx ? "v" : "";
   16793 
   16794    vassert(0==getRexW(pfx)); /* ensured by caller */
   16795    modrm = getUChar(delta);
   16796    assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
   16797    breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
   16798 
   16799    if ( epartIsReg( modrm ) ) {
   16800       imm8_10 = (Int)(getUChar(delta+1) & 3);
   16801    } else {
   16802       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   16803       imm8_10 = (Int)(getUChar(delta+alen) & 3);
   16804    }
   16805 
   16806    switch ( imm8_10 ) {
   16807       case 0:  assign( src_dword, mkexpr(t0) ); break;
   16808       case 1:  assign( src_dword, mkexpr(t1) ); break;
   16809       case 2:  assign( src_dword, mkexpr(t2) ); break;
   16810       case 3:  assign( src_dword, mkexpr(t3) ); break;
   16811       default: vassert(0);
   16812    }
   16813 
   16814    if ( epartIsReg( modrm ) ) {
   16815       putIReg32( eregOfRexRM(pfx,modrm), mkexpr(src_dword) );
   16816       delta += 1+1;
   16817       DIP( "%spextrd $%d, %s,%s\n", mbV, imm8_10,
   16818            nameXMMReg( gregOfRexRM(pfx, modrm) ),
   16819            nameIReg32( eregOfRexRM(pfx, modrm) ) );
   16820    } else {
   16821       storeLE( mkexpr(addr), mkexpr(src_dword) );
   16822       delta += alen+1;
   16823       DIP( "%spextrd $%d, %s,%s\n", mbV,
   16824            imm8_10, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
   16825    }
   16826    return delta;
   16827 }
   16828 
   16829 
   16830 static Long dis_PEXTRQ ( VexAbiInfo* vbi, Prefix pfx,
   16831                          Long delta, Bool isAvx )
   16832 {
   16833    IRTemp addr  = IRTemp_INVALID;
   16834    UChar  modrm = 0;
   16835    Int    alen  = 0;
   16836    HChar  dis_buf[50];
   16837 
   16838    Int imm8_0;
   16839    IRTemp xmm_vec   = newTemp(Ity_V128);
   16840    IRTemp src_qword = newTemp(Ity_I64);
   16841    HChar* mbV = isAvx ? "v" : "";
   16842 
   16843    vassert(1==getRexW(pfx)); /* ensured by caller */
   16844    modrm = getUChar(delta);
   16845    assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
   16846 
   16847    if ( epartIsReg( modrm ) ) {
   16848       imm8_0 = (Int)(getUChar(delta+1) & 1);
   16849    } else {
   16850       addr   = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   16851       imm8_0 = (Int)(getUChar(delta+alen) & 1);
   16852    }
   16853 
   16854    switch ( imm8_0 ) {
   16855       case 0:  assign( src_qword, unop(Iop_V128to64,   mkexpr(xmm_vec)) );
   16856                break;
   16857       case 1:  assign( src_qword, unop(Iop_V128HIto64, mkexpr(xmm_vec)) );
   16858                break;
   16859       default: vassert(0);
   16860    }
   16861 
   16862    if ( epartIsReg( modrm ) ) {
   16863       putIReg64( eregOfRexRM(pfx,modrm), mkexpr(src_qword) );
   16864       delta += 1+1;
   16865       DIP( "%spextrq $%d, %s,%s\n", mbV, imm8_0,
   16866            nameXMMReg( gregOfRexRM(pfx, modrm) ),
   16867            nameIReg64( eregOfRexRM(pfx, modrm) ) );
   16868    } else {
   16869       storeLE( mkexpr(addr), mkexpr(src_qword) );
   16870       delta += alen+1;
   16871       DIP( "%spextrq $%d, %s,%s\n", mbV,
   16872            imm8_0, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
   16873    }
   16874    return delta;
   16875 }
   16876 
   16877 
/* Handle the SSE4.2 string-compare family PCMP{E,I}STR{I,M} (and,
   via isAvx, their VEX-encoded forms).  opc is the trailing opcode
   byte.  The comparison itself is done by a dirty helper which reads
   the two vector operands directly from the guest state and returns
   the flag/result bits in a single 64-bit value.  This can fail, in
   which case it returns the original (unchanged) delta. */
static Long dis_PCMPxSTRx ( VexAbiInfo* vbi, Prefix pfx,
                            Long delta, Bool isAvx, UChar opc )
{
   Long   delta0  = delta;
   /* Variant selection from the opcode's low bits.  Per the DIP
      strings at the bottom: isISTRx != 0 selects the 'i'
      (implicit-length) names, isxSTRM != 0 the 'm' (mask) names. */
   UInt   isISTRx = opc & 2;
   UInt   isxSTRM = (opc & 1) ^ 1;
   UInt   regNoL  = 0;
   UInt   regNoR  = 0;
   UChar  imm     = 0;
   IRTemp addr    = IRTemp_INVALID;
   Int    alen    = 0;
   HChar  dis_buf[50];

   /* This is a nasty kludge.  We need to pass 2 x V128 to the helper
      (which is clean).  Since we can't do that, use a dirty helper to
      compute the results directly from the XMM regs in the guest
      state.  That means for the memory case, we need to move the left
      operand into a pseudo-register (XMM16, let's call it). */
   UChar modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      regNoL = eregOfRexRM(pfx, modrm);
      regNoR = gregOfRexRM(pfx, modrm);
      imm = getUChar(delta+1);
      delta += 1+1;
   } else {
      regNoL = 16; /* use XMM16 as an intermediary */
      regNoR = gregOfRexRM(pfx, modrm);
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
      /* No alignment check; I guess that makes sense, given that
         these insns are for dealing with C style strings. */
      stmt( IRStmt_Put( OFFB_YMM16, loadLE(Ity_V128, mkexpr(addr)) ));
      imm = getUChar(delta+alen);
      delta += alen+1;
   }

   /* Now we know the XMM reg numbers for the operands, and the
      immediate byte.  Is it one we can actually handle? Throw out any
      cases for which the helper function has not been verified. */
   switch (imm) {
      case 0x00:
      case 0x02: case 0x08: case 0x0A: case 0x0C: case 0x12:
      case 0x1A: case 0x38: case 0x3A: case 0x44: case 0x4A:
      case 0x46:
         break;
      case 0x01: // the 16-bit character versions of the above
      case 0x03: case 0x09: case 0x0B: case 0x0D: case 0x13:
      case 0x1B: case 0x39: case 0x3B: case 0x45: case 0x4B:
         break;
      default:
         return delta0; /*FAIL*/
   }

   /* Who ya gonna call?  Presumably not Ghostbusters. */
   void*  fn = &amd64g_dirtyhelper_PCMPxSTRx;
   HChar* nm = "amd64g_dirtyhelper_PCMPxSTRx";

   /* Round up the arguments.  Note that this is a kludge -- the use
      of mkU64 rather than mkIRExpr_HWord implies the assumption that
      the host's word size is 64-bit. */
   UInt gstOffL = regNoL == 16 ? OFFB_YMM16 : ymmGuestRegOffset(regNoL);
   UInt gstOffR = ymmGuestRegOffset(regNoR);

   /* Pack the opcode byte and the immediate into one argument word. */
   IRExpr*  opc4_and_imm = mkU64((opc << 8) | (imm & 0xFF));
   IRExpr*  gstOffLe     = mkU64(gstOffL);
   IRExpr*  gstOffRe     = mkU64(gstOffR);
   /* Only the explicit-length ('e') forms consume RDX and RAX; the
      implicit-length forms pass zeroes instead. */
   IRExpr*  edxIN        = isISTRx ? mkU64(0) : getIRegRDX(8);
   IRExpr*  eaxIN        = isISTRx ? mkU64(0) : getIRegRAX(8);
   IRExpr** args
      = mkIRExprVec_5( opc4_and_imm, gstOffLe, gstOffRe, edxIN, eaxIN );

   IRTemp   resT = newTemp(Ity_I64);
   IRDirty* d    = unsafeIRDirty_1_N( resT, 0/*regparms*/, nm, fn, args );
   /* It's not really a dirty call, but we can't use the clean helper
      mechanism here for the very lame reason that we can't pass 2 x
      V128s by value to a helper, nor get one back.  Hence this
      roundabout scheme. */
   d->needsBBP = True;
   /* Declare which pieces of guest state the helper touches, so the
      IR optimiser and memcheck see the true dependencies. */
   d->nFxState = 2;
   vex_bzero(&d->fxState, sizeof(d->fxState));
   d->fxState[0].fx     = Ifx_Read;
   d->fxState[0].offset = gstOffL;
   d->fxState[0].size   = sizeof(U128);
   d->fxState[1].fx     = Ifx_Read;
   d->fxState[1].offset = gstOffR;
   d->fxState[1].size   = sizeof(U128);
   if (isxSTRM) {
      /* Declare that the helper writes XMM0. */
      d->nFxState = 3;
      d->fxState[2].fx     = Ifx_Write;
      d->fxState[2].offset = ymmGuestRegOffset(0);
      d->fxState[2].size   = sizeof(U128);
   }

   stmt( IRStmt_Dirty(d) );

   /* Now resT[15:0] holds the new OSZACP values, so the condition
      codes must be updated. And for a xSTRI case, resT[31:16] holds
      the new ECX value, so stash that too. */
   if (!isxSTRM) {
      putIReg64(R_RCX, binop(Iop_And64,
                             binop(Iop_Shr64, mkexpr(resT), mkU8(16)),
                             mkU64(0xFFFF)));
   }

   /* Zap the upper half of the dest reg as per AVX conventions. */
   if (isxSTRM && isAvx)
      putYMMRegLane128(/*YMM*/0, 1, mkV128(0));

   /* Install the helper-computed flag bits into the flags thunk,
      using the COPY operation so they are taken verbatim. */
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop(Iop_And64, mkexpr(resT), mkU64(0xFFFF))
   ));
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

   if (regNoL == 16) {
      DIP("%spcmp%cstr%c $%x,%s,%s\n",
          isAvx ? "v" : "", isISTRx ? 'i' : 'e', isxSTRM ? 'm' : 'i',
          (UInt)imm, dis_buf, nameXMMReg(regNoR));
   } else {
      DIP("%spcmp%cstr%c $%x,%s,%s\n",
          isAvx ? "v" : "", isISTRx ? 'i' : 'e', isxSTRM ? 'm' : 'i',
          (UInt)imm, nameXMMReg(regNoL), nameXMMReg(regNoR));
   }

   return delta;
}
   17008 
   17009 
   17010 static IRTemp math_PINSRB_128 ( IRTemp v128, IRTemp u8, UInt imm8 )
   17011 {
   17012    vassert(imm8 >= 0 && imm8 <= 15);
   17013 
   17014    // Create a V128 value which has the selected byte in the
   17015    // specified lane, and zeroes everywhere else.
   17016    IRTemp tmp128    = newTemp(Ity_V128);
   17017    IRTemp halfshift = newTemp(Ity_I64);
   17018    assign(halfshift, binop(Iop_Shl64,
   17019                            unop(Iop_8Uto64, mkexpr(u8)),
   17020                            mkU8(8 * (imm8 & 7))));
   17021    if (imm8 < 8) {
   17022       assign(tmp128, binop(Iop_64HLtoV128, mkU64(0), mkexpr(halfshift)));
   17023    } else {
   17024       assign(tmp128, binop(Iop_64HLtoV128, mkexpr(halfshift), mkU64(0)));
   17025    }
   17026 
   17027    UShort mask = ~(1 << imm8);
   17028    IRTemp res  = newTemp(Ity_V128);
   17029    assign( res, binop(Iop_OrV128,
   17030                       mkexpr(tmp128),
   17031                       binop(Iop_AndV128, mkexpr(v128), mkV128(mask))) );
   17032    return res;
   17033 }
   17034 
   17035 
   17036 static IRTemp math_PINSRD_128 ( IRTemp v128, IRTemp u32, UInt imm8 )
   17037 {
   17038    IRTemp z32 = newTemp(Ity_I32);
   17039    assign(z32, mkU32(0));
   17040 
   17041    /* Surround u32 with zeroes as per imm, giving us something we can
   17042       OR into a suitably masked-out v128.*/
   17043    IRTemp withZs = newTemp(Ity_V128);
   17044    UShort mask = 0;
   17045    switch (imm8) {
   17046       case 3:  mask = 0x0FFF;
   17047                assign(withZs, mkV128from32s(u32, z32, z32, z32));
   17048                break;
   17049       case 2:  mask = 0xF0FF;
   17050                assign(withZs, mkV128from32s(z32, u32, z32, z32));
   17051                break;
   17052       case 1:  mask = 0xFF0F;
   17053                assign(withZs, mkV128from32s(z32, z32, u32, z32));
   17054                break;
   17055       case 0:  mask = 0xFFF0;
   17056                assign(withZs, mkV128from32s(z32, z32, z32, u32));
   17057                break;
   17058       default: vassert(0);
   17059    }
   17060 
   17061    IRTemp res = newTemp(Ity_V128);
   17062    assign(res, binop( Iop_OrV128,
   17063                       mkexpr(withZs),
   17064                       binop( Iop_AndV128, mkexpr(v128), mkV128(mask) ) ) );
   17065    return res;
   17066 }
   17067 
   17068 
   17069 static IRTemp math_PINSRQ_128 ( IRTemp v128, IRTemp u64, UInt imm8 )
   17070 {
   17071    /* Surround u64 with zeroes as per imm, giving us something we can
   17072       OR into a suitably masked-out v128.*/
   17073    IRTemp withZs = newTemp(Ity_V128);
   17074    UShort mask = 0;
   17075    if (imm8 == 0) {
   17076       mask = 0xFF00;
   17077       assign(withZs, binop(Iop_64HLtoV128, mkU64(0), mkexpr(u64)));
   17078    } else {
   17079       vassert(imm8 == 1);
   17080       mask = 0x00FF;
   17081       assign( withZs, binop(Iop_64HLtoV128, mkexpr(u64), mkU64(0)));
   17082    }
   17083 
   17084    IRTemp res = newTemp(Ity_V128);
   17085    assign( res, binop( Iop_OrV128,
   17086                        mkexpr(withZs),
   17087                        binop( Iop_AndV128, mkexpr(v128), mkV128(mask) ) ) );
   17088    return res;
   17089 }
   17090 
   17091 
   17092 static IRTemp math_INSERTPS ( IRTemp dstV, IRTemp toInsertD, UInt imm8 )
   17093 {
   17094    const IRTemp inval = IRTemp_INVALID;
   17095    IRTemp dstDs[4] = { inval, inval, inval, inval };
   17096    breakupV128to32s( dstV, &dstDs[3], &dstDs[2], &dstDs[1], &dstDs[0] );
   17097 
   17098    vassert(imm8 <= 255);
   17099    dstDs[(imm8 >> 4) & 3] = toInsertD; /* "imm8_count_d" */
   17100 
   17101    UInt imm8_zmask = (imm8 & 15);
   17102    IRTemp zero_32 = newTemp(Ity_I32);
   17103    assign( zero_32, mkU32(0) );
   17104    IRTemp resV = newTemp(Ity_V128);
   17105    assign( resV, mkV128from32s(
   17106                     ((imm8_zmask & 8) == 8) ? zero_32 : dstDs[3],
   17107                     ((imm8_zmask & 4) == 4) ? zero_32 : dstDs[2],
   17108                     ((imm8_zmask & 2) == 2) ? zero_32 : dstDs[1],
   17109                     ((imm8_zmask & 1) == 1) ? zero_32 : dstDs[0]) );
   17110    return resV;
   17111 }
   17112 
   17113 
   17114 static Long dis_PEXTRB_128_GtoE ( VexAbiInfo* vbi, Prefix pfx,
   17115                                   Long delta, Bool isAvx )
   17116 {
   17117    IRTemp addr     = IRTemp_INVALID;
   17118    Int    alen     = 0;
   17119    HChar  dis_buf[50];
   17120    IRTemp xmm_vec  = newTemp(Ity_V128);
   17121    IRTemp sel_lane = newTemp(Ity_I32);
   17122    IRTemp shr_lane = newTemp(Ity_I32);
   17123    UChar* mbV      = isAvx ? "v" : "";
   17124    UChar  modrm    = getUChar(delta);
   17125    IRTemp t3, t2, t1, t0;
   17126    Int    imm8;
   17127    assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
   17128    t3 = t2 = t1 = t0 = IRTemp_INVALID;
   17129    breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
   17130 
   17131    if ( epartIsReg( modrm ) ) {
   17132       imm8 = (Int)getUChar(delta+1);
   17133    } else {
   17134       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   17135       imm8 = (Int)getUChar(delta+alen);
   17136    }
   17137    switch ( (imm8 >> 2) & 3 ) {
   17138       case 0:  assign( sel_lane, mkexpr(t0) ); break;
   17139       case 1:  assign( sel_lane, mkexpr(t1) ); break;
   17140       case 2:  assign( sel_lane, mkexpr(t2) ); break;
   17141       case 3:  assign( sel_lane, mkexpr(t3) ); break;
   17142       default: vassert(0);
   17143    }
   17144    assign( shr_lane,
   17145            binop( Iop_Shr32, mkexpr(sel_lane), mkU8(((imm8 & 3)*8)) ) );
   17146 
   17147    if ( epartIsReg( modrm ) ) {
   17148       putIReg64( eregOfRexRM(pfx,modrm),
   17149                  unop( Iop_32Uto64,
   17150                        binop(Iop_And32, mkexpr(shr_lane), mkU32(255)) ) );
   17151       delta += 1+1;
   17152       DIP( "%spextrb $%d, %s,%s\n", mbV, imm8,
   17153            nameXMMReg( gregOfRexRM(pfx, modrm) ),
   17154            nameIReg64( eregOfRexRM(pfx, modrm) ) );
   17155    } else {
   17156       storeLE( mkexpr(addr), unop(Iop_32to8, mkexpr(shr_lane) ) );
   17157       delta += alen+1;
   17158       DIP( "%spextrb $%d,%s,%s\n", mbV,
   17159            imm8, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
   17160    }
   17161 
   17162    return delta;
   17163 }
   17164 
   17165 
   17166 static IRTemp math_DPPD_128 ( IRTemp src_vec, IRTemp dst_vec, UInt imm8 )
   17167 {
   17168    vassert(imm8 < 256);
   17169    UShort imm8_perms[4] = { 0x0000, 0x00FF, 0xFF00, 0xFFFF };
   17170    IRTemp and_vec = newTemp(Ity_V128);
   17171    IRTemp sum_vec = newTemp(Ity_V128);
   17172    assign( and_vec, binop( Iop_AndV128,
   17173                            binop( Iop_Mul64Fx2,
   17174                                   mkexpr(dst_vec), mkexpr(src_vec) ),
   17175                            mkV128( imm8_perms[ ((imm8 >> 4) & 3) ] ) ) );
   17176 
   17177    assign( sum_vec, binop( Iop_Add64F0x2,
   17178                            binop( Iop_InterleaveHI64x2,
   17179                                   mkexpr(and_vec), mkexpr(and_vec) ),
   17180                            binop( Iop_InterleaveLO64x2,
   17181                                   mkexpr(and_vec), mkexpr(and_vec) ) ) );
   17182    IRTemp res = newTemp(Ity_V128);
   17183    assign(res, binop( Iop_AndV128,
   17184                       binop( Iop_InterleaveLO64x2,
   17185                              mkexpr(sum_vec), mkexpr(sum_vec) ),
   17186                       mkV128( imm8_perms[ (imm8 & 3) ] ) ) );
   17187    return res;
   17188 }
   17189 
   17190 
/* DPPS: dot product of the four single-precision lanes, controlled by
   imm8.  imm8[7:4] select which products contribute to the sum;
   imm8[3:0] select which result lanes receive the sum (the others are
   zeroed). */
static IRTemp math_DPPS_128 ( IRTemp src_vec, IRTemp dst_vec, UInt imm8 )
{
   vassert(imm8 < 256);
   IRTemp tmp_prod_vec = newTemp(Ity_V128);
   IRTemp prod_vec     = newTemp(Ity_V128);
   IRTemp sum_vec      = newTemp(Ity_V128);
   IRTemp v3, v2, v1, v0;
   v3 = v2 = v1 = v0   = IRTemp_INVALID;
   /* 16-bit lane-select values, expanded by mkV128 into full-width
      byte masks: entry i sets byte b iff bit b of i is set, so entry
      (imm8 >> 4) keeps exactly the 32-bit lanes named by imm8[7:4]. */
   UShort imm8_perms[16] = { 0x0000, 0x000F, 0x00F0, 0x00FF, 0x0F00,
                             0x0F0F, 0x0FF0, 0x0FFF, 0xF000, 0xF00F,
                             0xF0F0, 0xF0FF, 0xFF00, 0xFF0F, 0xFFF0,
                             0xFFFF };

   /* Lanewise products, with the lanes deselected by imm8[7:4]
      zeroed out. */
   assign( tmp_prod_vec,
           binop( Iop_AndV128,
                  binop( Iop_Mul32Fx4, mkexpr(dst_vec),
                                       mkexpr(src_vec) ),
                  mkV128( imm8_perms[((imm8 >> 4)& 15)] ) ) );
   breakupV128to32s( tmp_prod_vec, &v3, &v2, &v1, &v0 );
   /* Note: lanes 1 and 2 are deliberately exchanged here (v1 and v2
      appear out of order).  With this layout the interleave-based
      reduction below forms the partial sums (v3+v2) and (v1+v0)
      first, fixing the association order of the FP additions.  Do
      not "fix" the ordering.  NOTE(review): presumed chosen to match
      hardware's pairing; confirm before changing. */
   assign( prod_vec, mkV128from32s( v3, v1, v2, v0 ) );

   /* First reduction step: interleaving prod_vec with itself gives
      [v3,v3,v1,v1] and [v2,v2,v0,v0]; their sum holds (v3+v2)
      duplicated in the high pair and (v1+v0) in the low pair. */
   assign( sum_vec, binop( Iop_Add32Fx4,
                           binop( Iop_InterleaveHI32x4,
                                  mkexpr(prod_vec), mkexpr(prod_vec) ),
                           binop( Iop_InterleaveLO32x4,
                                  mkexpr(prod_vec), mkexpr(prod_vec) ) ) );

   /* Second reduction step: the same trick broadcasts the full dot
      product to all four lanes; then zero the lanes not selected by
      imm8[3:0]. */
   IRTemp res = newTemp(Ity_V128);
   assign( res, binop( Iop_AndV128,
                       binop( Iop_Add32Fx4,
                              binop( Iop_InterleaveHI32x4,
                                     mkexpr(sum_vec), mkexpr(sum_vec) ),
                              binop( Iop_InterleaveLO32x4,
                                     mkexpr(sum_vec), mkexpr(sum_vec) ) ),
                       mkV128( imm8_perms[ (imm8 & 15) ] ) ) );
   return res;
}
   17228 
   17229 
   17230 static IRTemp math_MPSADBW_128 ( IRTemp dst_vec, IRTemp src_vec, UInt imm8 )
   17231 {
   17232    /* Mask out bits of the operands we don't need.  This isn't
   17233       strictly necessary, but it does ensure Memcheck doesn't
   17234       give us any false uninitialised value errors as a
   17235       result. */
   17236    UShort src_mask[4] = { 0x000F, 0x00F0, 0x0F00, 0xF000 };
   17237    UShort dst_mask[2] = { 0x07FF, 0x7FF0 };
   17238 
   17239    IRTemp src_maskV = newTemp(Ity_V128);
   17240    IRTemp dst_maskV = newTemp(Ity_V128);
   17241    assign(src_maskV, mkV128( src_mask[ imm8 & 3 ] ));
   17242    assign(dst_maskV, mkV128( dst_mask[ (imm8 >> 2) & 1 ] ));
   17243 
   17244    IRTemp src_masked = newTemp(Ity_V128);
   17245    IRTemp dst_masked = newTemp(Ity_V128);
   17246    assign(src_masked, binop(Iop_AndV128, mkexpr(src_vec), mkexpr(src_maskV)));
   17247    assign(dst_masked, binop(Iop_AndV128, mkexpr(dst_vec), mkexpr(dst_maskV)));
   17248 
   17249    /* Generate 4 64 bit values that we can hand to a clean helper */
   17250    IRTemp sHi = newTemp(Ity_I64);
   17251    IRTemp sLo = newTemp(Ity_I64);
   17252    assign( sHi, unop(Iop_V128HIto64, mkexpr(src_masked)) );
   17253    assign( sLo, unop(Iop_V128to64,   mkexpr(src_masked)) );
   17254 
   17255    IRTemp dHi = newTemp(Ity_I64);
   17256    IRTemp dLo = newTemp(Ity_I64);
   17257    assign( dHi, unop(Iop_V128HIto64, mkexpr(dst_masked)) );
   17258    assign( dLo, unop(Iop_V128to64,   mkexpr(dst_masked)) );
   17259 
   17260    /* Compute halves of the result separately */
   17261    IRTemp resHi = newTemp(Ity_I64);
   17262    IRTemp resLo = newTemp(Ity_I64);
   17263 
   17264    IRExpr** argsHi
   17265       = mkIRExprVec_5( mkexpr(sHi), mkexpr(sLo), mkexpr(dHi), mkexpr(dLo),
   17266                        mkU64( 0x80 | (imm8 & 7) ));
   17267    IRExpr** argsLo
   17268       = mkIRExprVec_5( mkexpr(sHi), mkexpr(sLo), mkexpr(dHi), mkexpr(dLo),
   17269                        mkU64( 0x00 | (imm8 & 7) ));
   17270 
   17271    assign(resHi, mkIRExprCCall( Ity_I64, 0/*regparm*/,
   17272                                 "amd64g_calc_mpsadbw",
   17273                                 &amd64g_calc_mpsadbw, argsHi ));
   17274    assign(resLo, mkIRExprCCall( Ity_I64, 0/*regparm*/,
   17275                                 "amd64g_calc_mpsadbw",
   17276                                 &amd64g_calc_mpsadbw, argsLo ));
   17277 
   17278    IRTemp res = newTemp(Ity_V128);
   17279    assign(res, binop(Iop_64HLtoV128, mkexpr(resHi), mkexpr(resLo)));
   17280    return res;
   17281 }
   17282 
   17283 static Long dis_EXTRACTPS ( VexAbiInfo* vbi, Prefix pfx,
   17284                             Long delta, Bool isAvx )
   17285 {
   17286    IRTemp addr       = IRTemp_INVALID;
   17287    Int    alen       = 0;
   17288    HChar  dis_buf[50];
   17289    UChar  modrm      = getUChar(delta);
   17290    Int imm8_10;
   17291    IRTemp xmm_vec    = newTemp(Ity_V128);
   17292    IRTemp src_dword  = newTemp(Ity_I32);
   17293    UInt   rG         = gregOfRexRM(pfx,modrm);
   17294    IRTemp t3, t2, t1, t0;
   17295    t3 = t2 = t1 = t0 = IRTemp_INVALID;
   17296 
   17297    assign( xmm_vec, getXMMReg( rG ) );
   17298    breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
   17299 
   17300    if ( epartIsReg( modrm ) ) {
   17301       imm8_10 = (Int)(getUChar(delta+1) & 3);
   17302    } else {
   17303       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   17304       imm8_10 = (Int)(getUChar(delta+alen) & 3);
   17305    }
   17306 
   17307    switch ( imm8_10 ) {
   17308       case 0:  assign( src_dword, mkexpr(t0) ); break;
   17309       case 1:  assign( src_dword, mkexpr(t1) ); break;
   17310       case 2:  assign( src_dword, mkexpr(t2) ); break;
   17311       case 3:  assign( src_dword, mkexpr(t3) ); break;
   17312       default: vassert(0);
   17313    }
   17314 
   17315    if ( epartIsReg( modrm ) ) {
   17316       UInt rE = eregOfRexRM(pfx,modrm);
   17317       putIReg32( rE, mkexpr(src_dword) );
   17318       delta += 1+1;
   17319       DIP( "%sextractps $%d, %s,%s\n", isAvx ? "v" : "", imm8_10,
   17320            nameXMMReg( rG ), nameIReg32( rE ) );
   17321    } else {
   17322       storeLE( mkexpr(addr), mkexpr(src_dword) );
   17323       delta += alen+1;
   17324       DIP( "%sextractps $%d, %s,%s\n", isAvx ? "v" : "", imm8_10,
   17325            nameXMMReg( rG ), dis_buf );
   17326    }
   17327 
   17328    return delta;
   17329 }
   17330 
   17331 
/* PCLMULQDQ: carry-less (GF(2)[x]) multiply of one 64-bit half of dV
   by one 64-bit half of sV, producing a 128-bit result.  Per the
   Intel encoding, imm8 bit 0 selects the half of dV and imm8 bit 4
   selects the half of sV.  The multiplication itself is done by the
   clean helper amd64g_calculate_pclmul, called once per 64-bit half
   of the product. */
static IRTemp math_PCLMULQDQ( IRTemp dV, IRTemp sV, UInt imm8 )
{
   IRTemp t0 = newTemp(Ity_I64);
   IRTemp t1 = newTemp(Ity_I64);
   /* t0 = the half of dV chosen by imm8[0]. */
   assign(t0, unop((imm8&1)? Iop_V128HIto64 : Iop_V128to64,
              mkexpr(dV)));
   /* t1 = the half of sV chosen by imm8[4] (hence the 16, not 2 --
      this asymmetry is per the instruction's imm8 encoding). */
   assign(t1, unop((imm8&16) ? Iop_V128HIto64 : Iop_V128to64,
              mkexpr(sV)));

   IRTemp t2 = newTemp(Ity_I64);
   IRTemp t3 = newTemp(Ity_I64);

   IRExpr** args;

   /* Third helper argument selects which half of the 128-bit product
      to return: t2 (flag 0) is used below as the low half, t3
      (flag 1) as the high half. */
   args = mkIRExprVec_3(mkexpr(t0), mkexpr(t1), mkU64(0));
   assign(t2, mkIRExprCCall(Ity_I64,0, "amd64g_calculate_pclmul",
                            &amd64g_calculate_pclmul, args));
   args = mkIRExprVec_3(mkexpr(t0), mkexpr(t1), mkU64(1));
   assign(t3, mkIRExprCCall(Ity_I64,0, "amd64g_calculate_pclmul",
                            &amd64g_calculate_pclmul, args));

   /* Glue the halves back together: t3 high, t2 low. */
   IRTemp res     = newTemp(Ity_V128);
   assign(res, binop(Iop_64HLtoV128, mkexpr(t3), mkexpr(t2)));
   return res;
}
   17357 
   17358 
   17359 __attribute__((noinline))
   17360 static
   17361 Long dis_ESC_0F3A__SSE4 ( Bool* decode_OK,
   17362                           VexAbiInfo* vbi,
   17363                           Prefix pfx, Int sz, Long deltaIN )
   17364 {
   17365    IRTemp addr  = IRTemp_INVALID;
   17366    UChar  modrm = 0;
   17367    Int    alen  = 0;
   17368    HChar  dis_buf[50];
   17369 
   17370    *decode_OK = False;
   17371 
   17372    Long   delta = deltaIN;
   17373    UChar  opc   = getUChar(delta);
   17374    delta++;
   17375    switch (opc) {
   17376 
   17377    case 0x08:
   17378       /* 66 0F 3A 08 /r ib = ROUNDPS imm8, xmm2/m128, xmm1 */
   17379       if (have66noF2noF3(pfx) && sz == 2) {
   17380 
   17381          IRTemp src0 = newTemp(Ity_F32);
   17382          IRTemp src1 = newTemp(Ity_F32);
   17383          IRTemp src2 = newTemp(Ity_F32);
   17384          IRTemp src3 = newTemp(Ity_F32);
   17385          IRTemp res0 = newTemp(Ity_F32);
   17386          IRTemp res1 = newTemp(Ity_F32);
   17387          IRTemp res2 = newTemp(Ity_F32);
   17388          IRTemp res3 = newTemp(Ity_F32);
   17389          IRTemp rm   = newTemp(Ity_I32);
   17390          Int    imm  = 0;
   17391 
   17392          modrm = getUChar(delta);
   17393 
   17394          if (epartIsReg(modrm)) {
   17395             assign( src0,
   17396                     getXMMRegLane32F( eregOfRexRM(pfx, modrm), 0 ) );
   17397             assign( src1,
   17398                     getXMMRegLane32F( eregOfRexRM(pfx, modrm), 1 ) );
   17399             assign( src2,
   17400                     getXMMRegLane32F( eregOfRexRM(pfx, modrm), 2 ) );
   17401             assign( src3,
   17402                     getXMMRegLane32F( eregOfRexRM(pfx, modrm), 3 ) );
   17403             imm = getUChar(delta+1);
   17404             if (imm & ~15) goto decode_failure;
   17405             delta += 1+1;
   17406             DIP( "roundps $%d,%s,%s\n",
   17407                  imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
   17408                       nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   17409          } else {
   17410             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   17411             gen_SEGV_if_not_16_aligned(addr);
   17412             assign( src0, loadLE(Ity_F32,
   17413                                  binop(Iop_Add64, mkexpr(addr), mkU64(0) )));
   17414             assign( src1, loadLE(Ity_F32,
   17415                                  binop(Iop_Add64, mkexpr(addr), mkU64(4) )));
   17416             assign( src2, loadLE(Ity_F32,
   17417                                  binop(Iop_Add64, mkexpr(addr), mkU64(8) )));
   17418             assign( src3, loadLE(Ity_F32,
   17419                                  binop(Iop_Add64, mkexpr(addr), mkU64(12) )));
   17420             imm = getUChar(delta+alen);
   17421             if (imm & ~15) goto decode_failure;
   17422             delta += alen+1;
   17423             DIP( "roundps $%d,%s,%s\n",
   17424                  imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   17425          }
   17426 
   17427          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   17428             that encoding is the same as the encoding for IRRoundingMode,
   17429             we can use that value directly in the IR as a rounding
   17430             mode. */
   17431          assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
   17432 
   17433          assign(res0, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src0)) );
   17434          assign(res1, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src1)) );
   17435          assign(res2, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src2)) );
   17436          assign(res3, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src3)) );
   17437 
   17438          putXMMRegLane32F( gregOfRexRM(pfx, modrm), 0, mkexpr(res0) );
   17439          putXMMRegLane32F( gregOfRexRM(pfx, modrm), 1, mkexpr(res1) );
   17440          putXMMRegLane32F( gregOfRexRM(pfx, modrm), 2, mkexpr(res2) );
   17441          putXMMRegLane32F( gregOfRexRM(pfx, modrm), 3, mkexpr(res3) );
   17442 
   17443          goto decode_success;
   17444       }
   17445       break;
   17446 
   17447    case 0x09:
   17448       /* 66 0F 3A 09 /r ib = ROUNDPD imm8, xmm2/m128, xmm1 */
   17449       if (have66noF2noF3(pfx) && sz == 2) {
   17450 
   17451          IRTemp src0 = newTemp(Ity_F64);
   17452          IRTemp src1 = newTemp(Ity_F64);
   17453          IRTemp res0 = newTemp(Ity_F64);
   17454          IRTemp res1 = newTemp(Ity_F64);
   17455          IRTemp rm   = newTemp(Ity_I32);
   17456          Int    imm  = 0;
   17457 
   17458          modrm = getUChar(delta);
   17459 
   17460          if (epartIsReg(modrm)) {
   17461             assign( src0,
   17462                     getXMMRegLane64F( eregOfRexRM(pfx, modrm), 0 ) );
   17463             assign( src1,
   17464                     getXMMRegLane64F( eregOfRexRM(pfx, modrm), 1 ) );
   17465             imm = getUChar(delta+1);
   17466             if (imm & ~15) goto decode_failure;
   17467             delta += 1+1;
   17468             DIP( "roundpd $%d,%s,%s\n",
   17469                  imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
   17470                       nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   17471          } else {
   17472             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   17473             gen_SEGV_if_not_16_aligned(addr);
   17474             assign( src0, loadLE(Ity_F64,
   17475                                  binop(Iop_Add64, mkexpr(addr), mkU64(0) )));
   17476             assign( src1, loadLE(Ity_F64,
   17477                                  binop(Iop_Add64, mkexpr(addr), mkU64(8) )));
   17478             imm = getUChar(delta+alen);
   17479             if (imm & ~15) goto decode_failure;
   17480             delta += alen+1;
   17481             DIP( "roundpd $%d,%s,%s\n",
   17482                  imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   17483          }
   17484 
   17485          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   17486             that encoding is the same as the encoding for IRRoundingMode,
   17487             we can use that value directly in the IR as a rounding
   17488             mode. */
   17489          assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
   17490 
   17491          assign(res0, binop(Iop_RoundF64toInt, mkexpr(rm), mkexpr(src0)) );
   17492          assign(res1, binop(Iop_RoundF64toInt, mkexpr(rm), mkexpr(src1)) );
   17493 
   17494          putXMMRegLane64F( gregOfRexRM(pfx, modrm), 0, mkexpr(res0) );
   17495          putXMMRegLane64F( gregOfRexRM(pfx, modrm), 1, mkexpr(res1) );
   17496 
   17497          goto decode_success;
   17498       }
   17499       break;
   17500 
   17501    case 0x0A:
   17502    case 0x0B:
   17503       /* 66 0F 3A 0A /r ib = ROUNDSS imm8, xmm2/m32, xmm1
   17504          66 0F 3A 0B /r ib = ROUNDSD imm8, xmm2/m64, xmm1
   17505       */
   17506       if (have66noF2noF3(pfx) && sz == 2) {
   17507 
   17508          Bool   isD = opc == 0x0B;
   17509          IRTemp src = newTemp(isD ? Ity_F64 : Ity_F32);
   17510          IRTemp res = newTemp(isD ? Ity_F64 : Ity_F32);
   17511          Int    imm = 0;
   17512 
   17513          modrm = getUChar(delta);
   17514 
   17515          if (epartIsReg(modrm)) {
   17516             assign( src,
   17517                     isD ? getXMMRegLane64F( eregOfRexRM(pfx, modrm), 0 )
   17518                         : getXMMRegLane32F( eregOfRexRM(pfx, modrm), 0 ) );
   17519             imm = getUChar(delta+1);
   17520             if (imm & ~15) goto decode_failure;
   17521             delta += 1+1;
   17522             DIP( "rounds%c $%d,%s,%s\n",
   17523                  isD ? 'd' : 's',
   17524                  imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
   17525                       nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   17526          } else {
   17527             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   17528             assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
   17529             imm = getUChar(delta+alen);
   17530             if (imm & ~15) goto decode_failure;
   17531             delta += alen+1;
   17532             DIP( "rounds%c $%d,%s,%s\n",
   17533                  isD ? 'd' : 's',
   17534                  imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   17535          }
   17536 
   17537          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   17538             that encoding is the same as the encoding for IRRoundingMode,
   17539             we can use that value directly in the IR as a rounding
   17540             mode. */
   17541          assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
   17542                            (imm & 4) ? get_sse_roundingmode()
   17543                                      : mkU32(imm & 3),
   17544                            mkexpr(src)) );
   17545 
   17546          if (isD)
   17547             putXMMRegLane64F( gregOfRexRM(pfx, modrm), 0, mkexpr(res) );
   17548          else
   17549             putXMMRegLane32F( gregOfRexRM(pfx, modrm), 0, mkexpr(res) );
   17550 
   17551          goto decode_success;
   17552       }
   17553       break;
   17554 
   17555    case 0x0C:
   17556       /* 66 0F 3A 0C /r ib = BLENDPS xmm1, xmm2/m128, imm8
   17557          Blend Packed Single Precision Floating-Point Values (XMM) */
   17558       if (have66noF2noF3(pfx) && sz == 2) {
   17559 
   17560          Int imm8;
   17561          IRTemp dst_vec = newTemp(Ity_V128);
   17562          IRTemp src_vec = newTemp(Ity_V128);
   17563 
   17564          modrm = getUChar(delta);
   17565 
   17566          assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
   17567 
   17568          if ( epartIsReg( modrm ) ) {
   17569             imm8 = (Int)getUChar(delta+1);
   17570             assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
   17571             delta += 1+1;
   17572             DIP( "blendps $%d, %s,%s\n", imm8,
   17573                  nameXMMReg( eregOfRexRM(pfx, modrm) ),
   17574                  nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   17575          } else {
   17576             addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   17577                              1/* imm8 is 1 byte after the amode */ );
   17578             gen_SEGV_if_not_16_aligned( addr );
   17579             assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
   17580             imm8 = (Int)getUChar(delta+alen);
   17581             delta += alen+1;
   17582             DIP( "blendpd $%d, %s,%s\n",
   17583                  imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   17584          }
   17585 
   17586          putXMMReg( gregOfRexRM(pfx, modrm),
   17587                     mkexpr( math_BLENDPS_128( src_vec, dst_vec, imm8) ) );
   17588          goto decode_success;
   17589       }
   17590       break;
   17591 
   17592    case 0x0D:
   17593       /* 66 0F 3A 0D /r ib = BLENDPD xmm1, xmm2/m128, imm8
   17594          Blend Packed Double Precision Floating-Point Values (XMM) */
   17595       if (have66noF2noF3(pfx) && sz == 2) {
   17596 
   17597          Int imm8;
   17598          IRTemp dst_vec = newTemp(Ity_V128);
   17599          IRTemp src_vec = newTemp(Ity_V128);
   17600 
   17601          modrm = getUChar(delta);
   17602          assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
   17603 
   17604          if ( epartIsReg( modrm ) ) {
   17605             imm8 = (Int)getUChar(delta+1);
   17606             assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
   17607             delta += 1+1;
   17608             DIP( "blendpd $%d, %s,%s\n", imm8,
   17609                  nameXMMReg( eregOfRexRM(pfx, modrm) ),
   17610                  nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   17611          } else {
   17612             addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   17613                              1/* imm8 is 1 byte after the amode */ );
   17614             gen_SEGV_if_not_16_aligned( addr );
   17615             assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
   17616             imm8 = (Int)getUChar(delta+alen);
   17617             delta += alen+1;
   17618             DIP( "blendpd $%d, %s,%s\n",
   17619                  imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   17620          }
   17621 
   17622          putXMMReg( gregOfRexRM(pfx, modrm),
   17623                     mkexpr( math_BLENDPD_128( src_vec, dst_vec, imm8) ) );
   17624          goto decode_success;
   17625       }
   17626       break;
   17627 
   17628    case 0x0E:
   17629       /* 66 0F 3A 0E /r ib = PBLENDW xmm1, xmm2/m128, imm8
   17630          Blend Packed Words (XMM) */
   17631       if (have66noF2noF3(pfx) && sz == 2) {
   17632 
   17633          Int imm8;
   17634          IRTemp dst_vec = newTemp(Ity_V128);
   17635          IRTemp src_vec = newTemp(Ity_V128);
   17636 
   17637          modrm = getUChar(delta);
   17638 
   17639          assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
   17640 
   17641          if ( epartIsReg( modrm ) ) {
   17642             imm8 = (Int)getUChar(delta+1);
   17643             assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
   17644             delta += 1+1;
   17645             DIP( "pblendw $%d, %s,%s\n", imm8,
   17646                  nameXMMReg( eregOfRexRM(pfx, modrm) ),
   17647                  nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   17648          } else {
   17649             addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   17650                              1/* imm8 is 1 byte after the amode */ );
   17651             gen_SEGV_if_not_16_aligned( addr );
   17652             assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
   17653             imm8 = (Int)getUChar(delta+alen);
   17654             delta += alen+1;
   17655             DIP( "pblendw $%d, %s,%s\n",
   17656                  imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   17657          }
   17658 
   17659          putXMMReg( gregOfRexRM(pfx, modrm),
   17660                     mkexpr( math_PBLENDW_128( src_vec, dst_vec, imm8) ) );
   17661          goto decode_success;
   17662       }
   17663       break;
   17664 
   17665    case 0x14:
   17666       /* 66 0F 3A 14 /r ib = PEXTRB r/m16, xmm, imm8
   17667          Extract Byte from xmm, store in mem or zero-extend + store in gen.reg.
   17668          (XMM) */
   17669       if (have66noF2noF3(pfx) && sz == 2) {
   17670          delta = dis_PEXTRB_128_GtoE( vbi, pfx, delta, False/*!isAvx*/ );
   17671          goto decode_success;
   17672       }
   17673       break;
   17674 
   17675    case 0x15:
   17676       /* 66 0F 3A 15 /r ib = PEXTRW r/m16, xmm, imm8
   17677          Extract Word from xmm, store in mem or zero-extend + store in gen.reg.
   17678          (XMM) */
   17679       if (have66noF2noF3(pfx) && sz == 2) {
   17680          delta = dis_PEXTRW( vbi, pfx, delta, False/*!isAvx*/ );
   17681          goto decode_success;
   17682       }
   17683       break;
   17684 
   17685    case 0x16:
   17686       /* 66 no-REX.W 0F 3A 16 /r ib = PEXTRD reg/mem32, xmm2, imm8
   17687          Extract Doubleword int from xmm reg and store in gen.reg or mem. (XMM)
   17688          Note that this insn has the same opcodes as PEXTRQ, but
   17689          here the REX.W bit is _not_ present */
   17690       if (have66noF2noF3(pfx)
   17691           && sz == 2 /* REX.W is _not_ present */) {
   17692          delta = dis_PEXTRD( vbi, pfx, delta, False/*!isAvx*/ );
   17693          goto decode_success;
   17694       }
   17695       /* 66 REX.W 0F 3A 16 /r ib = PEXTRQ reg/mem64, xmm2, imm8
   17696          Extract Quadword int from xmm reg and store in gen.reg or mem. (XMM)
   17697          Note that this insn has the same opcodes as PEXTRD, but
   17698          here the REX.W bit is present */
   17699       if (have66noF2noF3(pfx)
   17700           && sz == 8 /* REX.W is present */) {
   17701          delta = dis_PEXTRQ( vbi, pfx, delta, False/*!isAvx*/);
   17702          goto decode_success;
   17703       }
   17704       break;
   17705 
   17706    case 0x17:
   17707       /* 66 0F 3A 17 /r ib = EXTRACTPS reg/mem32, xmm2, imm8 Extract
   17708          float from xmm reg and store in gen.reg or mem.  This is
   17709          identical to PEXTRD, except that REX.W appears to be ignored.
   17710       */
   17711       if (have66noF2noF3(pfx)
   17712           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   17713          delta = dis_EXTRACTPS( vbi, pfx, delta, False/*!isAvx*/ );
   17714          goto decode_success;
   17715       }
   17716       break;
   17717 
   17718    case 0x20:
   17719       /* 66 0F 3A 20 /r ib = PINSRB xmm1, r32/m8, imm8
   17720          Extract byte from r32/m8 and insert into xmm1 */
   17721       if (have66noF2noF3(pfx) && sz == 2) {
   17722          Int    imm8;
   17723          IRTemp new8 = newTemp(Ity_I8);
   17724          modrm = getUChar(delta);
   17725          UInt rG = gregOfRexRM(pfx, modrm);
   17726          if ( epartIsReg( modrm ) ) {
   17727             UInt rE = eregOfRexRM(pfx,modrm);
   17728             imm8 = (Int)(getUChar(delta+1) & 0xF);
   17729             assign( new8, unop(Iop_32to8, getIReg32(rE)) );
   17730             delta += 1+1;
   17731             DIP( "pinsrb $%d,%s,%s\n", imm8,
   17732                  nameIReg32(rE), nameXMMReg(rG) );
   17733          } else {
   17734             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   17735             imm8 = (Int)(getUChar(delta+alen) & 0xF);
   17736             assign( new8, loadLE( Ity_I8, mkexpr(addr) ) );
   17737             delta += alen+1;
   17738             DIP( "pinsrb $%d,%s,%s\n",
   17739                  imm8, dis_buf, nameXMMReg(rG) );
   17740          }
   17741          IRTemp src_vec = newTemp(Ity_V128);
   17742          assign(src_vec, getXMMReg( gregOfRexRM(pfx, modrm) ));
   17743          IRTemp res = math_PINSRB_128( src_vec, new8, imm8 );
   17744          putXMMReg( rG, mkexpr(res) );
   17745          goto decode_success;
   17746       }
   17747       break;
   17748 
   17749    case 0x21:
   17750       /* 66 0F 3A 21 /r ib = INSERTPS imm8, xmm2/m32, xmm1
   17751          Insert Packed Single Precision Floating-Point Value (XMM) */
   17752       if (have66noF2noF3(pfx) && sz == 2) {
   17753          UInt   imm8;
   17754          IRTemp d2ins = newTemp(Ity_I32); /* comes from the E part */
   17755          const IRTemp inval = IRTemp_INVALID;
   17756 
   17757          modrm = getUChar(delta);
   17758          UInt rG = gregOfRexRM(pfx, modrm);
   17759 
   17760          if ( epartIsReg( modrm ) ) {
   17761             UInt   rE = eregOfRexRM(pfx, modrm);
   17762             IRTemp vE = newTemp(Ity_V128);
   17763             assign( vE, getXMMReg(rE) );
   17764             IRTemp dsE[4] = { inval, inval, inval, inval };
   17765             breakupV128to32s( vE, &dsE[3], &dsE[2], &dsE[1], &dsE[0] );
   17766             imm8 = getUChar(delta+1);
   17767             d2ins = dsE[(imm8 >> 6) & 3]; /* "imm8_count_s" */
   17768             delta += 1+1;
   17769             DIP( "insertps $%u, %s,%s\n",
   17770                  imm8, nameXMMReg(rE), nameXMMReg(rG) );
   17771          } else {
   17772             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   17773             assign( d2ins, loadLE( Ity_I32, mkexpr(addr) ) );
   17774             imm8 = getUChar(delta+alen);
   17775             delta += alen+1;
   17776             DIP( "insertps $%u, %s,%s\n",
   17777                  imm8, dis_buf, nameXMMReg(rG) );
   17778          }
   17779 
   17780          IRTemp vG = newTemp(Ity_V128);
   17781          assign( vG, getXMMReg(rG) );
   17782 
   17783          putXMMReg( rG, mkexpr(math_INSERTPS( vG, d2ins, imm8 )) );
   17784          goto decode_success;
   17785       }
   17786       break;
   17787 
   17788    case 0x22:
   17789       /* 66 no-REX.W 0F 3A 22 /r ib = PINSRD xmm1, r/m32, imm8
   17790          Extract Doubleword int from gen.reg/mem32 and insert into xmm1 */
   17791       if (have66noF2noF3(pfx)
   17792           && sz == 2 /* REX.W is NOT present */) {
   17793          Int    imm8_10;
   17794          IRTemp src_u32 = newTemp(Ity_I32);
   17795          modrm = getUChar(delta);
   17796          UInt rG = gregOfRexRM(pfx, modrm);
   17797 
   17798          if ( epartIsReg( modrm ) ) {
   17799             UInt rE = eregOfRexRM(pfx,modrm);
   17800             imm8_10 = (Int)(getUChar(delta+1) & 3);
   17801             assign( src_u32, getIReg32( rE ) );
   17802             delta += 1+1;
   17803             DIP( "pinsrd $%d, %s,%s\n",
   17804                  imm8_10, nameIReg32(rE), nameXMMReg(rG) );
   17805          } else {
   17806             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   17807             imm8_10 = (Int)(getUChar(delta+alen) & 3);
   17808             assign( src_u32, loadLE( Ity_I32, mkexpr(addr) ) );
   17809             delta += alen+1;
   17810             DIP( "pinsrd $%d, %s,%s\n",
   17811                  imm8_10, dis_buf, nameXMMReg(rG) );
   17812          }
   17813 
   17814          IRTemp src_vec = newTemp(Ity_V128);
   17815          assign(src_vec, getXMMReg( rG ));
   17816          IRTemp res_vec = math_PINSRD_128( src_vec, src_u32, imm8_10 );
   17817          putXMMReg( rG, mkexpr(res_vec) );
   17818          goto decode_success;
   17819       }
   17820       /* 66 REX.W 0F 3A 22 /r ib = PINSRQ xmm1, r/m64, imm8
   17821          Extract Quadword int from gen.reg/mem64 and insert into xmm1 */
   17822       if (have66noF2noF3(pfx)
   17823           && sz == 8 /* REX.W is present */) {
   17824          Int imm8_0;
   17825          IRTemp src_u64 = newTemp(Ity_I64);
   17826          modrm = getUChar(delta);
   17827          UInt rG = gregOfRexRM(pfx, modrm);
   17828 
   17829          if ( epartIsReg( modrm ) ) {
   17830             UInt rE = eregOfRexRM(pfx,modrm);
   17831             imm8_0 = (Int)(getUChar(delta+1) & 1);
   17832             assign( src_u64, getIReg64( rE ) );
   17833             delta += 1+1;
   17834             DIP( "pinsrq $%d, %s,%s\n",
   17835                  imm8_0, nameIReg64(rE), nameXMMReg(rG) );
   17836          } else {
   17837             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   17838             imm8_0 = (Int)(getUChar(delta+alen) & 1);
   17839             assign( src_u64, loadLE( Ity_I64, mkexpr(addr) ) );
   17840             delta += alen+1;
   17841             DIP( "pinsrq $%d, %s,%s\n",
   17842                  imm8_0, dis_buf, nameXMMReg(rG) );
   17843          }
   17844 
   17845          IRTemp src_vec = newTemp(Ity_V128);
   17846          assign(src_vec, getXMMReg( rG ));
   17847          IRTemp res_vec = math_PINSRQ_128( src_vec, src_u64, imm8_0 );
   17848          putXMMReg( rG, mkexpr(res_vec) );
   17849          goto decode_success;
   17850       }
   17851       break;
   17852 
   17853    case 0x40:
   17854       /* 66 0F 3A 40 /r ib = DPPS xmm1, xmm2/m128, imm8
   17855          Dot Product of Packed Single Precision Floating-Point Values (XMM) */
   17856       if (have66noF2noF3(pfx) && sz == 2) {
   17857          modrm = getUChar(delta);
   17858          Int    imm8;
   17859          IRTemp src_vec = newTemp(Ity_V128);
   17860          IRTemp dst_vec = newTemp(Ity_V128);
   17861          UInt   rG      = gregOfRexRM(pfx, modrm);
   17862          assign( dst_vec, getXMMReg( rG ) );
   17863          if ( epartIsReg( modrm ) ) {
   17864             UInt rE = eregOfRexRM(pfx, modrm);
   17865             imm8 = (Int)getUChar(delta+1);
   17866             assign( src_vec, getXMMReg(rE) );
   17867             delta += 1+1;
   17868             DIP( "dpps $%d, %s,%s\n",
   17869                  imm8, nameXMMReg(rE), nameXMMReg(rG) );
   17870          } else {
   17871             addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   17872                              1/* imm8 is 1 byte after the amode */ );
   17873             gen_SEGV_if_not_16_aligned( addr );
   17874             assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
   17875             imm8 = (Int)getUChar(delta+alen);
   17876             delta += alen+1;
   17877             DIP( "dpps $%d, %s,%s\n",
   17878                  imm8, dis_buf, nameXMMReg(rG) );
   17879          }
   17880          IRTemp res = math_DPPS_128( src_vec, dst_vec, imm8 );
   17881          putXMMReg( rG, mkexpr(res) );
   17882          goto decode_success;
   17883       }
   17884       break;
   17885 
   17886    case 0x41:
   17887       /* 66 0F 3A 41 /r ib = DPPD xmm1, xmm2/m128, imm8
   17888          Dot Product of Packed Double Precision Floating-Point Values (XMM) */
   17889       if (have66noF2noF3(pfx) && sz == 2) {
   17890          modrm = getUChar(delta);
   17891          Int    imm8;
   17892          IRTemp src_vec = newTemp(Ity_V128);
   17893          IRTemp dst_vec = newTemp(Ity_V128);
   17894          UInt   rG      = gregOfRexRM(pfx, modrm);
   17895          assign( dst_vec, getXMMReg( rG ) );
   17896          if ( epartIsReg( modrm ) ) {
   17897             UInt rE = eregOfRexRM(pfx, modrm);
   17898             imm8 = (Int)getUChar(delta+1);
   17899             assign( src_vec, getXMMReg(rE) );
   17900             delta += 1+1;
   17901             DIP( "dppd $%d, %s,%s\n",
   17902                  imm8, nameXMMReg(rE), nameXMMReg(rG) );
   17903          } else {
   17904             addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   17905                              1/* imm8 is 1 byte after the amode */ );
   17906             gen_SEGV_if_not_16_aligned( addr );
   17907             assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
   17908             imm8 = (Int)getUChar(delta+alen);
   17909             delta += alen+1;
   17910             DIP( "dppd $%d, %s,%s\n",
   17911                  imm8, dis_buf, nameXMMReg(rG) );
   17912          }
   17913          IRTemp res = math_DPPD_128( src_vec, dst_vec, imm8 );
   17914          putXMMReg( rG, mkexpr(res) );
   17915          goto decode_success;
   17916       }
   17917       break;
   17918 
   17919    case 0x42:
   17920       /* 66 0F 3A 42 /r ib = MPSADBW xmm1, xmm2/m128, imm8
   17921          Multiple Packed Sums of Absolule Difference (XMM) */
   17922          Multiple Packed Sums of Absolute Differences (XMM) */
   17923          Int    imm8;
   17924          IRTemp src_vec = newTemp(Ity_V128);
   17925          IRTemp dst_vec = newTemp(Ity_V128);
   17926          modrm          = getUChar(delta);
   17927          UInt   rG      = gregOfRexRM(pfx, modrm);
   17928 
   17929          assign( dst_vec, getXMMReg(rG) );
   17930 
   17931          if ( epartIsReg( modrm ) ) {
   17932             UInt rE = eregOfRexRM(pfx, modrm);
   17933 
   17934             imm8 = (Int)getUChar(delta+1);
   17935             assign( src_vec, getXMMReg(rE) );
   17936             delta += 1+1;
   17937             DIP( "mpsadbw $%d, %s,%s\n", imm8,
   17938                  nameXMMReg(rE), nameXMMReg(rG) );
   17939          } else {
   17940             addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   17941                              1/* imm8 is 1 byte after the amode */ );
   17942             gen_SEGV_if_not_16_aligned( addr );
   17943             assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
   17944             imm8 = (Int)getUChar(delta+alen);
   17945             delta += alen+1;
   17946             DIP( "mpsadbw $%d, %s,%s\n", imm8, dis_buf, nameXMMReg(rG) );
   17947          }
   17948 
   17949          putXMMReg( rG, mkexpr( math_MPSADBW_128(dst_vec, src_vec, imm8) ) );
   17950          goto decode_success;
   17951       }
   17952       break;
   17953 
   17954    case 0x44:
   17955       /* 66 0F 3A 44 /r ib = PCLMULQDQ xmm1, xmm2/m128, imm8
   17956        * Carry-less multiplication of selected XMM quadwords into XMM
   17957        * registers (a.k.a multiplication of polynomials over GF(2))
   17958        */
   17959       if (have66noF2noF3(pfx) && sz == 2) {
   17960 
   17961          Int imm8;
   17962          IRTemp svec = newTemp(Ity_V128);
   17963          IRTemp dvec = newTemp(Ity_V128);
   17964          modrm       = getUChar(delta);
   17965          UInt   rG   = gregOfRexRM(pfx, modrm);
   17966 
   17967          assign( dvec, getXMMReg(rG) );
   17968 
   17969          if ( epartIsReg( modrm ) ) {
   17970             UInt rE = eregOfRexRM(pfx, modrm);
   17971             imm8 = (Int)getUChar(delta+1);
   17972             assign( svec, getXMMReg(rE) );
   17973             delta += 1+1;
   17974             DIP( "pclmulqdq $%d, %s,%s\n", imm8,
   17975                  nameXMMReg(rE), nameXMMReg(rG) );
   17976          } else {
   17977             addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   17978                              1/* imm8 is 1 byte after the amode */ );
   17979             gen_SEGV_if_not_16_aligned( addr );
   17980             assign( svec, loadLE( Ity_V128, mkexpr(addr) ) );
   17981             imm8 = (Int)getUChar(delta+alen);
   17982             delta += alen+1;
   17983             DIP( "pclmulqdq $%d, %s,%s\n",
   17984                  imm8, dis_buf, nameXMMReg(rG) );
   17985          }
   17986 
   17987          putXMMReg( rG, mkexpr( math_PCLMULQDQ(dvec, svec, imm8) ) );
   17988          goto decode_success;
   17989       }
   17990       break;
   17991 
   17992    case 0x60:
   17993    case 0x61:
   17994    case 0x62:
   17995    case 0x63:
   17996       /* 66 0F 3A 63 /r ib = PCMPISTRI imm8, xmm2/m128, xmm1
   17997          66 0F 3A 62 /r ib = PCMPISTRM imm8, xmm2/m128, xmm1
   17998          66 0F 3A 61 /r ib = PCMPESTRI imm8, xmm2/m128, xmm1
   17999          66 0F 3A 60 /r ib = PCMPESTRM imm8, xmm2/m128, xmm1
   18000          (selected special cases that actually occur in glibc,
   18001           not by any means a complete implementation.)
   18002       */
   18003       if (have66noF2noF3(pfx) && sz == 2) {
   18004          Long delta0 = delta;
   18005          delta = dis_PCMPxSTRx( vbi, pfx, delta, False/*!isAvx*/, opc );
   18006          if (delta > delta0) goto decode_success;
   18007          /* else fall though; dis_PCMPxSTRx failed to decode it */
   18008          /* else fall through; dis_PCMPxSTRx failed to decode it */
   18009       break;
   18010 
   18011    case 0xDF:
   18012       /* 66 0F 3A DF /r ib = AESKEYGENASSIST imm8, xmm2/m128, xmm1 */
   18013       if (have66noF2noF3(pfx) && sz == 2) {
   18014          delta = dis_AESKEYGENASSIST( vbi, pfx, delta, False/*!isAvx*/ );
   18015          goto decode_success;
   18016       }
   18017       break;
   18018 
   18019    default:
   18020       break;
   18021 
   18022    }
   18023 
   18024   decode_failure:
   18025    *decode_OK = False;
   18026    return deltaIN;
   18027 
   18028   decode_success:
   18029    *decode_OK = True;
   18030    return delta;
   18031 }
   18032 
   18033 
   18034 /*------------------------------------------------------------*/
   18035 /*---                                                      ---*/
   18036 /*--- Top-level post-escape decoders: dis_ESC_NONE         ---*/
   18037 /*---                                                      ---*/
   18038 /*------------------------------------------------------------*/
   18039 
   18040 __attribute__((noinline))
   18041 static
   18042 Long dis_ESC_NONE (
   18043         /*MB_OUT*/DisResult* dres,
   18044         /*MB_OUT*/Bool*      expect_CAS,
   18045         Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
   18046         Bool         resteerCisOk,
   18047         void*        callback_opaque,
   18048         VexArchInfo* archinfo,
   18049         VexAbiInfo*  vbi,
   18050         Prefix pfx, Int sz, Long deltaIN
   18051      )
   18052 {
   18053    Long   d64   = 0;
   18054    UChar  abyte = 0;
   18055    IRTemp addr  = IRTemp_INVALID;
   18056    IRTemp t1    = IRTemp_INVALID;
   18057    IRTemp t2    = IRTemp_INVALID;
   18058    IRTemp t3    = IRTemp_INVALID;
   18059    IRTemp t4    = IRTemp_INVALID;
   18060    IRTemp t5    = IRTemp_INVALID;
   18061    IRType ty    = Ity_INVALID;
   18062    UChar  modrm = 0;
   18063    Int    am_sz = 0;
   18064    Int    d_sz  = 0;
   18065    Int    alen  = 0;
   18066    HChar  dis_buf[50];
   18067 
   18068    Long   delta = deltaIN;
   18069    UChar  opc   = getUChar(delta);
   18070    delta++;
   18071    switch (opc) {
   18072 
   18073    case 0x00: /* ADD Gb,Eb */
   18074       if (haveF2orF3(pfx)) goto decode_failure;
   18075       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Add8, True, 1, delta, "add" );
   18076       return delta;
   18077    case 0x01: /* ADD Gv,Ev */
   18078       if (haveF2orF3(pfx)) goto decode_failure;
   18079       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Add8, True, sz, delta, "add" );
   18080       return delta;
   18081 
   18082    case 0x02: /* ADD Eb,Gb */
   18083       if (haveF2orF3(pfx)) goto decode_failure;
   18084       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Add8, True, 1, delta, "add" );
   18085       return delta;
   18086    case 0x03: /* ADD Ev,Gv */
   18087       if (haveF2orF3(pfx)) goto decode_failure;
   18088       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Add8, True, sz, delta, "add" );
   18089       return delta;
   18090 
   18091    case 0x04: /* ADD Ib, AL */
   18092       if (haveF2orF3(pfx)) goto decode_failure;
   18093       delta = dis_op_imm_A( 1, False, Iop_Add8, True, delta, "add" );
   18094       return delta;
   18095    case 0x05: /* ADD Iv, eAX */
   18096       if (haveF2orF3(pfx)) goto decode_failure;
   18097       delta = dis_op_imm_A(sz, False, Iop_Add8, True, delta, "add" );
   18098       return delta;
   18099 
   18100    case 0x08: /* OR Gb,Eb */
   18101       if (haveF2orF3(pfx)) goto decode_failure;
   18102       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Or8, True, 1, delta, "or" );
   18103       return delta;
   18104    case 0x09: /* OR Gv,Ev */
   18105       if (haveF2orF3(pfx)) goto decode_failure;
   18106       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Or8, True, sz, delta, "or" );
   18107       return delta;
   18108 
   18109    case 0x0A: /* OR Eb,Gb */
   18110       if (haveF2orF3(pfx)) goto decode_failure;
   18111       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Or8, True, 1, delta, "or" );
   18112       return delta;
   18113    case 0x0B: /* OR Ev,Gv */
   18114       if (haveF2orF3(pfx)) goto decode_failure;
   18115       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Or8, True, sz, delta, "or" );
   18116       return delta;
   18117 
   18118    case 0x0C: /* OR Ib, AL */
   18119       if (haveF2orF3(pfx)) goto decode_failure;
   18120       delta = dis_op_imm_A( 1, False, Iop_Or8, True, delta, "or" );
   18121       return delta;
   18122    case 0x0D: /* OR Iv, eAX */
   18123       if (haveF2orF3(pfx)) goto decode_failure;
   18124       delta = dis_op_imm_A( sz, False, Iop_Or8, True, delta, "or" );
   18125       return delta;
   18126 
   18127    case 0x10: /* ADC Gb,Eb */
   18128       if (haveF2orF3(pfx)) goto decode_failure;
   18129       delta = dis_op2_G_E ( vbi, pfx, True, Iop_Add8, True, 1, delta, "adc" );
   18130       return delta;
   18131    case 0x11: /* ADC Gv,Ev */
   18132       if (haveF2orF3(pfx)) goto decode_failure;
   18133       delta = dis_op2_G_E ( vbi, pfx, True, Iop_Add8, True, sz, delta, "adc" );
   18134       return delta;
   18135 
   18136    case 0x12: /* ADC Eb,Gb */
   18137       if (haveF2orF3(pfx)) goto decode_failure;
   18138       delta = dis_op2_E_G ( vbi, pfx, True, Iop_Add8, True, 1, delta, "adc" );
   18139       return delta;
   18140    case 0x13: /* ADC Ev,Gv */
   18141       if (haveF2orF3(pfx)) goto decode_failure;
   18142       delta = dis_op2_E_G ( vbi, pfx, True, Iop_Add8, True, sz, delta, "adc" );
   18143       return delta;
   18144 
   18145    case 0x14: /* ADC Ib, AL */
   18146       if (haveF2orF3(pfx)) goto decode_failure;
   18147       delta = dis_op_imm_A( 1, True, Iop_Add8, True, delta, "adc" );
   18148       return delta;
   18149    case 0x15: /* ADC Iv, eAX */
   18150       if (haveF2orF3(pfx)) goto decode_failure;
   18151       delta = dis_op_imm_A( sz, True, Iop_Add8, True, delta, "adc" );
   18152       return delta;
   18153 
   18154    case 0x18: /* SBB Gb,Eb */
   18155       if (haveF2orF3(pfx)) goto decode_failure;
   18156       delta = dis_op2_G_E ( vbi, pfx, True, Iop_Sub8, True, 1, delta, "sbb" );
   18157       return delta;
   18158    case 0x19: /* SBB Gv,Ev */
   18159       if (haveF2orF3(pfx)) goto decode_failure;
   18160       delta = dis_op2_G_E ( vbi, pfx, True, Iop_Sub8, True, sz, delta, "sbb" );
   18161       return delta;
   18162 
   18163    case 0x1A: /* SBB Eb,Gb */
   18164       if (haveF2orF3(pfx)) goto decode_failure;
   18165       delta = dis_op2_E_G ( vbi, pfx, True, Iop_Sub8, True, 1, delta, "sbb" );
   18166       return delta;
   18167    case 0x1B: /* SBB Ev,Gv */
   18168       if (haveF2orF3(pfx)) goto decode_failure;
   18169       delta = dis_op2_E_G ( vbi, pfx, True, Iop_Sub8, True, sz, delta, "sbb" );
   18170       return delta;
   18171 
   18172    case 0x1C: /* SBB Ib, AL */
   18173       if (haveF2orF3(pfx)) goto decode_failure;
   18174       delta = dis_op_imm_A( 1, True, Iop_Sub8, True, delta, "sbb" );
   18175       return delta;
   18176    case 0x1D: /* SBB Iv, eAX */
   18177       if (haveF2orF3(pfx)) goto decode_failure;
   18178       delta = dis_op_imm_A( sz, True, Iop_Sub8, True, delta, "sbb" );
   18179       return delta;
   18180 
   18181    case 0x20: /* AND Gb,Eb */
   18182       if (haveF2orF3(pfx)) goto decode_failure;
   18183       delta = dis_op2_G_E ( vbi, pfx, False, Iop_And8, True, 1, delta, "and" );
   18184       return delta;
   18185    case 0x21: /* AND Gv,Ev */
   18186       if (haveF2orF3(pfx)) goto decode_failure;
   18187       delta = dis_op2_G_E ( vbi, pfx, False, Iop_And8, True, sz, delta, "and" );
   18188       return delta;
   18189 
   18190    case 0x22: /* AND Eb,Gb */
   18191       if (haveF2orF3(pfx)) goto decode_failure;
   18192       delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, True, 1, delta, "and" );
   18193       return delta;
   18194    case 0x23: /* AND Ev,Gv */
   18195       if (haveF2orF3(pfx)) goto decode_failure;
   18196       delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, True, sz, delta, "and" );
   18197       return delta;
   18198 
   18199    case 0x24: /* AND Ib, AL */
   18200       if (haveF2orF3(pfx)) goto decode_failure;
   18201       delta = dis_op_imm_A( 1, False, Iop_And8, True, delta, "and" );
   18202       return delta;
   18203    case 0x25: /* AND Iv, eAX */
   18204       if (haveF2orF3(pfx)) goto decode_failure;
   18205       delta = dis_op_imm_A( sz, False, Iop_And8, True, delta, "and" );
   18206       return delta;
   18207 
   18208    case 0x28: /* SUB Gb,Eb */
   18209       if (haveF2orF3(pfx)) goto decode_failure;
   18210       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, True, 1, delta, "sub" );
   18211       return delta;
   18212    case 0x29: /* SUB Gv,Ev */
   18213       if (haveF2orF3(pfx)) goto decode_failure;
   18214       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, True, sz, delta, "sub" );
   18215       return delta;
   18216 
   18217    case 0x2A: /* SUB Eb,Gb */
   18218       if (haveF2orF3(pfx)) goto decode_failure;
   18219       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, True, 1, delta, "sub" );
   18220       return delta;
   18221    case 0x2B: /* SUB Ev,Gv */
   18222       if (haveF2orF3(pfx)) goto decode_failure;
   18223       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, True, sz, delta, "sub" );
   18224       return delta;
   18225 
   18226    case 0x2C: /* SUB Ib, AL */
   18227       if (haveF2orF3(pfx)) goto decode_failure;
   18228       delta = dis_op_imm_A(1, False, Iop_Sub8, True, delta, "sub" );
   18229       return delta;
   18230 
   18231    case 0x2D: /* SUB Iv, eAX */
   18232       if (haveF2orF3(pfx)) goto decode_failure;
   18233       delta = dis_op_imm_A( sz, False, Iop_Sub8, True, delta, "sub" );
   18234       return delta;
   18235 
   18236    case 0x30: /* XOR Gb,Eb */
   18237       if (haveF2orF3(pfx)) goto decode_failure;
   18238       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Xor8, True, 1, delta, "xor" );
   18239       return delta;
   18240    case 0x31: /* XOR Gv,Ev */
   18241       if (haveF2orF3(pfx)) goto decode_failure;
   18242       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Xor8, True, sz, delta, "xor" );
   18243       return delta;
   18244 
   18245    case 0x32: /* XOR Eb,Gb */
   18246       if (haveF2orF3(pfx)) goto decode_failure;
   18247       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Xor8, True, 1, delta, "xor" );
   18248       return delta;
   18249    case 0x33: /* XOR Ev,Gv */
   18250       if (haveF2orF3(pfx)) goto decode_failure;
   18251       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Xor8, True, sz, delta, "xor" );
   18252       return delta;
   18253 
   18254    case 0x34: /* XOR Ib, AL */
   18255       if (haveF2orF3(pfx)) goto decode_failure;
   18256       delta = dis_op_imm_A( 1, False, Iop_Xor8, True, delta, "xor" );
   18257       return delta;
   18258    case 0x35: /* XOR Iv, eAX */
   18259       if (haveF2orF3(pfx)) goto decode_failure;
   18260       delta = dis_op_imm_A( sz, False, Iop_Xor8, True, delta, "xor" );
   18261       return delta;
   18262 
   18263    case 0x38: /* CMP Gb,Eb */
   18264       if (haveF2orF3(pfx)) goto decode_failure;
   18265       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, False, 1, delta, "cmp" );
   18266       return delta;
   18267    case 0x39: /* CMP Gv,Ev */
   18268       if (haveF2orF3(pfx)) goto decode_failure;
   18269       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, False, sz, delta, "cmp" );
   18270       return delta;
   18271 
   18272    case 0x3A: /* CMP Eb,Gb */
   18273       if (haveF2orF3(pfx)) goto decode_failure;
   18274       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, False, 1, delta, "cmp" );
   18275       return delta;
   18276    case 0x3B: /* CMP Ev,Gv */
   18277       if (haveF2orF3(pfx)) goto decode_failure;
   18278       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, False, sz, delta, "cmp" );
   18279       return delta;
   18280 
   18281    case 0x3C: /* CMP Ib, AL */
   18282       if (haveF2orF3(pfx)) goto decode_failure;
   18283       delta = dis_op_imm_A( 1, False, Iop_Sub8, False, delta, "cmp" );
   18284       return delta;
   18285    case 0x3D: /* CMP Iv, eAX */
   18286       if (haveF2orF3(pfx)) goto decode_failure;
   18287       delta = dis_op_imm_A( sz, False, Iop_Sub8, False, delta, "cmp" );
   18288       return delta;
   18289 
   18290    case 0x50: /* PUSH eAX */
   18291    case 0x51: /* PUSH eCX */
   18292    case 0x52: /* PUSH eDX */
   18293    case 0x53: /* PUSH eBX */
   18294    case 0x55: /* PUSH eBP */
   18295    case 0x56: /* PUSH eSI */
   18296    case 0x57: /* PUSH eDI */
   18297    case 0x54: /* PUSH eSP */
   18298       /* This is the Right Way, in that the value to be pushed is
   18299          established before %rsp is changed, so that pushq %rsp
   18300          correctly pushes the old value. */
   18301       if (haveF2orF3(pfx)) goto decode_failure;
   18302       vassert(sz == 2 || sz == 4 || sz == 8);
   18303       if (sz == 4)
   18304          sz = 8; /* there is no encoding for 32-bit push in 64-bit mode */
   18305       ty = sz==2 ? Ity_I16 : Ity_I64;
   18306       t1 = newTemp(ty);
   18307       t2 = newTemp(Ity_I64);
   18308       assign(t1, getIRegRexB(sz, pfx, opc-0x50));
   18309       assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(sz)));
   18310       putIReg64(R_RSP, mkexpr(t2) );
   18311       storeLE(mkexpr(t2),mkexpr(t1));
   18312       DIP("push%c %s\n", nameISize(sz), nameIRegRexB(sz,pfx,opc-0x50));
   18313       return delta;
   18314 
   18315    case 0x58: /* POP eAX */
   18316    case 0x59: /* POP eCX */
   18317    case 0x5A: /* POP eDX */
   18318    case 0x5B: /* POP eBX */
   18319    case 0x5D: /* POP eBP */
   18320    case 0x5E: /* POP eSI */
   18321    case 0x5F: /* POP eDI */
   18322    case 0x5C: /* POP eSP */
   18323       if (haveF2orF3(pfx)) goto decode_failure;
   18324       vassert(sz == 2 || sz == 4 || sz == 8);
   18325       if (sz == 4)
   18326          sz = 8; /* there is no encoding for 32-bit pop in 64-bit mode */
   18327       t1 = newTemp(szToITy(sz));
   18328       t2 = newTemp(Ity_I64);
   18329       assign(t2, getIReg64(R_RSP));
   18330       assign(t1, loadLE(szToITy(sz),mkexpr(t2)));
   18331       putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t2), mkU64(sz)));
   18332       putIRegRexB(sz, pfx, opc-0x58, mkexpr(t1));
   18333       DIP("pop%c %s\n", nameISize(sz), nameIRegRexB(sz,pfx,opc-0x58));
   18334       return delta;
   18335 
   18336    case 0x63: /* MOVSX */
   18337       if (haveF2orF3(pfx)) goto decode_failure;
   18338       if (haveREX(pfx) && 1==getRexW(pfx)) {
   18339          vassert(sz == 8);
   18340          /* movsx r/m32 to r64 */
   18341          modrm = getUChar(delta);
   18342          if (epartIsReg(modrm)) {
   18343             delta++;
   18344             putIRegG(8, pfx, modrm,
   18345                              unop(Iop_32Sto64,
   18346                                   getIRegE(4, pfx, modrm)));
   18347             DIP("movslq %s,%s\n",
   18348                 nameIRegE(4, pfx, modrm),
   18349                 nameIRegG(8, pfx, modrm));
   18350             return delta;
   18351          } else {
   18352             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   18353             delta += alen;
   18354             putIRegG(8, pfx, modrm,
   18355                              unop(Iop_32Sto64,
   18356                                   loadLE(Ity_I32, mkexpr(addr))));
   18357             DIP("movslq %s,%s\n", dis_buf,
   18358                 nameIRegG(8, pfx, modrm));
   18359             return delta;
   18360          }
   18361       } else {
   18362          goto decode_failure;
   18363       }
   18364 
   18365    case 0x68: /* PUSH Iv */
   18366       if (haveF2orF3(pfx)) goto decode_failure;
   18367       /* Note, sz==4 is not possible in 64-bit mode.  Hence ... */
   18368       if (sz == 4) sz = 8;
   18369       d64 = getSDisp(imin(4,sz),delta);
   18370       delta += imin(4,sz);
   18371       goto do_push_I;
   18372 
   18373    case 0x69: /* IMUL Iv, Ev, Gv */
   18374       if (haveF2orF3(pfx)) goto decode_failure;
   18375       delta = dis_imul_I_E_G ( vbi, pfx, sz, delta, sz );
   18376       return delta;
   18377 
   18378    case 0x6A: /* PUSH Ib, sign-extended to sz */
   18379       if (haveF2orF3(pfx)) goto decode_failure;
   18380       /* Note, sz==4 is not possible in 64-bit mode.  Hence ... */
   18381       if (sz == 4) sz = 8;
   18382       d64 = getSDisp8(delta); delta += 1;
   18383       goto do_push_I;
   18384    do_push_I:
   18385       ty = szToITy(sz);
   18386       t1 = newTemp(Ity_I64);
   18387       t2 = newTemp(ty);
   18388       assign( t1, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
   18389       putIReg64(R_RSP, mkexpr(t1) );
   18390       /* stop mkU16 asserting if d32 is a negative 16-bit number
   18391       /* stop mkU16 asserting if d64 is a negative 16-bit number
   18392       if (ty == Ity_I16)
   18393          d64 &= 0xFFFF;
   18394       storeLE( mkexpr(t1), mkU(ty,d64) );
   18395       DIP("push%c $%lld\n", nameISize(sz), (Long)d64);
   18396       return delta;
   18397 
   18398    case 0x6B: /* IMUL Ib, Ev, Gv */
   18399       delta = dis_imul_I_E_G ( vbi, pfx, sz, delta, 1 );
   18400       return delta;
   18401 
   18402    case 0x70:
   18403    case 0x71:
   18404    case 0x72:   /* JBb/JNAEb (jump below) */
   18405    case 0x73:   /* JNBb/JAEb (jump not below) */
   18406    case 0x74:   /* JZb/JEb (jump zero) */
   18407    case 0x75:   /* JNZb/JNEb (jump not zero) */
   18408    case 0x76:   /* JBEb/JNAb (jump below or equal) */
   18409    case 0x77:   /* JNBEb/JAb (jump not below or equal) */
   18410    case 0x78:   /* JSb (jump negative) */
   18411    case 0x79:   /* JNSb (jump not negative) */
   18412    case 0x7A:   /* JP (jump parity even) */
   18413    case 0x7B:   /* JNP/JPO (jump parity odd) */
   18414    case 0x7C:   /* JLb/JNGEb (jump less) */
   18415    case 0x7D:   /* JGEb/JNLb (jump greater or equal) */
   18416    case 0x7E:   /* JLEb/JNGb (jump less or equal) */
   18417    case 0x7F: { /* JGb/JNLEb (jump greater) */
   18418       Long   jmpDelta;
   18419       HChar* comment  = "";
   18420       if (haveF2orF3(pfx)) goto decode_failure;
   18421       jmpDelta = getSDisp8(delta);
   18422       vassert(-128 <= jmpDelta && jmpDelta < 128);
   18423       d64 = (guest_RIP_bbstart+delta+1) + jmpDelta;
   18424       delta++;
   18425       if (resteerCisOk
   18426           && vex_control.guest_chase_cond
   18427           && (Addr64)d64 != (Addr64)guest_RIP_bbstart
   18428           && jmpDelta < 0
   18429           && resteerOkFn( callback_opaque, d64) ) {
   18430          /* Speculation: assume this backward branch is taken.  So we
   18431             need to emit a side-exit to the insn following this one,
   18432             on the negation of the condition, and continue at the
   18433             branch target address (d64).  If we wind up back at the
   18434             first instruction of the trace, just stop; it's better to
   18435             let the IR loop unroller handle that case. */
   18436          stmt( IRStmt_Exit(
   18437                   mk_amd64g_calculate_condition(
   18438                      (AMD64Condcode)(1 ^ (opc - 0x70))),
   18439                   Ijk_Boring,
   18440                   IRConst_U64(guest_RIP_bbstart+delta),
   18441                   OFFB_RIP ) );
   18442          dres->whatNext   = Dis_ResteerC;
   18443          dres->continueAt = d64;
   18444          comment = "(assumed taken)";
   18445       }
   18446       else
   18447       if (resteerCisOk
   18448           && vex_control.guest_chase_cond
   18449           && (Addr64)d64 != (Addr64)guest_RIP_bbstart
   18450           && jmpDelta >= 0
   18451           && resteerOkFn( callback_opaque, guest_RIP_bbstart+delta ) ) {
   18452          /* Speculation: assume this forward branch is not taken.  So
   18453             we need to emit a side-exit to d64 (the dest) and continue
   18454             disassembling at the insn immediately following this
   18455             one. */
   18456          stmt( IRStmt_Exit(
   18457                   mk_amd64g_calculate_condition((AMD64Condcode)(opc - 0x70)),
   18458                   Ijk_Boring,
   18459                   IRConst_U64(d64),
   18460                   OFFB_RIP ) );
   18461          dres->whatNext   = Dis_ResteerC;
   18462          dres->continueAt = guest_RIP_bbstart+delta;
   18463          comment = "(assumed not taken)";
   18464       }
   18465       else {
   18466          /* Conservative default translation - end the block at this
   18467             point. */
   18468          jcc_01( dres, (AMD64Condcode)(opc - 0x70),
   18469                  guest_RIP_bbstart+delta, d64 );
   18470          vassert(dres->whatNext == Dis_StopHere);
   18471       }
   18472       DIP("j%s-8 0x%llx %s\n", name_AMD64Condcode(opc - 0x70), d64, comment);
   18473       return delta;
   18474    }
   18475 
   18476    case 0x80: /* Grp1 Ib,Eb */
   18477       if (haveF2orF3(pfx)) goto decode_failure;
   18478       modrm = getUChar(delta);
   18479       am_sz = lengthAMode(pfx,delta);
   18480       sz    = 1;
   18481       d_sz  = 1;
   18482       d64   = getSDisp8(delta + am_sz);
   18483       delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
   18484       return delta;
   18485 
   18486    case 0x81: /* Grp1 Iv,Ev */
   18487       if (haveF2orF3(pfx)) goto decode_failure;
   18488       modrm = getUChar(delta);
   18489       am_sz = lengthAMode(pfx,delta);
   18490       d_sz  = imin(sz,4);
   18491       d64   = getSDisp(d_sz, delta + am_sz);
   18492       delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
   18493       return delta;
   18494 
   18495    case 0x83: /* Grp1 Ib,Ev */
   18496       if (haveF2orF3(pfx)) goto decode_failure;
   18497       modrm = getUChar(delta);
   18498       am_sz = lengthAMode(pfx,delta);
   18499       d_sz  = 1;
   18500       d64   = getSDisp8(delta + am_sz);
   18501       delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
   18502       return delta;
   18503 
   18504    case 0x84: /* TEST Eb,Gb */
   18505       if (haveF2orF3(pfx)) goto decode_failure;
   18506       delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, False, 1, delta, "test" );
   18507       return delta;
   18508 
   18509    case 0x85: /* TEST Ev,Gv */
   18510       if (haveF2orF3(pfx)) goto decode_failure;
   18511       delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, False, sz, delta, "test" );
   18512       return delta;
   18513 
   18514    /* XCHG reg,mem automatically asserts LOCK# even without a LOCK
   18515       prefix.  Therefore, surround it with a IRStmt_MBE(Imbe_BusLock)
   18516       and IRStmt_MBE(Imbe_BusUnlock) pair.  But be careful; if it is
   18517       used with an explicit LOCK prefix, we don't want to end up with
   18518       two IRStmt_MBE(Imbe_BusLock)s -- one made here and one made by
   18519       the generic LOCK logic at the top of disInstr. */
   18520    case 0x86: /* XCHG Gb,Eb */
   18521       sz = 1;
   18522       /* Fall through ... */
   18523    case 0x87: /* XCHG Gv,Ev */
   18524       if (haveF2orF3(pfx)) goto decode_failure;
   18525       modrm = getUChar(delta);
   18526       ty = szToITy(sz);
   18527       t1 = newTemp(ty); t2 = newTemp(ty);
   18528       if (epartIsReg(modrm)) {
   18529          assign(t1, getIRegE(sz, pfx, modrm));
   18530          assign(t2, getIRegG(sz, pfx, modrm));
   18531          putIRegG(sz, pfx, modrm, mkexpr(t1));
   18532          putIRegE(sz, pfx, modrm, mkexpr(t2));
   18533          delta++;
   18534          DIP("xchg%c %s, %s\n",
   18535              nameISize(sz), nameIRegG(sz, pfx, modrm),
   18536                             nameIRegE(sz, pfx, modrm));
   18537       } else {
   18538          *expect_CAS = True;
   18539          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   18540          assign( t1, loadLE(ty, mkexpr(addr)) );
   18541          assign( t2, getIRegG(sz, pfx, modrm) );
   18542          casLE( mkexpr(addr),
   18543                 mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
   18544          putIRegG( sz, pfx, modrm, mkexpr(t1) );
   18545          delta += alen;
   18546          DIP("xchg%c %s, %s\n", nameISize(sz),
   18547                                 nameIRegG(sz, pfx, modrm), dis_buf);
   18548       }
   18549       return delta;
   18550 
   case 0x88: /* MOV Gb,Eb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_mov_G_E(vbi, pfx, 1, delta);
      return delta;

   case 0x89: /* MOV Gv,Ev */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_mov_G_E(vbi, pfx, sz, delta);
      return delta;

   case 0x8A: /* MOV Eb,Gb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_mov_E_G(vbi, pfx, 1, delta);
      return delta;

   case 0x8B: /* MOV Ev,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_mov_E_G(vbi, pfx, sz, delta);
      return delta;

   case 0x8D: /* LEA M,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      if (sz != 4 && sz != 8)
         goto decode_failure;
      modrm = getUChar(delta);
      /* LEA requires a memory operand; the register form is invalid. */
      if (epartIsReg(modrm))
         goto decode_failure;
      /* NOTE!  this is the one place where a segment override prefix
         has no effect on the address calculation.  Therefore we clear
         any segment override bits in pfx. */
      addr = disAMode ( &alen, vbi, clearSegBits(pfx), delta, dis_buf, 0 );
      delta += alen;
      /* This is a hack.  But it isn't clear that really doing the
         calculation at 32 bits is really worth it.  Hence for leal,
         do the full 64-bit calculation and then truncate it. */
      putIRegG( sz, pfx, modrm,
                         sz == 4
                            ? unop(Iop_64to32, mkexpr(addr))
                            : mkexpr(addr)
              );
      DIP("lea%c %s, %s\n", nameISize(sz), dis_buf,
                            nameIRegG(sz,pfx,modrm));
      return delta;
   18594 
   case 0x8F: { /* POPQ m64 / POPW m16 */
      Int   len;
      UChar rm;
      /* There is no encoding for 32-bit pop in 64-bit mode.
         So sz==4 actually means sz==8. */
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 2 || sz == 4
              || /* tolerate redundant REX.W, see #210481 */ sz == 8);
      if (sz == 4) sz = 8;
      if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists

      rm = getUChar(delta);

      /* make sure this instruction is correct POP: 8F /0 only --
         memory operand, reg field zero */
      if (epartIsReg(rm) || gregLO3ofRM(rm) != 0)
         goto decode_failure;
      /* and has correct size */
      vassert(sz == 8);

      t1 = newTemp(Ity_I64);
      t3 = newTemp(Ity_I64);
      assign( t1, getIReg64(R_RSP) );
      assign( t3, loadLE(Ity_I64, mkexpr(t1)) );

      /* Increase RSP; must be done before the STORE.  Intel manual
         says: If the RSP register is used as a base register for
         addressing a destination operand in memory, the POP
         instruction computes the effective address of the operand
         after it increments the RSP register.  */
      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t1), mkU64(sz)) );

      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
      storeLE( mkexpr(addr), mkexpr(t3) );

      DIP("popl %s\n", dis_buf);

      delta += len;
      return delta;
   }
   18634 
   case 0x90: /* XCHG eAX,eAX */
      /* detect and handle F3 90 (rep nop) specially */
      if (!have66(pfx) && !haveF2(pfx) && haveF3(pfx)) {
         DIP("rep nop (P4 pause)\n");
         /* "observe" the hint.  The Vex client needs to be careful not
            to cause very long delays as a result, though. */
         jmp_lit(dres, Ijk_Yield, guest_RIP_bbstart+delta);
         vassert(dres->whatNext == Dis_StopHere);
         return delta;
      }
      /* detect and handle NOPs specially */
      if (/* F2/F3 probably change meaning completely */
          !haveF2orF3(pfx)
          /* If REX.B is 1, we're not exchanging rAX with itself */
          && getRexB(pfx)==0 ) {
         DIP("nop\n");
         return delta;
      }
      /* else fall through to normal case. */
   case 0x91: /* XCHG rAX,rCX */
   case 0x92: /* XCHG rAX,rDX */
   case 0x93: /* XCHG rAX,rBX */
   case 0x94: /* XCHG rAX,rSP */
   case 0x95: /* XCHG rAX,rBP */
   case 0x96: /* XCHG rAX,rSI */
   case 0x97: /* XCHG rAX,rDI */
      /* guard against mutancy */
      if (haveF2orF3(pfx)) goto decode_failure;
      /* Register index is encoded in the low 3 opcode bits; REX.B is
         applied inside codegen_xchg_rAX_Reg. */
      codegen_xchg_rAX_Reg ( pfx, sz, opc - 0x90 );
      return delta;
   18665 
   18666    case 0x98: /* CBW */
   18667       if (haveF2orF3(pfx)) goto decode_failure;
   18668       if (sz == 8) {
   18669          putIRegRAX( 8, unop(Iop_32Sto64, getIRegRAX(4)) );
   18670          DIP(/*"cdqe\n"*/"cltq");
   18671          return delta;
   18672       }
   18673       if (sz == 4) {
   18674          putIRegRAX( 4, unop(Iop_16Sto32, getIRegRAX(2)) );
   18675          DIP("cwtl\n");
   18676          return delta;
   18677       }
   18678       if (sz == 2) {
   18679          putIRegRAX( 2, unop(Iop_8Sto16, getIRegRAX(1)) );
   18680          DIP("cbw\n");
   18681          return delta;
   18682       }
   18683       goto decode_failure;
   18684 
   case 0x99: /* CWD/CDQ/CQO */
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 2 || sz == 4 || sz == 8);
      ty = szToITy(sz);
      /* rDX := sign of rAX: an arithmetic right shift of rAX by
         (width-1) replicates the sign bit across the whole register. */
      putIRegRDX( sz,
                  binop(mkSizedOp(ty,Iop_Sar8),
                        getIRegRAX(sz),
                        mkU8(sz == 2 ? 15 : (sz == 4 ? 31 : 63))) );
      DIP(sz == 2 ? "cwd\n"
                  : (sz == 4 ? /*"cdq\n"*/ "cltd\n"
                             : "cqo\n"));
      return delta;

   case 0x9B: /* FWAIT (X87 insn) */
      /* ignore? */
      DIP("fwait\n");
      return delta;
   18702 
   case 0x9C: /* PUSHF */ {
      /* Note.  There is no encoding for a 32-bit pushf in 64-bit
         mode.  So sz==4 actually means sz==8. */
      /* 24 July 06: has also been seen with a redundant REX prefix,
         so must also allow sz==8. */
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 2 || sz == 4 || sz == 8);
      if (sz == 4) sz = 8;
      if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists

      /* Make space on the stack first. */
      t1 = newTemp(Ity_I64);
      assign( t1, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
      putIReg64(R_RSP, mkexpr(t1) );

      /* t2 holds the OSZACP flags, recomputed from the flags thunk. */
      t2 = newTemp(Ity_I64);
      assign( t2, mk_amd64g_calculate_rflags_all() );

      /* Patch in the D flag.  This can simply be a copy of bit 10 of
         baseBlock[OFFB_DFLAG]. */
      t3 = newTemp(Ity_I64);
      assign( t3, binop(Iop_Or64,
                        mkexpr(t2),
                        binop(Iop_And64,
                              IRExpr_Get(OFFB_DFLAG,Ity_I64),
                              mkU64(1<<10)))
            );

      /* And patch in the ID flag (bit 21). */
      t4 = newTemp(Ity_I64);
      assign( t4, binop(Iop_Or64,
                        mkexpr(t3),
                        binop(Iop_And64,
                              binop(Iop_Shl64, IRExpr_Get(OFFB_IDFLAG,Ity_I64),
                                               mkU8(21)),
                              mkU64(1<<21)))
            );

      /* And patch in the AC flag too (bit 18). */
      t5 = newTemp(Ity_I64);
      assign( t5, binop(Iop_Or64,
                        mkexpr(t4),
                        binop(Iop_And64,
                              binop(Iop_Shl64, IRExpr_Get(OFFB_ACFLAG,Ity_I64),
                                               mkU8(18)),
                              mkU64(1<<18)))
            );

      /* if sz==2, the stored value needs to be narrowed. */
      if (sz == 2)
        storeLE( mkexpr(t1), unop(Iop_32to16,
                             unop(Iop_64to32,mkexpr(t5))) );
      else
        storeLE( mkexpr(t1), mkexpr(t5) );

      DIP("pushf%c\n", nameISize(sz));
      return delta;
   }
   18760 
   18761    case 0x9D: /* POPF */
   18762       /* Note.  There is no encoding for a 32-bit popf in 64-bit mode.
   18763          So sz==4 actually means sz==8. */
   18764       if (haveF2orF3(pfx)) goto decode_failure;
   18765       vassert(sz == 2 || sz == 4);
   18766       if (sz == 4) sz = 8;
   18767       if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists
   18768       t1 = newTemp(Ity_I64); t2 = newTemp(Ity_I64);
   18769       assign(t2, getIReg64(R_RSP));
   18770       assign(t1, widenUto64(loadLE(szToITy(sz),mkexpr(t2))));
   18771       putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t2), mkU64(sz)));
   18772       /* t1 is the flag word.  Mask out everything except OSZACP and
   18773          set the flags thunk to AMD64G_CC_OP_COPY. */
   18774       stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   18775       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   18776       stmt( IRStmt_Put( OFFB_CC_DEP1,
   18777                         binop(Iop_And64,
   18778                               mkexpr(t1),
   18779                               mkU64( AMD64G_CC_MASK_C | AMD64G_CC_MASK_P
   18780                                      | AMD64G_CC_MASK_A | AMD64G_CC_MASK_Z
   18781                                      | AMD64G_CC_MASK_S| AMD64G_CC_MASK_O )
   18782                              )
   18783                        )
   18784           );
   18785 
   18786       /* Also need to set the D flag, which is held in bit 10 of t1.
   18787          If zero, put 1 in OFFB_DFLAG, else -1 in OFFB_DFLAG. */
   18788       stmt( IRStmt_Put(
   18789                OFFB_DFLAG,
   18790                IRExpr_Mux0X(
   18791                   unop(Iop_32to8,
   18792                   unop(Iop_64to32,
   18793                        binop(Iop_And64,
   18794                              binop(Iop_Shr64, mkexpr(t1), mkU8(10)),
   18795                              mkU64(1)))),
   18796                   mkU64(1),
   18797                   mkU64(0xFFFFFFFFFFFFFFFFULL)))
   18798           );
   18799 
   18800       /* And set the ID flag */
   18801       stmt( IRStmt_Put(
   18802                OFFB_IDFLAG,
   18803                IRExpr_Mux0X(
   18804                   unop(Iop_32to8,
   18805                   unop(Iop_64to32,
   18806                        binop(Iop_And64,
   18807                              binop(Iop_Shr64, mkexpr(t1), mkU8(21)),
   18808                              mkU64(1)))),
   18809                   mkU64(0),
   18810                   mkU64(1)))
   18811           );
   18812 
   18813       /* And set the AC flag too */
   18814       stmt( IRStmt_Put(
   18815                OFFB_ACFLAG,
   18816                IRExpr_Mux0X(
   18817                   unop(Iop_32to8,
   18818                   unop(Iop_64to32,
   18819                        binop(Iop_And64,
   18820                              binop(Iop_Shr64, mkexpr(t1), mkU8(18)),
   18821                              mkU64(1)))),
   18822                   mkU64(0),
   18823                   mkU64(1)))
   18824           );
   18825 
   18826       DIP("popf%c\n", nameISize(sz));
   18827       return delta;
   18828 
   case 0x9E: /* SAHF */
      codegen_SAHF();
      DIP("sahf\n");
      return delta;

   case 0x9F: /* LAHF */
      codegen_LAHF();
      DIP("lahf\n");
      return delta;

   case 0xA0: /* MOV Ob,AL */
      if (have66orF2orF3(pfx)) goto decode_failure;
      sz = 1;
      /* Fall through ... */
   case 0xA1: /* MOV Ov,eAX */
      if (sz != 8 && sz != 4 && sz != 2 && sz != 1)
         goto decode_failure;
      /* Load rAX from an absolute 64-bit immediate ("moffs")
         address; only a segment override can modify the address. */
      d64 = getDisp64(delta);
      delta += 8;
      ty = szToITy(sz);
      addr = newTemp(Ity_I64);
      assign( addr, handleAddrOverrides(vbi, pfx, mkU64(d64)) );
      putIRegRAX(sz, loadLE( ty, mkexpr(addr) ));
      DIP("mov%c %s0x%llx, %s\n", nameISize(sz),
                                  segRegTxt(pfx), d64,
                                  nameIRegRAX(sz));
      return delta;

   case 0xA2: /* MOV AL,Ob */
      if (have66orF2orF3(pfx)) goto decode_failure;
      sz = 1;
      /* Fall through ... */
   case 0xA3: /* MOV eAX,Ov */
      if (sz != 8 && sz != 4 && sz != 2 && sz != 1)
         goto decode_failure;
      /* Store rAX to an absolute 64-bit immediate ("moffs") address. */
      d64 = getDisp64(delta);
      delta += 8;
      ty = szToITy(sz);
      addr = newTemp(Ity_I64);
      assign( addr, handleAddrOverrides(vbi, pfx, mkU64(d64)) );
      storeLE( mkexpr(addr), getIRegRAX(sz) );
      DIP("mov%c %s, %s0x%llx\n", nameISize(sz), nameIRegRAX(sz),
                                  segRegTxt(pfx), d64);
      return delta;
   18873 
   case 0xA4:
   case 0xA5:
      /* F3 A4: rep movsb */
      if (haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xA4)
            sz = 1;
         dis_REP_op ( dres, AMD64CondAlways, dis_MOVS, sz,
                      guest_RIP_curr_instr,
                      guest_RIP_bbstart+delta, "rep movs", pfx );
        dres->whatNext = Dis_StopHere;
        return delta;
      }
      /* A4: movsb */
      if (!haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xA4)
            sz = 1;
         dis_string_op( dis_MOVS, sz, "movs", pfx );
         return delta;
      }
      goto decode_failure;

   case 0xA6:
   case 0xA7:
      /* F3 A6/A7: repe cmps/rep cmps{w,l,q} */
      if (haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xA6)
            sz = 1;
         dis_REP_op ( dres, AMD64CondZ, dis_CMPS, sz,
                      guest_RIP_curr_instr,
                      guest_RIP_bbstart+delta, "repe cmps", pfx );
         dres->whatNext = Dis_StopHere;
         return delta;
      }
      /* Non-REP cmps is not handled here. */
      goto decode_failure;

   case 0xAA:
   case 0xAB:
      /* F3 AA/AB: rep stosb/rep stos{w,l,q} */
      if (haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xAA)
            sz = 1;
         dis_REP_op ( dres, AMD64CondAlways, dis_STOS, sz,
                      guest_RIP_curr_instr,
                      guest_RIP_bbstart+delta, "rep stos", pfx );
         vassert(dres->whatNext == Dis_StopHere);
         return delta;
      }
      /* AA/AB: stosb/stos{w,l,q} */
      if (!haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xAA)
            sz = 1;
         dis_string_op( dis_STOS, sz, "stos", pfx );
         return delta;
      }
      goto decode_failure;

   case 0xA8: /* TEST Ib, AL */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( 1, False, Iop_And8, False, delta, "test" );
      return delta;
   case 0xA9: /* TEST Iv, eAX */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( sz, False, Iop_And8, False, delta, "test" );
      return delta;

   case 0xAC: /* LODS, no REP prefix */
   case 0xAD:
      /* NOTE(review): unlike the other string ops above, there is no
         F2/F3 prefix check here, so F3 AC would be decoded as a plain
         lods -- confirm this is intended. */
      dis_string_op( dis_LODS, ( opc == 0xAC ? 1 : sz ), "lods", pfx );
      return delta;

   case 0xAE:
   case 0xAF:
      /* F2 AE/AF: repne scasb/repne scas{w,l,q} */
      if (haveF2(pfx) && !haveF3(pfx)) {
         if (opc == 0xAE)
            sz = 1;
         dis_REP_op ( dres, AMD64CondNZ, dis_SCAS, sz,
                      guest_RIP_curr_instr,
                      guest_RIP_bbstart+delta, "repne scas", pfx );
         vassert(dres->whatNext == Dis_StopHere);
         return delta;
      }
      /* F3 AE/AF: repe scasb/repe scas{w,l,q} */
      if (!haveF2(pfx) && haveF3(pfx)) {
         if (opc == 0xAE)
            sz = 1;
         dis_REP_op ( dres, AMD64CondZ, dis_SCAS, sz,
                      guest_RIP_curr_instr,
                      guest_RIP_bbstart+delta, "repe scas", pfx );
         vassert(dres->whatNext == Dis_StopHere);
         return delta;
      }
      /* AE/AF: scasb/scas{w,l,q} */
      if (!haveF2(pfx) && !haveF3(pfx)) {
         if (opc == 0xAE)
            sz = 1;
         dis_string_op( dis_SCAS, sz, "scas", pfx );
         return delta;
      }
      goto decode_failure;
   18974 
   /* XXXX be careful here with moves to AH/BH/CH/DH */
   case 0xB0: /* MOV imm,AL */
   case 0xB1: /* MOV imm,CL */
   case 0xB2: /* MOV imm,DL */
   case 0xB3: /* MOV imm,BL */
   case 0xB4: /* MOV imm,AH */
   case 0xB5: /* MOV imm,CH */
   case 0xB6: /* MOV imm,DH */
   case 0xB7: /* MOV imm,BH */
      if (haveF2orF3(pfx)) goto decode_failure;
      /* Target byte register index is in the low 3 opcode bits,
         extended by REX.B inside putIRegRexB. */
      d64 = getUChar(delta);
      delta += 1;
      putIRegRexB(1, pfx, opc-0xB0, mkU8(d64));
      DIP("movb $%lld,%s\n", d64, nameIRegRexB(1,pfx,opc-0xB0));
      return delta;

   case 0xB8: /* MOV imm,eAX */
   case 0xB9: /* MOV imm,eCX */
   case 0xBA: /* MOV imm,eDX */
   case 0xBB: /* MOV imm,eBX */
   case 0xBC: /* MOV imm,eSP */
   case 0xBD: /* MOV imm,eBP */
   case 0xBE: /* MOV imm,eSI */
   case 0xBF: /* MOV imm,eDI */
      /* This is the one-and-only place where 64-bit literals are
         allowed in the instruction stream. */
      if (haveF2orF3(pfx)) goto decode_failure;
      if (sz == 8) {
         d64 = getDisp64(delta);
         delta += 8;
         putIRegRexB(8, pfx, opc-0xB8, mkU64(d64));
         DIP("movabsq $%lld,%s\n", (Long)d64,
                                   nameIRegRexB(8,pfx,opc-0xB8));
      } else {
         /* 16/32-bit form: immediate is at most 4 bytes,
            sign-extended as needed. */
         d64 = getSDisp(imin(4,sz),delta);
         delta += imin(4,sz);
         putIRegRexB(sz, pfx, opc-0xB8,
                         mkU(szToITy(sz), d64 & mkSizeMask(sz)));
         DIP("mov%c $%lld,%s\n", nameISize(sz),
                                 (Long)d64,
                                 nameIRegRexB(sz,pfx,opc-0xB8));
      }
      return delta;
   19018 
   case 0xC0: { /* Grp2 Ib,Eb */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 1;
      /* shift amount is an 8-bit immediate following the amode */
      d64   = getUChar(delta + am_sz);
      sz    = 1;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         mkU8(d64 & 0xFF), NULL, &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }

   case 0xC1: { /* Grp2 Ib,Ev */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 1;
      d64   = getUChar(delta + am_sz);
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         mkU8(d64 & 0xFF), NULL, &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }

   case 0xC2: /* RET imm16 */
      if (have66orF2orF3(pfx)) goto decode_failure;
      d64 = getUDisp16(delta);
      delta += 2;
      dis_ret(dres, vbi, d64);
      DIP("ret $%lld\n", d64);
      return delta;

   case 0xC3: /* RET */
      if (have66orF2(pfx)) goto decode_failure;
      /* F3 is acceptable on AMD. */
      dis_ret(dres, vbi, 0);
      DIP(haveF3(pfx) ? "rep ; ret\n" : "ret\n");
      return delta;

   case 0xC6: /* MOV Ib,Eb */
      sz = 1;
      goto do_Mov_I_E;
   case 0xC7: /* MOV Iv,Ev */
      goto do_Mov_I_E;
   do_Mov_I_E:
      /* NOTE(review): the ModRM reg field (/r) is not checked here;
         only /0 is architecturally MOV -- confirm acceptable. */
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      if (epartIsReg(modrm)) {
         delta++; /* mod/rm byte */
         d64 = getSDisp(imin(4,sz),delta);
         delta += imin(4,sz);
         putIRegE(sz, pfx, modrm,
                      mkU(szToITy(sz), d64 & mkSizeMask(sz)));
         DIP("mov%c $%lld, %s\n", nameISize(sz),
                                  (Long)d64,
                                  nameIRegE(sz,pfx,modrm));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
                           /*xtra*/imin(4,sz) );
         delta += alen;
         d64 = getSDisp(imin(4,sz),delta);
         delta += imin(4,sz);
         storeLE(mkexpr(addr),
                 mkU(szToITy(sz), d64 & mkSizeMask(sz)));
         DIP("mov%c $%lld, %s\n", nameISize(sz), (Long)d64, dis_buf);
      }
      return delta;
   19089 
   case 0xC8: /* ENTER */
      /* Same comments re operand size as for LEAVE below apply.
         Also, only handles the case "enter $imm16, $0"; other cases
         for the second operand (nesting depth) are not handled. */
      if (sz != 4)
         goto decode_failure;
      d64 = getUDisp16(delta);
      delta += 2;
      vassert(d64 >= 0 && d64 <= 0xFFFF);
      /* second (nesting-depth) operand must be zero */
      if (getUChar(delta) != 0)
         goto decode_failure;
      delta++;
      /* Intel docs seem to suggest:
           push rbp
           temp = rsp
           rbp = temp
           rsp = rsp - imm16
      */
      t1 = newTemp(Ity_I64);
      assign(t1, getIReg64(R_RBP));
      t2 = newTemp(Ity_I64);
      assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
      putIReg64(R_RSP, mkexpr(t2));
      storeLE(mkexpr(t2), mkexpr(t1));
      putIReg64(R_RBP, mkexpr(t2));
      if (d64 > 0) {
         putIReg64(R_RSP, binop(Iop_Sub64, mkexpr(t2), mkU64(d64)));
      }
      DIP("enter $%u, $0\n", (UInt)d64);
      return delta;

   case 0xC9: /* LEAVE */
      /* In 64-bit mode this defaults to a 64-bit operand size.  There
         is no way to encode a 32-bit variant.  Hence sz==4 but we do
         it as if sz=8. */
      if (sz != 4)
         goto decode_failure;
      t1 = newTemp(Ity_I64);
      t2 = newTemp(Ity_I64);
      assign(t1, getIReg64(R_RBP));
      /* First PUT RSP looks redundant, but need it because RSP must
         always be up-to-date for Memcheck to work... */
      putIReg64(R_RSP, mkexpr(t1));
      assign(t2, loadLE(Ity_I64,mkexpr(t1)));
      putIReg64(R_RBP, mkexpr(t2));
      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t1), mkU64(8)) );
      DIP("leave\n");
      return delta;

   case 0xCC: /* INT 3 */
      /* Deliver SIGTRAP at the address of the next instruction. */
      jmp_lit(dres, Ijk_SigTRAP, guest_RIP_bbstart + delta);
      vassert(dres->whatNext == Dis_StopHere);
      DIP("int $0x3\n");
      return delta;
   19144 
   case 0xD0: { /* Grp2 1,Eb */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 0;
      /* implicit shift count of 1 */
      d64   = 1;
      sz    = 1;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         mkU8(d64), NULL, &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }

   case 0xD1: { /* Grp2 1,Ev */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 0;
      d64   = 1;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         mkU8(d64), NULL, &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }

   case 0xD2: { /* Grp2 CL,Eb */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 0;
      sz    = 1;
      /* shift count comes from %cl */
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         getIRegCL(), "%cl", &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }

   case 0xD3: { /* Grp2 CL,Ev */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 0;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         getIRegCL(), "%cl", &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }

   case 0xD8: /* X87 instructions */
   case 0xD9:
   case 0xDA:
   case 0xDB:
   case 0xDC:
   case 0xDD:
   case 0xDE:
   case 0xDF: {
      Bool redundantREXWok = False;

      if (haveF2orF3(pfx))
         goto decode_failure;

      /* kludge to tolerate redundant rex.w prefixes (should do this
         properly one day) */
      /* mono 1.1.18.1 produces 48 D9 FA, which is rex.w fsqrt */
      if ( (opc == 0xD9 && getUChar(delta+0) == 0xFA)/*fsqrt*/ )
         redundantREXWok = True;

      Bool size_OK = False;
      if ( sz == 4 )
         size_OK = True;
      else if ( sz == 8 )
         size_OK = redundantREXWok;
      else if ( sz == 2 ) {
         int mod_rm = getUChar(delta+0);
         int reg = gregLO3ofRM(mod_rm);
         /* The HotSpot JVM uses these */
         if ( (opc == 0xDD) && (reg == 0 /* FLDL   */ ||
                                reg == 4 /* FNSAVE */ ||
                                reg == 6 /* FRSTOR */ ) )
            size_OK = True;
      }
      /* AMD manual says 0x66 size override is ignored, except where
         it is meaningful */
      if (!size_OK)
         goto decode_failure;

      /* Hand off to the full x87 decoder. */
      Bool decode_OK = False;
      delta = dis_FPU ( &decode_OK, vbi, pfx, delta );
      if (!decode_OK)
         goto decode_failure;

      return delta;
   }
   19242 
   case 0xE0: /* LOOPNE disp8: decrement count, jump if count != 0 && ZF==0 */
   case 0xE1: /* LOOPE  disp8: decrement count, jump if count != 0 && ZF==1 */
   case 0xE2: /* LOOP   disp8: decrement count, jump if count != 0 */
    { /* The docs say this uses rCX as a count depending on the
         address size override, not the operand one. */
      IRExpr* zbit  = NULL;
      IRExpr* count = NULL;
      IRExpr* cond  = NULL;
      HChar*  xtra  = NULL;

      if (have66orF2orF3(pfx) || 1==getRexW(pfx)) goto decode_failure;
      /* So at this point we've rejected any variants which appear to
         be governed by the usual operand-size modifiers.  Hence only
         the address size prefix can have an effect.  It changes the
         size from 64 (default) to 32. */
      d64 = guest_RIP_bbstart+delta+1 + getSDisp8(delta);
      delta++;
      if (haveASO(pfx)) {
         /* 64to32 of 64-bit get is merely a get-put improvement
            trick. */
         putIReg32(R_RCX, binop(Iop_Sub32,
                                unop(Iop_64to32, getIReg64(R_RCX)),
                                mkU32(1)));
      } else {
         putIReg64(R_RCX, binop(Iop_Sub64, getIReg64(R_RCX), mkU64(1)));
      }

      /* This is correct, both for 32- and 64-bit versions.  If we're
         doing a 32-bit dec and the result is zero then the default
         zero extension rule will cause the upper 32 bits to be zero
         too.  Hence a 64-bit check against zero is OK. */
      count = getIReg64(R_RCX);
      cond = binop(Iop_CmpNE64, count, mkU64(0));
      switch (opc) {
         case 0xE2:
            xtra = "";
            break;
         case 0xE1:
            xtra = "e";
            zbit = mk_amd64g_calculate_condition( AMD64CondZ );
            cond = mkAnd1(cond, zbit);
            break;
         case 0xE0:
            xtra = "ne";
            zbit = mk_amd64g_calculate_condition( AMD64CondNZ );
            cond = mkAnd1(cond, zbit);
            break;
         default:
            vassert(0);
      }
      /* Conditional side-exit to the branch target; fall through to
         the next insn otherwise. */
      stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U64(d64), OFFB_RIP) );

      DIP("loop%s%s 0x%llx\n", xtra, haveASO(pfx) ? "l" : "", d64);
      return delta;
    }

   case 0xE3:
      /* JRCXZ or JECXZ, depending address size override. */
      if (have66orF2orF3(pfx)) goto decode_failure;
      d64 = (guest_RIP_bbstart+delta+1) + getSDisp8(delta);
      delta++;
      if (haveASO(pfx)) {
         /* 32-bit */
         stmt( IRStmt_Exit( binop(Iop_CmpEQ64,
                                  unop(Iop_32Uto64, getIReg32(R_RCX)),
                                  mkU64(0)),
                            Ijk_Boring,
                            IRConst_U64(d64),
                            OFFB_RIP
             ));
         DIP("jecxz 0x%llx\n", d64);
      } else {
         /* 64-bit */
         stmt( IRStmt_Exit( binop(Iop_CmpEQ64,
                                  getIReg64(R_RCX),
                                  mkU64(0)),
                            Ijk_Boring,
                            IRConst_U64(d64),
                            OFFB_RIP
               ));
         DIP("jrcxz 0x%llx\n", d64);
      }
      return delta;
   19326 
   /* The four IN forms all set up t1 = port number (as a 64-bit
      value) and sz = transfer width, then join at do_IN. */
   case 0xE4: /* IN imm8, AL */
      sz = 1;
      t1 = newTemp(Ity_I64);
      abyte = getUChar(delta); delta++;
      assign(t1, mkU64( abyte & 0xFF ));
      DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIRegRAX(sz));
      goto do_IN;
   case 0xE5: /* IN imm8, eAX */
      if (!(sz == 2 || sz == 4)) goto decode_failure;
      t1 = newTemp(Ity_I64);
      abyte = getUChar(delta); delta++;
      assign(t1, mkU64( abyte & 0xFF ));
      DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIRegRAX(sz));
      goto do_IN;
   case 0xEC: /* IN %DX, AL */
      sz = 1;
      t1 = newTemp(Ity_I64);
      assign(t1, unop(Iop_16Uto64, getIRegRDX(2)));
      DIP("in%c %s,%s\n", nameISize(sz), nameIRegRDX(2),
                                         nameIRegRAX(sz));
      goto do_IN;
   case 0xED: /* IN %DX, eAX */
      if (!(sz == 2 || sz == 4)) goto decode_failure;
      t1 = newTemp(Ity_I64);
      assign(t1, unop(Iop_16Uto64, getIRegRDX(2)));
      DIP("in%c %s,%s\n", nameISize(sz), nameIRegRDX(2),
                                         nameIRegRAX(sz));
      goto do_IN;
   do_IN: {
      /* At this point, sz indicates the width, and t1 is a 64-bit
         value giving port number. */
      IRDirty* d;
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 1 || sz == 2 || sz == 4);
      ty = szToITy(sz);
      t2 = newTemp(Ity_I64);
      /* The actual port read is done by a dirty helper, since it
         cannot be expressed in IR. */
      d = unsafeIRDirty_1_N(
             t2,
             0/*regparms*/,
             "amd64g_dirtyhelper_IN",
             &amd64g_dirtyhelper_IN,
             mkIRExprVec_2( mkexpr(t1), mkU64(sz) )
          );
      /* do the call, dumping the result in t2. */
      stmt( IRStmt_Dirty(d) );
      /* Narrow the 64-bit helper result to sz bytes and write it to
         AL/AX/EAX. */
      putIRegRAX(sz, narrowTo( ty, mkexpr(t2) ) );
      return delta;
   }
   19375 
   /* The four OUT forms mirror IN: t1 = port number (64-bit),
      sz = transfer width, joining at do_OUT. */
   case 0xE6: /* OUT AL, imm8 */
      sz = 1;
      t1 = newTemp(Ity_I64);
      abyte = getUChar(delta); delta++;
      assign( t1, mkU64( abyte & 0xFF ) );
      DIP("out%c %s,$%d\n", nameISize(sz), nameIRegRAX(sz), (Int)abyte);
      goto do_OUT;
   case 0xE7: /* OUT eAX, imm8 */
      if (!(sz == 2 || sz == 4)) goto decode_failure;
      t1 = newTemp(Ity_I64);
      abyte = getUChar(delta); delta++;
      assign( t1, mkU64( abyte & 0xFF ) );
      DIP("out%c %s,$%d\n", nameISize(sz), nameIRegRAX(sz), (Int)abyte);
      goto do_OUT;
   case 0xEE: /* OUT AL, %DX */
      sz = 1;
      t1 = newTemp(Ity_I64);
      assign( t1, unop(Iop_16Uto64, getIRegRDX(2)) );
      DIP("out%c %s,%s\n", nameISize(sz), nameIRegRAX(sz),
                                          nameIRegRDX(2));
      goto do_OUT;
   case 0xEF: /* OUT eAX, %DX */
      if (!(sz == 2 || sz == 4)) goto decode_failure;
      t1 = newTemp(Ity_I64);
      assign( t1, unop(Iop_16Uto64, getIRegRDX(2)) );
      DIP("out%c %s,%s\n", nameISize(sz), nameIRegRAX(sz),
                                          nameIRegRDX(2));
      goto do_OUT;
   do_OUT: {
      /* At this point, sz indicates the width, and t1 is a 64-bit
         value giving port number. */
      IRDirty* d;
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 1 || sz == 2 || sz == 4);
      ty = szToITy(sz);
      /* Port write via dirty helper; passes port, the (widened)
         AL/AX/EAX value, and the width.  No result. */
      d = unsafeIRDirty_0_N(
             0/*regparms*/,
             "amd64g_dirtyhelper_OUT",
             &amd64g_dirtyhelper_OUT,
             mkIRExprVec_3( mkexpr(t1),
                            widenUto64( getIRegRAX(sz) ),
                            mkU64(sz) )
          );
      stmt( IRStmt_Dirty(d) );
      return delta;
   }
   19422 
   case 0xE8: /* CALL J4 */
      if (haveF2orF3(pfx)) goto decode_failure;
      d64 = getSDisp32(delta); delta += 4;
      d64 += (guest_RIP_bbstart+delta);
      /* (guest_RIP_bbstart+delta) == return-to addr, d64 == call-to addr */
      /* Push the return address: RSP -= 8, then store. */
      t1 = newTemp(Ity_I64);
      assign(t1, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
      putIReg64(R_RSP, mkexpr(t1));
      storeLE( mkexpr(t1), mkU64(guest_RIP_bbstart+delta));
      t2 = newTemp(Ity_I64);
      assign(t2, mkU64((Addr64)d64));
      /* Tell the tool layer the stack red zone has moved. */
      make_redzone_AbiHint(vbi, t1, t2/*nia*/, "call-d32");
      if (resteerOkFn( callback_opaque, (Addr64)d64) ) {
         /* follow into the call target. */
         dres->whatNext   = Dis_ResteerU;
         dres->continueAt = d64;
      } else {
         jmp_lit(dres, Ijk_Call, d64);
         vassert(dres->whatNext == Dis_StopHere);
      }
      DIP("call 0x%llx\n",d64);
      return delta;

   case 0xE9: /* Jv (jump, 16/32 offset) */
      if (haveF2orF3(pfx)) goto decode_failure;
      if (sz != 4)
         goto decode_failure; /* JRS added 2004 July 11 */
      d64 = (guest_RIP_bbstart+delta+sz) + getSDisp(sz,delta);
      delta += sz;
      /* Unconditional jump: either chase into the target or end the
         block with a literal jump. */
      if (resteerOkFn(callback_opaque,d64)) {
         dres->whatNext   = Dis_ResteerU;
         dres->continueAt = d64;
      } else {
         jmp_lit(dres, Ijk_Boring, d64);
         vassert(dres->whatNext == Dis_StopHere);
      }
      DIP("jmp 0x%llx\n", d64);
      return delta;

   case 0xEB: /* Jb (jump, byte offset) */
      if (haveF2orF3(pfx)) goto decode_failure;
      if (sz != 4)
         goto decode_failure; /* JRS added 2004 July 11 */
      d64 = (guest_RIP_bbstart+delta+1) + getSDisp8(delta);
      delta++;
      if (resteerOkFn(callback_opaque,d64)) {
         dres->whatNext   = Dis_ResteerU;
         dres->continueAt = d64;
      } else {
         jmp_lit(dres, Ijk_Boring, d64);
         vassert(dres->whatNext == Dis_StopHere);
      }
      DIP("jmp-8 0x%llx\n", d64);
      return delta;
   19477 
   case 0xF5: /* CMC */
   case 0xF8: /* CLC */
   case 0xF9: /* STC */
      /* All three: materialise the full rflags value, tweak the
         carry bit, and write the result back via the OP_COPY
         thunk encoding. */
      t1 = newTemp(Ity_I64);
      t2 = newTemp(Ity_I64);
      assign( t1, mk_amd64g_calculate_rflags_all() );
      switch (opc) {
         case 0xF5:
            /* complement carry */
            assign( t2, binop(Iop_Xor64, mkexpr(t1),
                                         mkU64(AMD64G_CC_MASK_C)));
            DIP("cmc\n");
            break;
         case 0xF8:
            /* clear carry */
            assign( t2, binop(Iop_And64, mkexpr(t1),
                                         mkU64(~AMD64G_CC_MASK_C)));
            DIP("clc\n");
            break;
         case 0xF9:
            /* set carry */
            assign( t2, binop(Iop_Or64, mkexpr(t1),
                                        mkU64(AMD64G_CC_MASK_C)));
            DIP("stc\n");
            break;
         default:
            vpanic("disInstr(x64)(cmc/clc/stc)");
      }
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t2) ));
      /* Set NDEP even though it isn't used.  This makes redundant-PUT
         elimination of previous stores to this field work better. */
      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
      return delta;

   case 0xF6: { /* Grp3 Eb */
      /* TEST/NOT/NEG/MUL/IMUL/DIV/IDIV on a byte operand; subopcode
         selected by the modrm reg field inside dis_Grp3. */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_Grp3 ( vbi, pfx, 1, delta, &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }

   case 0xF7: { /* Grp3 Ev */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_Grp3 ( vbi, pfx, sz, delta, &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }
   19526 
   case 0xFC: /* CLD */
      if (haveF2orF3(pfx)) goto decode_failure;
      /* DFLAG is kept as +1 (forward) / -1 (backward), not as a
         single flag bit. */
      stmt( IRStmt_Put( OFFB_DFLAG, mkU64(1)) );
      DIP("cld\n");
      return delta;

   case 0xFD: /* STD */
      if (haveF2orF3(pfx)) goto decode_failure;
      stmt( IRStmt_Put( OFFB_DFLAG, mkU64(-1ULL)) );
      DIP("std\n");
      return delta;

   case 0xFE: { /* Grp4 Eb */
      /* INC/DEC byte operand; subopcode handled by dis_Grp4. */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_Grp4 ( vbi, pfx, delta, &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }

   case 0xFF: { /* Grp5 Ev */
      /* INC/DEC/CALL/JMP/PUSH Ev; may end the block, hence dres is
         passed through. */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_Grp5 ( vbi, pfx, sz, delta, dres, &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }

   default:
      break;

   }

  decode_failure:
   /* Not decoded here: hand back the original delta so the caller
      can try the next decoder or report an invalid instruction. */
   return deltaIN; /* fail */
}
   19563 
   19564 
   19565 /*------------------------------------------------------------*/
   19566 /*---                                                      ---*/
   19567 /*--- Top-level post-escape decoders: dis_ESC_0F           ---*/
   19568 /*---                                                      ---*/
   19569 /*------------------------------------------------------------*/
   19570 
   19571 static IRTemp math_BSWAP ( IRTemp t1, IRType ty )
   19572 {
   19573    IRTemp t2 = newTemp(ty);
   19574    if (ty == Ity_I64) {
   19575       IRTemp m8  = newTemp(Ity_I64);
   19576       IRTemp s8  = newTemp(Ity_I64);
   19577       IRTemp m16 = newTemp(Ity_I64);
   19578       IRTemp s16 = newTemp(Ity_I64);
   19579       IRTemp m32 = newTemp(Ity_I64);
   19580       assign( m8, mkU64(0xFF00FF00FF00FF00ULL) );
   19581       assign( s8,
   19582               binop(Iop_Or64,
   19583                     binop(Iop_Shr64,
   19584                           binop(Iop_And64,mkexpr(t1),mkexpr(m8)),
   19585                           mkU8(8)),
   19586                     binop(Iop_And64,
   19587                           binop(Iop_Shl64,mkexpr(t1),mkU8(8)),
   19588                           mkexpr(m8))
   19589                    )
   19590             );
   19591 
   19592       assign( m16, mkU64(0xFFFF0000FFFF0000ULL) );
   19593       assign( s16,
   19594               binop(Iop_Or64,
   19595                     binop(Iop_Shr64,
   19596                           binop(Iop_And64,mkexpr(s8),mkexpr(m16)),
   19597                           mkU8(16)),
   19598                     binop(Iop_And64,
   19599                           binop(Iop_Shl64,mkexpr(s8),mkU8(16)),
   19600                           mkexpr(m16))
   19601                    )
   19602             );
   19603 
   19604       assign( m32, mkU64(0xFFFFFFFF00000000ULL) );
   19605       assign( t2,
   19606               binop(Iop_Or64,
   19607                     binop(Iop_Shr64,
   19608                           binop(Iop_And64,mkexpr(s16),mkexpr(m32)),
   19609                           mkU8(32)),
   19610                     binop(Iop_And64,
   19611                           binop(Iop_Shl64,mkexpr(s16),mkU8(32)),
   19612                           mkexpr(m32))
   19613                    )
   19614             );
   19615       return t2;
   19616    }
   19617    if (ty == Ity_I32) {
   19618       assign( t2,
   19619          binop(
   19620             Iop_Or32,
   19621             binop(Iop_Shl32, mkexpr(t1), mkU8(24)),
   19622             binop(
   19623                Iop_Or32,
   19624                binop(Iop_And32, binop(Iop_Shl32, mkexpr(t1), mkU8(8)),
   19625                                 mkU32(0x00FF0000)),
   19626                binop(Iop_Or32,
   19627                      binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(8)),
   19628                                       mkU32(0x0000FF00)),
   19629                      binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(24)),
   19630                                       mkU32(0x000000FF) )
   19631             )))
   19632       );
   19633       return t2;
   19634    }
   19635    if (ty == Ity_I16) {
   19636       assign(t2,
   19637              binop(Iop_Or16,
   19638                    binop(Iop_Shl16, mkexpr(t1), mkU8(8)),
   19639                    binop(Iop_Shr16, mkexpr(t1), mkU8(8)) ));
   19640       return t2;
   19641    }
   19642    vassert(0);
   19643    /*NOTREACHED*/
   19644    return IRTemp_INVALID;
   19645 }
   19646 
   19647 
/* Decode instructions whose opcode map is reached via a single 0F
   escape byte.  Reads the opcode at deltaIN and, on success, returns
   the delta just past the decoded instruction.  On failure, control
   reaches a decode_failure label beyond this chunk — presumably
   returning deltaIN unchanged, as the sibling decoders do (confirm
   against the function tail). */
__attribute__((noinline))
static
Long dis_ESC_0F (
        /*MB_OUT*/DisResult* dres,
        /*MB_OUT*/Bool*      expect_CAS,
        Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
        Bool         resteerCisOk,
        void*        callback_opaque,
        VexArchInfo* archinfo,
        VexAbiInfo*  vbi,
        Prefix pfx, Int sz, Long deltaIN
     )
{
   Long   d64   = 0;
   IRTemp addr  = IRTemp_INVALID;
   IRTemp t1    = IRTemp_INVALID;
   IRTemp t2    = IRTemp_INVALID;
   UChar  modrm = 0;
   Int    am_sz = 0;
   Int    alen  = 0;
   HChar  dis_buf[50];

   /* In the first switch, look for ordinary integer insns. */
   Long   delta = deltaIN;
   UChar  opc   = getUChar(delta);
   delta++;
   switch (opc) { /* first switch */

   case 0x01:
   {
      modrm = getUChar(delta);
      /* 0F 01 /0 -- SGDT */
      /* 0F 01 /1 -- SIDT */
      if (!epartIsReg(modrm)
          && (gregLO3ofRM(modrm) == 0 || gregLO3ofRM(modrm) == 1)) {
         /* This is really revolting, but ... since each processor
            (core) only has one IDT and one GDT, just let the guest
            see it (pass-through semantics).  I can't see any way to
            construct a faked-up value, so don't bother to try. */
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         switch (gregLO3ofRM(modrm)) {
            case 0: DIP("sgdt %s\n", dis_buf); break;
            case 1: DIP("sidt %s\n", dis_buf); break;
            default: vassert(0); /*NOTREACHED*/
         }
         IRDirty* d = unsafeIRDirty_0_N (
                          0/*regparms*/,
                          "amd64g_dirtyhelper_SxDT",
                          &amd64g_dirtyhelper_SxDT,
                          mkIRExprVec_2( mkexpr(addr),
                                         mkU64(gregLO3ofRM(modrm)) )
                      );
         /* declare we're writing memory */
         d->mFx   = Ifx_Write;
         d->mAddr = mkexpr(addr);
         /* NOTE(review): declares a 6-byte write, but 64-bit
            SGDT/SIDT store a 2-byte limit plus 8-byte base = 10
            bytes.  Confirm against what the helper actually
            writes. */
         d->mSize = 6;
         stmt( IRStmt_Dirty(d) );
         return delta;
      }
      /* 0F 01 D0 = XGETBV */
      if (modrm == 0xD0 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
         delta += 1;
         DIP("xgetbv\n");
         /* Fault (SEGV) if ECX isn't zero.  Intel docs say #GP and I
            am not sure if that translates in to SEGV or to something
            else, in user space. */
         t1 = newTemp(Ity_I32);
         assign( t1, getIReg32(R_RCX) );
         stmt( IRStmt_Exit(binop(Iop_CmpNE32, mkexpr(t1), mkU32(0)),
                           Ijk_SigSEGV,
                           IRConst_U64(guest_RIP_curr_instr),
                           OFFB_RIP
         ));
         /* Report XCR0 = 7 (RAX), with the high half (RDX) zero. */
         putIRegRAX(4, mkU32(7));
         putIRegRDX(4, mkU32(0));
         return delta;
      }
      /* else decode failed */
      break;
   }
   19729 
   case 0x05: /* SYSCALL */
      guest_RIP_next_mustcheck = True;
      guest_RIP_next_assumed = guest_RIP_bbstart + delta;
      /* SYSCALL leaves the return address in RCX. */
      putIReg64( R_RCX, mkU64(guest_RIP_next_assumed) );
      /* It's important that all guest state is up-to-date
         at this point.  So we declare an end-of-block here, which
         forces any cached guest state to be flushed. */
      jmp_lit(dres, Ijk_Sys_syscall, guest_RIP_next_assumed);
      vassert(dres->whatNext == Dis_StopHere);
      DIP("syscall\n");
      return delta;

   case 0x0B: /* UD2 */
      /* Deliberately-undefined opcode: end the block with a NoDecode
         jump so the guest receives the appropriate fault. */
      stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_curr_instr) ) );
      jmp_lit(dres, Ijk_NoDecode, guest_RIP_curr_instr);
      vassert(dres->whatNext == Dis_StopHere);
      DIP("ud2\n");
      return delta;

   case 0x0D: /* 0F 0D /0 -- prefetch mem8 */
              /* 0F 0D /1 -- prefetchw mem8 */
      /* Decoded only for the disassembly trace; no IR is generated
         beyond consuming the address mode. */
      if (have66orF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      if (epartIsReg(modrm)) goto decode_failure;
      if (gregLO3ofRM(modrm) != 0 && gregLO3ofRM(modrm) != 1)
         goto decode_failure;
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      delta += alen;
      switch (gregLO3ofRM(modrm)) {
         case 0: DIP("prefetch %s\n", dis_buf); break;
         case 1: DIP("prefetchw %s\n", dis_buf); break;
         default: vassert(0); /*NOTREACHED*/
      }
      return delta;

   case 0x1F:
      /* Multi-byte NOP with a memory operand: consume the address
         mode, emit nothing. */
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      if (epartIsReg(modrm)) goto decode_failure;
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      delta += alen;
      DIP("nop%c %s\n", nameISize(sz), dis_buf);
      return delta;

   case 0x31: { /* RDTSC */
      /* Timestamp counter is read via a dirty helper; the 64-bit
         result is split across EDX:EAX. */
      IRTemp   val  = newTemp(Ity_I64);
      IRExpr** args = mkIRExprVec_0();
      IRDirty* d    = unsafeIRDirty_1_N (
                         val,
                         0/*regparms*/,
                         "amd64g_dirtyhelper_RDTSC",
                         &amd64g_dirtyhelper_RDTSC,
                         args
                      );
      if (have66orF2orF3(pfx)) goto decode_failure;
      /* execute the dirty call, dumping the result in val. */
      stmt( IRStmt_Dirty(d) );
      putIRegRDX(4, unop(Iop_64HIto32, mkexpr(val)));
      putIRegRAX(4, unop(Iop_64to32, mkexpr(val)));
      DIP("rdtsc\n");
      return delta;
   }
   19792 
   /* 0F 40..4F: CMOVcc Ev,Gv — condition code is (opc - 0x40). */
   case 0x40:
   case 0x41:
   case 0x42: /* CMOVBb/CMOVNAEb (cmov below) */
   case 0x43: /* CMOVNBb/CMOVAEb (cmov not below) */
   case 0x44: /* CMOVZb/CMOVEb (cmov zero) */
   case 0x45: /* CMOVNZb/CMOVNEb (cmov not zero) */
   case 0x46: /* CMOVBEb/CMOVNAb (cmov below or equal) */
   case 0x47: /* CMOVNBEb/CMOVAb (cmov not below or equal) */
   case 0x48: /* CMOVSb (cmov negative) */
   case 0x49: /* CMOVSb (cmov not negative) */
   case 0x4A: /* CMOVP (cmov parity even) */
   case 0x4B: /* CMOVNP (cmov parity odd) */
   case 0x4C: /* CMOVLb/CMOVNGEb (cmov less) */
   case 0x4D: /* CMOVGEb/CMOVNLb (cmov greater or equal) */
   case 0x4E: /* CMOVLEb/CMOVNGb (cmov less or equal) */
   case 0x4F: /* CMOVGb/CMOVNLEb (cmov greater) */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_cmov_E_G(vbi, pfx, sz, (AMD64Condcode)(opc - 0x40), delta);
      return delta;

   /* 0F 80..8F: Jcc rel32 — condition code is (opc - 0x80). */
   case 0x80:
   case 0x81:
   case 0x82:   /* JBb/JNAEb (jump below) */
   case 0x83:   /* JNBb/JAEb (jump not below) */
   case 0x84:   /* JZb/JEb (jump zero) */
   case 0x85:   /* JNZb/JNEb (jump not zero) */
   case 0x86:   /* JBEb/JNAb (jump below or equal) */
   case 0x87:   /* JNBEb/JAb (jump not below or equal) */
   case 0x88:   /* JSb (jump negative) */
   case 0x89:   /* JSb (jump not negative) */
   case 0x8A:   /* JP (jump parity even) */
   case 0x8B:   /* JNP/JPO (jump parity odd) */
   case 0x8C:   /* JLb/JNGEb (jump less) */
   case 0x8D:   /* JGEb/JNLb (jump greater or equal) */
   case 0x8E:   /* JLEb/JNGb (jump less or equal) */
   case 0x8F: { /* JGb/JNLEb (jump greater) */
      Long   jmpDelta;
      HChar* comment  = "";
      if (haveF2orF3(pfx)) goto decode_failure;
      jmpDelta = getSDisp32(delta);
      d64 = (guest_RIP_bbstart+delta+4) + jmpDelta;
      delta += 4;
      if (resteerCisOk
          && vex_control.guest_chase_cond
          && (Addr64)d64 != (Addr64)guest_RIP_bbstart
          && jmpDelta < 0
          && resteerOkFn( callback_opaque, d64) ) {
         /* Speculation: assume this backward branch is taken.  So
            we need to emit a side-exit to the insn following this
            one, on the negation of the condition, and continue at
            the branch target address (d64).  If we wind up back at
            the first instruction of the trace, just stop; it's
            better to let the IR loop unroller handle that case. */
         stmt( IRStmt_Exit(
                  mk_amd64g_calculate_condition(
                     (AMD64Condcode)(1 ^ (opc - 0x80))),
                  Ijk_Boring,
                  IRConst_U64(guest_RIP_bbstart+delta),
                  OFFB_RIP
             ));
         dres->whatNext   = Dis_ResteerC;
         dres->continueAt = d64;
         comment = "(assumed taken)";
      }
      else
      if (resteerCisOk
          && vex_control.guest_chase_cond
          && (Addr64)d64 != (Addr64)guest_RIP_bbstart
          && jmpDelta >= 0
          && resteerOkFn( callback_opaque, guest_RIP_bbstart+delta ) ) {
         /* Speculation: assume this forward branch is not taken.
            So we need to emit a side-exit to d64 (the dest) and
            continue disassembling at the insn immediately
            following this one. */
         stmt( IRStmt_Exit(
                  mk_amd64g_calculate_condition((AMD64Condcode)
                                                (opc - 0x80)),
                  Ijk_Boring,
                  IRConst_U64(d64),
                  OFFB_RIP
             ));
         dres->whatNext   = Dis_ResteerC;
         dres->continueAt = guest_RIP_bbstart+delta;
         comment = "(assumed not taken)";
      }
      else {
         /* Conservative default translation - end the block at
            this point. */
         jcc_01( dres, (AMD64Condcode)(opc - 0x80),
                 guest_RIP_bbstart+delta, d64 );
         vassert(dres->whatNext == Dis_StopHere);
      }
      DIP("j%s-32 0x%llx %s\n", name_AMD64Condcode(opc - 0x80), d64, comment);
      return delta;
   }

   /* 0F 90..9F: SETcc Eb — condition code is (opc - 0x90). */
   case 0x90:
   case 0x91:
   case 0x92: /* set-Bb/set-NAEb (set if below) */
   case 0x93: /* set-NBb/set-AEb (set if not below) */
   case 0x94: /* set-Zb/set-Eb (set if zero) */
   case 0x95: /* set-NZb/set-NEb (set if not zero) */
   case 0x96: /* set-BEb/set-NAb (set if below or equal) */
   case 0x97: /* set-NBEb/set-Ab (set if not below or equal) */
   case 0x98: /* set-Sb (set if negative) */
   case 0x99: /* set-Sb (set if not negative) */
   case 0x9A: /* set-P (set if parity even) */
   case 0x9B: /* set-NP (set if parity odd) */
   case 0x9C: /* set-Lb/set-NGEb (set if less) */
   case 0x9D: /* set-GEb/set-NLb (set if greater or equal) */
   case 0x9E: /* set-LEb/set-NGb (set if less or equal) */
   case 0x9F: /* set-Gb/set-NLEb (set if greater) */
      if (haveF2orF3(pfx)) goto decode_failure;
      /* t1 = 0 or 1, depending on the condition. */
      t1 = newTemp(Ity_I8);
      assign( t1, unop(Iop_1Uto8,mk_amd64g_calculate_condition(opc-0x90)) );
      modrm = getUChar(delta);
      if (epartIsReg(modrm)) {
         delta++;
         putIRegE(1, pfx, modrm, mkexpr(t1));
         DIP("set%s %s\n", name_AMD64Condcode(opc-0x90),
                           nameIRegE(1,pfx,modrm));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         storeLE( mkexpr(addr), mkexpr(t1) );
         DIP("set%s %s\n", name_AMD64Condcode(opc-0x90), dis_buf);
      }
      return delta;
   19921 
   case 0xA2: { /* CPUID */
      /* Uses dirty helper:
            void amd64g_dirtyhelper_CPUID ( VexGuestAMD64State* )
         declared to mod rax, wr rbx, rcx, rdx
      */
      IRDirty* d     = NULL;
      HChar*   fName = NULL;
      void*    fAddr = NULL;
      if (haveF2orF3(pfx)) goto decode_failure;
      /* Pick the helper whose faked CPUID results match the hwcaps
         we are claiming for this guest. */
      if (archinfo->hwcaps == (VEX_HWCAPS_AMD64_SSE3
                               |VEX_HWCAPS_AMD64_CX16
                               |VEX_HWCAPS_AMD64_AVX)) {
         fName = "amd64g_dirtyhelper_CPUID_avx_and_cx16";
         fAddr = &amd64g_dirtyhelper_CPUID_avx_and_cx16;
         /* This is a Core-i5-2300-like machine */
      }
      else if (archinfo->hwcaps == (VEX_HWCAPS_AMD64_SSE3
                                    |VEX_HWCAPS_AMD64_CX16)) {
         fName = "amd64g_dirtyhelper_CPUID_sse42_and_cx16";
         fAddr = &amd64g_dirtyhelper_CPUID_sse42_and_cx16;
         /* This is a Core-i5-670-like machine */
      }
      else {
         /* Give a CPUID for at least a baseline machine, SSE2
            only, and no CX16 */
         fName = "amd64g_dirtyhelper_CPUID_baseline";
         fAddr = &amd64g_dirtyhelper_CPUID_baseline;
      }

      vassert(fName); vassert(fAddr);
      d = unsafeIRDirty_0_N ( 0/*regparms*/,
                              fName, fAddr, mkIRExprVec_0() );
      /* declare guest state effects */
      d->needsBBP = True;
      d->nFxState = 4;
      vex_bzero(&d->fxState, sizeof(d->fxState));
      /* RAX is read (leaf selector) and written; RCX likewise
         (sub-leaf selector); RBX and RDX are write-only. */
      d->fxState[0].fx     = Ifx_Modify;
      d->fxState[0].offset = OFFB_RAX;
      d->fxState[0].size   = 8;
      d->fxState[1].fx     = Ifx_Write;
      d->fxState[1].offset = OFFB_RBX;
      d->fxState[1].size   = 8;
      d->fxState[2].fx     = Ifx_Modify;
      d->fxState[2].offset = OFFB_RCX;
      d->fxState[2].size   = 8;
      d->fxState[3].fx     = Ifx_Write;
      d->fxState[3].offset = OFFB_RDX;
      d->fxState[3].size   = 8;
      /* execute the dirty call, side-effecting guest state */
      stmt( IRStmt_Dirty(d) );
      /* CPUID is a serialising insn.  So, just in case someone is
         using it as a memory fence ... */
      stmt( IRStmt_MBE(Imbe_Fence) );
      DIP("cpuid\n");
      return delta;
   }
   19978 
   case 0xA3: /* BT Gv,Ev */
      if (haveF2orF3(pfx)) goto decode_failure;
      if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
      delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpNone );
      return delta;

   case 0xA4: /* SHLDv imm8,Gv,Ev */
      modrm = getUChar(delta);
      /* The imm8 shift count follows the address mode; peek at it
         (at offset d64) without consuming it here. */
      d64   = delta + lengthAMode(pfx, delta);
      vex_sprintf(dis_buf, "$%d", (Int)getUChar(d64));
      delta = dis_SHLRD_Gv_Ev (
                 vbi, pfx, delta, modrm, sz,
                 mkU8(getUChar(d64)), True, /* literal */
                 dis_buf, True /* left */ );
      return delta;

   case 0xA5: /* SHLDv %cl,Gv,Ev */
      modrm = getUChar(delta);
      delta = dis_SHLRD_Gv_Ev (
                 vbi, pfx, delta, modrm, sz,
                 getIRegCL(), False, /* not literal */
                 "%cl", True /* left */ );
      return delta;

   case 0xAB: /* BTS Gv,Ev */
      if (haveF2orF3(pfx)) goto decode_failure;
      if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
      delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpSet );
      return delta;

   case 0xAC: /* SHRDv imm8,Gv,Ev */
      modrm = getUChar(delta);
      d64   = delta + lengthAMode(pfx, delta);
      vex_sprintf(dis_buf, "$%d", (Int)getUChar(d64));
      delta = dis_SHLRD_Gv_Ev (
                 vbi, pfx, delta, modrm, sz,
                 mkU8(getUChar(d64)), True, /* literal */
                 dis_buf, False /* right */ );
      return delta;

   case 0xAD: /* SHRDv %cl,Gv,Ev */
      modrm = getUChar(delta);
      delta = dis_SHLRD_Gv_Ev (
                 vbi, pfx, delta, modrm, sz,
                 getIRegCL(), False, /* not literal */
                 "%cl", False /* right */);
      return delta;

   case 0xAF: /* IMUL Ev, Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_mul_E_G ( vbi, pfx, sz, delta );
      return delta;

   case 0xB1: { /* CMPXCHG Gv,Ev (allowed in 16,32,64 bit) */
      Bool ok = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      if (sz != 2 && sz != 4 && sz != 8) goto decode_failure;
      delta = dis_cmpxchg_G_E ( &ok, vbi, pfx, sz, delta );
      if (!ok) goto decode_failure;
      return delta;
   }

   case 0xB0: { /* CMPXCHG Gb,Eb */
      Bool ok = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_cmpxchg_G_E ( &ok, vbi, pfx, 1, delta );
      if (!ok) goto decode_failure;
      return delta;
   }

   case 0xB3: /* BTR Gv,Ev */
      if (haveF2orF3(pfx)) goto decode_failure;
      if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
      delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpReset );
      return delta;

   case 0xB6: /* MOVZXb Eb,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      if (sz != 2 && sz != 4 && sz != 8)
         goto decode_failure;
      delta = dis_movx_E_G ( vbi, pfx, delta, 1, sz, False );
   20060       return delta;
   20061 
   20062    case 0xB7: /* MOVZXw Ew,Gv */
   20063       if (haveF2orF3(pfx)) goto decode_failure;
   20064       if (sz != 4 && sz != 8)
   20065          goto decode_failure;
   20066       delta = dis_movx_E_G ( vbi, pfx, delta, 2, sz, False );
   20067       return delta;
   20068 
   20069    case 0xBA: { /* Grp8 Ib,Ev */
   20070       Bool decode_OK = False;
   20071       if (haveF2orF3(pfx)) goto decode_failure;
   20072       modrm = getUChar(delta);
   20073       am_sz = lengthAMode(pfx,delta);
   20074       d64   = getSDisp8(delta + am_sz);
   20075       delta = dis_Grp8_Imm ( vbi, pfx, delta, modrm, am_sz, sz, d64,
   20076                              &decode_OK );
   20077       if (!decode_OK)
   20078          goto decode_failure;
   20079       return delta;
   20080    }
   20081 
   20082    case 0xBB: /* BTC Gv,Ev */
   20083       if (haveF2orF3(pfx)) goto decode_failure;
   20084       if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
   20085       delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpComp );
   20086       return delta;
   20087 
   20088    case 0xBC: /* BSF Gv,Ev */
   20089       if (haveF2(pfx)) goto decode_failure;
   20090       delta = dis_bs_E_G ( vbi, pfx, sz, delta, True );
   20091       return delta;
   20092 
   20093    case 0xBD: /* BSR Gv,Ev */
   20094       if (!haveF2orF3(pfx)
   20095           || (haveF3noF2(pfx)
   20096               && 0 == (archinfo->hwcaps & VEX_HWCAPS_AMD64_LZCNT))) {
   20097          /* no-F2 no-F3 0F BD = BSR
   20098                   or F3 0F BD = REP; BSR on older CPUs.  */
   20099          delta = dis_bs_E_G ( vbi, pfx, sz, delta, False );
   20100          return delta;
   20101       }
   20102       /* Fall through, since F3 0F BD is LZCNT, and needs to
   20103          be handled by dis_ESC_0F__SSE4. */
   20104       break;
   20105 
   20106    case 0xBE: /* MOVSXb Eb,Gv */
   20107       if (haveF2orF3(pfx)) goto decode_failure;
   20108       if (sz != 2 && sz != 4 && sz != 8)
   20109          goto decode_failure;
   20110       delta = dis_movx_E_G ( vbi, pfx, delta, 1, sz, True );
   20111       return delta;
   20112 
   20113    case 0xBF: /* MOVSXw Ew,Gv */
   20114       if (haveF2orF3(pfx)) goto decode_failure;
   20115       if (sz != 4 && sz != 8)
   20116          goto decode_failure;
   20117       delta = dis_movx_E_G ( vbi, pfx, delta, 2, sz, True );
   20118       return delta;
   20119 
   20120    case 0xC1: { /* XADD Gv,Ev */
   20121       Bool decode_OK = False;
   20122       delta = dis_xadd_G_E ( &decode_OK, vbi, pfx, sz, delta );
   20123       if (!decode_OK)
   20124          goto decode_failure;
   20125       return delta;
   20126    }
   20127 
   20128    case 0xC7: { /* CMPXCHG8B Ev, CMPXCHG16B Ev */
   20129       IRType  elemTy     = sz==4 ? Ity_I32 : Ity_I64;
   20130       IRTemp  expdHi     = newTemp(elemTy);
   20131       IRTemp  expdLo     = newTemp(elemTy);
   20132       IRTemp  dataHi     = newTemp(elemTy);
   20133       IRTemp  dataLo     = newTemp(elemTy);
   20134       IRTemp  oldHi      = newTemp(elemTy);
   20135       IRTemp  oldLo      = newTemp(elemTy);
   20136       IRTemp  flags_old  = newTemp(Ity_I64);
   20137       IRTemp  flags_new  = newTemp(Ity_I64);
   20138       IRTemp  success    = newTemp(Ity_I1);
   20139       IROp    opOR       = sz==4 ? Iop_Or32    : Iop_Or64;
   20140       IROp    opXOR      = sz==4 ? Iop_Xor32   : Iop_Xor64;
   20141       IROp    opCasCmpEQ = sz==4 ? Iop_CasCmpEQ32 : Iop_CasCmpEQ64;
   20142       IRExpr* zero       = sz==4 ? mkU32(0)    : mkU64(0);
   20143       IRTemp expdHi64    = newTemp(Ity_I64);
   20144       IRTemp expdLo64    = newTemp(Ity_I64);
   20145 
   20146       /* Translate this using a DCAS, even if there is no LOCK
   20147          prefix.  Life is too short to bother with generating two
   20148          different translations for the with/without-LOCK-prefix
   20149          cases. */
   20150       *expect_CAS = True;
   20151 
   20152       /* Decode, and generate address. */
   20153       if (have66orF2orF3(pfx)) goto decode_failure;
   20154       if (sz != 4 && sz != 8) goto decode_failure;
   20155       if (sz == 8 && !(archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16))
   20156          goto decode_failure;
   20157       modrm = getUChar(delta);
   20158       if (epartIsReg(modrm)) goto decode_failure;
   20159       if (gregLO3ofRM(modrm) != 1) goto decode_failure;
   20160       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   20161       delta += alen;
   20162 
   20163       /* cmpxchg16b requires an alignment check. */
   20164       if (sz == 8)
   20165          gen_SEGV_if_not_16_aligned( addr );
   20166 
   20167       /* Get the expected and new values. */
   20168       assign( expdHi64, getIReg64(R_RDX) );
   20169       assign( expdLo64, getIReg64(R_RAX) );
   20170 
   20171       /* These are the correctly-sized expected and new values.
   20172          However, we also get expdHi64/expdLo64 above as 64-bits
   20173          regardless, because we will need them later in the 32-bit
   20174          case (paradoxically). */
   20175       assign( expdHi, sz==4 ? unop(Iop_64to32, mkexpr(expdHi64))
   20176                             : mkexpr(expdHi64) );
   20177       assign( expdLo, sz==4 ? unop(Iop_64to32, mkexpr(expdLo64))
   20178                             : mkexpr(expdLo64) );
   20179       assign( dataHi, sz==4 ? getIReg32(R_RCX) : getIReg64(R_RCX) );
   20180       assign( dataLo, sz==4 ? getIReg32(R_RBX) : getIReg64(R_RBX) );
   20181 
   20182       /* Do the DCAS */
   20183       stmt( IRStmt_CAS(
   20184                mkIRCAS( oldHi, oldLo,
   20185                         Iend_LE, mkexpr(addr),
   20186                         mkexpr(expdHi), mkexpr(expdLo),
   20187                         mkexpr(dataHi), mkexpr(dataLo)
   20188             )));
   20189 
   20190       /* success when oldHi:oldLo == expdHi:expdLo */
   20191       assign( success,
   20192               binop(opCasCmpEQ,
   20193                     binop(opOR,
   20194                           binop(opXOR, mkexpr(oldHi), mkexpr(expdHi)),
   20195                           binop(opXOR, mkexpr(oldLo), mkexpr(expdLo))
   20196                     ),
   20197                     zero
   20198               ));
   20199 
   20200       /* If the DCAS is successful, that is to say oldHi:oldLo ==
   20201          expdHi:expdLo, then put expdHi:expdLo back in RDX:RAX,
   20202          which is where they came from originally.  Both the actual
   20203          contents of these two regs, and any shadow values, are
   20204          unchanged.  If the DCAS fails then we're putting into
   20205          RDX:RAX the value seen in memory. */
   20206       /* Now of course there's a complication in the 32-bit case
   20207          (bah!): if the DCAS succeeds, we need to leave RDX:RAX
   20208          unchanged; but if we use the same scheme as in the 64-bit
   20209          case, we get hit by the standard rule that a write to the
   20210          bottom 32 bits of an integer register zeros the upper 32
   20211          bits.  And so the upper halves of RDX and RAX mysteriously
   20212          become zero.  So we have to stuff back in the original
   20213          64-bit values which we previously stashed in
   20214          expdHi64:expdLo64, even if we're doing a cmpxchg8b. */
   20215       /* It's just _so_ much fun ... */
   20216       putIRegRDX( 8,
   20217                   IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(success)),
   20218                                 sz == 4 ? unop(Iop_32Uto64, mkexpr(oldHi))
   20219                                         : mkexpr(oldHi),
   20220                                 mkexpr(expdHi64)
   20221                 ));
   20222       putIRegRAX( 8,
   20223                   IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(success)),
   20224                                 sz == 4 ? unop(Iop_32Uto64, mkexpr(oldLo))
   20225                                         : mkexpr(oldLo),
   20226                                 mkexpr(expdLo64)
   20227                 ));
   20228 
   20229       /* Copy the success bit into the Z flag and leave the others
   20230          unchanged */
   20231       assign( flags_old, widenUto64(mk_amd64g_calculate_rflags_all()));
   20232       assign(
   20233          flags_new,
   20234          binop(Iop_Or64,
   20235                binop(Iop_And64, mkexpr(flags_old),
   20236                                 mkU64(~AMD64G_CC_MASK_Z)),
   20237                binop(Iop_Shl64,
   20238                      binop(Iop_And64,
   20239                            unop(Iop_1Uto64, mkexpr(success)), mkU64(1)),
   20240                      mkU8(AMD64G_CC_SHIFT_Z)) ));
   20241 
   20242       stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   20243       stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(flags_new) ));
   20244       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   20245       /* Set NDEP even though it isn't used.  This makes
   20246          redundant-PUT elimination of previous stores to this field
   20247          work better. */
   20248       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   20249 
   20250       /* Sheesh.  Aren't you glad it was me and not you that had to
   20251          write and validate all this grunge? */
   20252 
   20253       DIP("cmpxchg8b %s\n", dis_buf);
   20254       return delta;
   20255    }
   20256 
   20257    case 0xC8: /* BSWAP %eax */
   20258    case 0xC9:
   20259    case 0xCA:
   20260    case 0xCB:
   20261    case 0xCC:
   20262    case 0xCD:
   20263    case 0xCE:
   20264    case 0xCF: /* BSWAP %edi */
   20265       if (haveF2orF3(pfx)) goto decode_failure;
   20266       /* According to the AMD64 docs, this insn can have size 4 or
   20267          8. */
   20268       if (sz == 4) {
   20269          t1 = newTemp(Ity_I32);
   20270          assign( t1, getIRegRexB(4, pfx, opc-0xC8) );
   20271          t2 = math_BSWAP( t1, Ity_I32 );
   20272          putIRegRexB(4, pfx, opc-0xC8, mkexpr(t2));
   20273          DIP("bswapl %s\n", nameIRegRexB(4, pfx, opc-0xC8));
   20274          return delta;
   20275       }
   20276       if (sz == 8) {
   20277          t1 = newTemp(Ity_I64);
   20278          t2 = newTemp(Ity_I64);
   20279          assign( t1, getIRegRexB(8, pfx, opc-0xC8) );
   20280          t2 = math_BSWAP( t1, Ity_I64 );
   20281          putIRegRexB(8, pfx, opc-0xC8, mkexpr(t2));
   20282          DIP("bswapq %s\n", nameIRegRexB(8, pfx, opc-0xC8));
   20283          return delta;
   20284       }
   20285       goto decode_failure;
   20286 
   20287    default:
   20288       break;
   20289 
   20290    } /* first switch */
   20291 
   20292 
   20293    /* =-=-=-=-=-=-=-=-= MMXery =-=-=-=-=-=-=-=-= */
   20294    /* In the second switch, pick off MMX insns. */
   20295 
   20296    if (!have66orF2orF3(pfx)) {
   20297       /* So there's no SIMD prefix. */
   20298 
   20299       vassert(sz == 4 || sz == 8);
   20300 
   20301       switch (opc) { /* second switch */
   20302 
   20303       case 0x71:
   20304       case 0x72:
   20305       case 0x73: /* PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
   20306 
   20307       case 0x6E: /* MOVD (src)ireg-or-mem, (dst)mmxreg */
   20308       case 0x7E: /* MOVD (src)mmxreg, (dst)ireg-or-mem */
   20309       case 0x7F: /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
   20310       case 0x6F: /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
   20311 
   20312       case 0xFC:
   20313       case 0xFD:
   20314       case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
   20315 
   20316       case 0xEC:
   20317       case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
   20318 
   20319       case 0xDC:
   20320       case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   20321 
   20322       case 0xF8:
   20323       case 0xF9:
   20324       case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
   20325 
   20326       case 0xE8:
   20327       case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
   20328 
   20329       case 0xD8:
   20330       case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   20331 
   20332       case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
   20333       case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
   20334 
   20335       case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
   20336 
   20337       case 0x74:
   20338       case 0x75:
   20339       case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
   20340 
   20341       case 0x64:
   20342       case 0x65:
   20343       case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
   20344 
   20345       case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
   20346       case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
   20347       case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
   20348 
   20349       case 0x68:
   20350       case 0x69:
   20351       case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
   20352 
   20353       case 0x60:
   20354       case 0x61:
   20355       case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
   20356 
   20357       case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
   20358       case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
   20359       case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
   20360       case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
   20361 
   20362       case 0xF1: /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
   20363       case 0xF2:
   20364       case 0xF3:
   20365 
   20366       case 0xD1: /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
   20367       case 0xD2:
   20368       case 0xD3:
   20369 
   20370       case 0xE1: /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
   20371       case 0xE2: {
   20372          Bool decode_OK = False;
   20373          delta = dis_MMX ( &decode_OK, vbi, pfx, sz, deltaIN );
   20374          if (decode_OK)
   20375             return delta;
   20376          goto decode_failure;
   20377       }
   20378 
   20379       default:
   20380          break;
   20381       } /* second switch */
   20382 
   20383    }
   20384 
   20385    /* A couple of MMX corner cases */
   20386    if (opc == 0x0E/* FEMMS */ || opc == 0x77/* EMMS */) {
   20387       if (sz != 4)
   20388          goto decode_failure;
   20389       do_EMMS_preamble();
   20390       DIP("{f}emms\n");
   20391       return delta;
   20392    }
   20393 
   20394    /* =-=-=-=-=-=-=-=-= SSE2ery =-=-=-=-=-=-=-=-= */
   20395    /* Perhaps it's an SSE or SSE2 instruction.  We can try this
   20396       without checking the guest hwcaps because SSE2 is a baseline
   20397       facility in 64 bit mode. */
   20398    {
   20399       Bool decode_OK = False;
   20400       delta = dis_ESC_0F__SSE2 ( &decode_OK, vbi, pfx, sz, deltaIN, dres );
   20401       if (decode_OK)
   20402          return delta;
   20403    }
   20404 
   20405    /* =-=-=-=-=-=-=-=-= SSE3ery =-=-=-=-=-=-=-=-= */
   20406    /* Perhaps it's a SSE3 instruction.  FIXME: check guest hwcaps
   20407       first. */
   20408    {
   20409       Bool decode_OK = False;
   20410       delta = dis_ESC_0F__SSE3 ( &decode_OK, vbi, pfx, sz, deltaIN );
   20411       if (decode_OK)
   20412          return delta;
   20413    }
   20414 
   20415    /* =-=-=-=-=-=-=-=-= SSE4ery =-=-=-=-=-=-=-=-= */
   20416    /* Perhaps it's a SSE4 instruction.  FIXME: check guest hwcaps
   20417       first. */
   20418    {
   20419       Bool decode_OK = False;
   20420       delta = dis_ESC_0F__SSE4 ( &decode_OK,
   20421                                  archinfo, vbi, pfx, sz, deltaIN );
   20422       if (decode_OK)
   20423          return delta;
   20424    }
   20425 
   20426   decode_failure:
   20427    return deltaIN; /* fail */
   20428 }
   20429 
   20430 
   20431 /*------------------------------------------------------------*/
   20432 /*---                                                      ---*/
   20433 /*--- Top-level post-escape decoders: dis_ESC_0F38         ---*/
   20434 /*---                                                      ---*/
   20435 /*------------------------------------------------------------*/
   20436 
/* Top-level decoder for instructions in the 0F 38 escape space.
   Handles MOVBE inline; everything else is delegated to the SSSE3
   and SSE4 sub-decoders.  Returns the updated instruction offset on
   success, or deltaIN unchanged to signal decode failure.  Note that
   dres/resteerOkFn/resteerCisOk/callback_opaque/archinfo are unused
   here but kept for signature uniformity with the other dis_ESC_*
   decoders. */
__attribute__((noinline))
static
Long dis_ESC_0F38 (
        /*MB_OUT*/DisResult* dres,
        Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
        Bool         resteerCisOk,
        void*        callback_opaque,
        VexArchInfo* archinfo,
        VexAbiInfo*  vbi,
        Prefix pfx, Int sz, Long deltaIN
     )
{
   Long   delta = deltaIN;
   UChar  opc   = getUChar(delta);  /* opcode byte following 0F 38 */
   delta++;
   switch (opc) {

   case 0xF0:   /* 0F 38 F0 = MOVBE m16/32/64(E), r16/32/64(G) */
   case 0xF1: { /* 0F 38 F1 = MOVBE r16/32/64(G), m16/32/64(E) */
      /* MOVBE: byte-swapping load/store.  Only valid with no F2/F3
         or VEX prefix and a 2/4/8-byte operand size. */
      if (!haveF2orF3(pfx) && !haveVEX(pfx)
          && (sz == 2 || sz == 4 || sz == 8)) {
         IRTemp addr  = IRTemp_INVALID;
         UChar  modrm = 0;
         Int    alen  = 0;
         HChar  dis_buf[50];
         modrm = getUChar(delta);
         /* The E operand must be memory; break out (rather than
            decode-fail directly) so the decoders below get a look. */
         if (epartIsReg(modrm)) break;
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         IRType ty = szToITy(sz);
         IRTemp src = newTemp(ty);
         if (opc == 0xF0) { /* LOAD */
            /* G := byteswap( load(E) ) */
            assign(src, loadLE(ty, mkexpr(addr)));
            IRTemp dst = math_BSWAP(src, ty);
            putIRegG(sz, pfx, modrm, mkexpr(dst));
            DIP("movbe %s,%s\n", dis_buf, nameIRegG(sz, pfx, modrm));
         } else { /* STORE */
            /* store(E) := byteswap( G ) */
            assign(src, getIRegG(sz, pfx, modrm));
            IRTemp dst = math_BSWAP(src, ty);
            storeLE(mkexpr(addr), mkexpr(dst));
            DIP("movbe %s,%s\n", nameIRegG(sz, pfx, modrm), dis_buf);
         }
         return delta;
      }
      /* else fall through; maybe one of the decoders below knows what
         it is. */
      break;
   }

   default:
      break;

   }

   /* =-=-=-=-=-=-=-=-= SSSE3ery =-=-=-=-=-=-=-=-= */
   /* Perhaps it's an SSSE3 instruction.  FIXME: consult guest hwcaps
      rather than proceeding indiscriminately. */
   /* NOTE: the sub-decoders are handed deltaIN, not delta -- they
      re-read starting at the opcode byte consumed above. */
   {
      Bool decode_OK = False;
      delta = dis_ESC_0F38__SupSSE3 ( &decode_OK, vbi, pfx, sz, deltaIN );
      if (decode_OK)
         return delta;
   }

   /* =-=-=-=-=-=-=-=-= SSE4ery =-=-=-=-=-=-=-=-= */
   /* Perhaps it's an SSE4 instruction.  FIXME: consult guest hwcaps
      rather than proceeding indiscriminately. */
   {
      Bool decode_OK = False;
      delta = dis_ESC_0F38__SSE4 ( &decode_OK, vbi, pfx, sz, deltaIN );
      if (decode_OK)
         return delta;
   }

  /*decode_failure:*/
   return deltaIN; /* fail */
}
   20514 
   20515 
   20516 /*------------------------------------------------------------*/
   20517 /*---                                                      ---*/
   20518 /*--- Top-level post-escape decoders: dis_ESC_0F3A         ---*/
   20519 /*---                                                      ---*/
   20520 /*------------------------------------------------------------*/
   20521 
   20522 __attribute__((noinline))
   20523 static
   20524 Long dis_ESC_0F3A (
   20525         /*MB_OUT*/DisResult* dres,
   20526         Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
   20527         Bool         resteerCisOk,
   20528         void*        callback_opaque,
   20529         VexArchInfo* archinfo,
   20530         VexAbiInfo*  vbi,
   20531         Prefix pfx, Int sz, Long deltaIN
   20532      )
   20533 {
   20534    Long   delta = deltaIN;
   20535    UChar  opc   = getUChar(delta);
   20536    delta++;
   20537    switch (opc) {
   20538 
   20539    default:
   20540       break;
   20541 
   20542    }
   20543 
   20544    /* =-=-=-=-=-=-=-=-= SSSE3ery =-=-=-=-=-=-=-=-= */
   20545    /* Perhaps it's an SSSE3 instruction.  FIXME: consult guest hwcaps
   20546       rather than proceeding indiscriminately. */
   20547    {
   20548       Bool decode_OK = False;
   20549       delta = dis_ESC_0F3A__SupSSE3 ( &decode_OK, vbi, pfx, sz, deltaIN );
   20550       if (decode_OK)
   20551          return delta;
   20552    }
   20553 
   20554    /* =-=-=-=-=-=-=-=-= SSE4ery =-=-=-=-=-=-=-=-= */
   20555    /* Perhaps it's an SSE4 instruction.  FIXME: consult guest hwcaps
   20556       rather than proceeding indiscriminately. */
   20557    {
   20558       Bool decode_OK = False;
   20559       delta = dis_ESC_0F3A__SSE4 ( &decode_OK, vbi, pfx, sz, deltaIN );
   20560       if (decode_OK)
   20561          return delta;
   20562    }
   20563 
   20564    return deltaIN; /* fail */
   20565 }
   20566 
   20567 
   20568 /*------------------------------------------------------------*/
   20569 /*---                                                      ---*/
   20570 /*--- Top-level post-escape decoders: dis_ESC_0F__VEX      ---*/
   20571 /*---                                                      ---*/
   20572 /*------------------------------------------------------------*/
   20573 
   20574 /* FIXME: common up with the _256_ version below? */
   20575 static
   20576 Long dis_VEX_NDS_128_AnySimdPfx_0F_WIG (
   20577         /*OUT*/Bool* uses_vvvv, VexAbiInfo* vbi,
   20578         Prefix pfx, Long delta, HChar* name,
   20579         /* The actual operation.  Use either 'op' or 'opfn',
   20580            but not both. */
   20581         IROp op, IRTemp(*opFn)(IRTemp,IRTemp),
   20582         Bool invertLeftArg,
   20583         Bool swapArgs
   20584      )
   20585 {
   20586    UChar  modrm = getUChar(delta);
   20587    UInt   rD    = gregOfRexRM(pfx, modrm);
   20588    UInt   rSL   = getVexNvvvv(pfx);
   20589    IRTemp tSL   = newTemp(Ity_V128);
   20590    IRTemp tSR   = newTemp(Ity_V128);
   20591    IRTemp addr  = IRTemp_INVALID;
   20592    HChar  dis_buf[50];
   20593    Int    alen  = 0;
   20594    vassert(0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*WIG?*/);
   20595 
   20596    assign(tSL, invertLeftArg ? unop(Iop_NotV128, getXMMReg(rSL))
   20597                              : getXMMReg(rSL));
   20598 
   20599    if (epartIsReg(modrm)) {
   20600       UInt rSR = eregOfRexRM(pfx, modrm);
   20601       delta += 1;
   20602       assign(tSR, getXMMReg(rSR));
   20603       DIP("%s %s,%s,%s\n",
   20604           name, nameXMMReg(rSR), nameXMMReg(rSL), nameXMMReg(rD));
   20605    } else {
   20606       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   20607       delta += alen;
   20608       assign(tSR, loadLE(Ity_V128, mkexpr(addr)));
   20609       DIP("%s %s,%s,%s\n",
   20610           name, dis_buf, nameXMMReg(rSL), nameXMMReg(rD));
   20611    }
   20612 
   20613    IRTemp res = IRTemp_INVALID;
   20614    if (op != Iop_INVALID) {
   20615       vassert(opFn == NULL);
   20616       res = newTemp(Ity_V128);
   20617       assign(res, swapArgs ? binop(op, mkexpr(tSR), mkexpr(tSL))
   20618                            : binop(op, mkexpr(tSL), mkexpr(tSR)));
   20619    } else {
   20620       vassert(opFn != NULL);
   20621       res = swapArgs ? opFn(tSR, tSL) : opFn(tSL, tSR);
   20622    }
   20623 
   20624    putYMMRegLoAndZU(rD, mkexpr(res));
   20625 
   20626    *uses_vvvv = True;
   20627    return delta;
   20628 }
   20629 
   20630 
   20631 /* Handle a VEX_NDS_128_66_0F_WIG (3-addr) insn, with a simple IROp
   20632    for the operation, no inversion of the left arg, and no swapping of
   20633    args. */
   20634 static
   20635 Long dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple (
   20636         /*OUT*/Bool* uses_vvvv, VexAbiInfo* vbi,
   20637         Prefix pfx, Long delta, HChar* name,
   20638         IROp op
   20639      )
   20640 {
   20641    return dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   20642              uses_vvvv, vbi, pfx, delta, name, op, NULL, False, False);
   20643 }
   20644 
   20645 
   20646 /* Handle a VEX_NDS_128_66_0F_WIG (3-addr) insn, using the given IR
   20647    generator to compute the result, no inversion of the left
   20648    arg, and no swapping of args. */
   20649 static
   20650 Long dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex (
   20651         /*OUT*/Bool* uses_vvvv, VexAbiInfo* vbi,
   20652         Prefix pfx, Long delta, HChar* name,
   20653         IRTemp(*opFn)(IRTemp,IRTemp)
   20654      )
   20655 {
   20656    return dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   20657              uses_vvvv, vbi, pfx, delta, name,
   20658              Iop_INVALID, opFn, False, False );
   20659 }
   20660 
   20661 
   20662 /* Vector by scalar shift of V by the amount specified at the bottom
   20663    of E. */
   20664 static ULong dis_AVX128_shiftV_byE ( VexAbiInfo* vbi,
   20665                                      Prefix pfx, Long delta,
   20666                                      HChar* opname, IROp op )
   20667 {
   20668    HChar   dis_buf[50];
   20669    Int     alen, size;
   20670    IRTemp  addr;
   20671    Bool    shl, shr, sar;
   20672    UChar   modrm = getUChar(delta);
   20673    UInt    rG    = gregOfRexRM(pfx,modrm);
   20674    UInt    rV    = getVexNvvvv(pfx);;
   20675    IRTemp  g0    = newTemp(Ity_V128);
   20676    IRTemp  g1    = newTemp(Ity_V128);
   20677    IRTemp  amt   = newTemp(Ity_I64);
   20678    IRTemp  amt8  = newTemp(Ity_I8);
   20679    if (epartIsReg(modrm)) {
   20680       UInt rE = eregOfRexRM(pfx,modrm);
   20681       assign( amt, getXMMRegLane64(rE, 0) );
   20682       DIP("%s %s,%s,%s\n", opname, nameXMMReg(rE),
   20683           nameXMMReg(rV), nameXMMReg(rG) );
   20684       delta++;
   20685    } else {
   20686       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   20687       assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
   20688       DIP("%s %s,%s,%s\n", opname, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
   20689       delta += alen;
   20690    }
   20691    assign( g0, getXMMReg(rV) );
   20692    assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
   20693 
   20694    shl = shr = sar = False;
   20695    size = 0;
   20696    switch (op) {
   20697       case Iop_ShlN16x8: shl = True; size = 32; break;
   20698       case Iop_ShlN32x4: shl = True; size = 32; break;
   20699       case Iop_ShlN64x2: shl = True; size = 64; break;
   20700       case Iop_SarN16x8: sar = True; size = 16; break;
   20701       case Iop_SarN32x4: sar = True; size = 32; break;
   20702       case Iop_ShrN16x8: shr = True; size = 16; break;
   20703       case Iop_ShrN32x4: shr = True; size = 32; break;
   20704       case Iop_ShrN64x2: shr = True; size = 64; break;
   20705       default: vassert(0);
   20706    }
   20707 
   20708    if (shl || shr) {
   20709      assign(
   20710         g1,
   20711         IRExpr_Mux0X(
   20712            unop(Iop_1Uto8,
   20713                 binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size))),
   20714            mkV128(0x0000),
   20715            binop(op, mkexpr(g0), mkexpr(amt8))
   20716         )
   20717      );
   20718    } else
   20719    if (sar) {
   20720      assign(
   20721         g1,
   20722         IRExpr_Mux0X(
   20723            unop(Iop_1Uto8,
   20724                 binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size))),
   20725            binop(op, mkexpr(g0), mkU8(size-1)),
   20726            binop(op, mkexpr(g0), mkexpr(amt8))
   20727         )
   20728      );
   20729    } else {
   20730       vassert(0);
   20731    }
   20732 
   20733    putYMMRegLoAndZU( rG, mkexpr(g1) );
   20734    return delta;
   20735 }
   20736 
   20737 
/* Vector by scalar shift of E into V, by an immediate byte.  Modified
   version of dis_SSE_shiftE_imm.  The XMM register E is shifted by
   the immediate and the result written to the vvvv register, with
   the upper half of the corresponding YMM register zeroed.  Returns
   the updated instruction-stream offset. */
static
Long dis_AVX128_shiftE_to_V_imm( Prefix pfx,
                                 Long delta, HChar* opname, IROp op )
{
   Bool    shl, shr, sar;
   UChar   rm   = getUChar(delta);
   IRTemp  e0   = newTemp(Ity_V128);   /* source vector (E) */
   IRTemp  e1   = newTemp(Ity_V128);   /* shifted result */
   UInt    rD   = getVexNvvvv(pfx);    /* destination is the vvvv reg */
   UChar   amt, size;
   /* Only the register form exists; the modrm reg field encodes the
      shift kind (2, 4 or 6). */
   vassert(epartIsReg(rm));
   vassert(gregLO3ofRM(rm) == 2
           || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
   amt = getUChar(delta+1);            /* immediate shift amount */
   delta += 2;                         /* consume modrm + imm8 */
   DIP("%s $%d,%s,%s\n", opname,
                         (Int)amt,
                         nameXMMReg(eregOfRexRM(pfx,rm)),
                         nameXMMReg(rD));
   assign( e0, getXMMReg(eregOfRexRM(pfx,rm)) );

   /* Classify the IROp; 'size' is the lane width in bits. */
   shl = shr = sar = False;
   size = 0;
   switch (op) {
      case Iop_ShlN16x8: shl = True; size = 16; break;
      case Iop_ShlN32x4: shl = True; size = 32; break;
      case Iop_ShlN64x2: shl = True; size = 64; break;
      case Iop_SarN16x8: sar = True; size = 16; break;
      case Iop_SarN32x4: sar = True; size = 32; break;
      case Iop_ShrN16x8: shr = True; size = 16; break;
      case Iop_ShrN32x4: shr = True; size = 32; break;
      case Iop_ShrN64x2: shr = True; size = 64; break;
      default: vassert(0);
   }

   /* The amount is known at decode time, so out-of-range shifts are
      resolved here rather than with run-time IR selection. */
   if (shl || shr) {
     /* Logical shifts by >= lane width produce zero. */
     assign( e1, amt >= size
                    ? mkV128(0x0000)
                    : binop(op, mkexpr(e0), mkU8(amt))
     );
   } else
   if (sar) {
     /* Arithmetic right shifts saturate at size-1, replicating the
        sign bit across the whole lane. */
     assign( e1, amt >= size
                    ? binop(op, mkexpr(e0), mkU8(size-1))
                    : binop(op, mkexpr(e0), mkU8(amt))
     );
   } else {
      vassert(0);
   }

   putYMMRegLoAndZU( rD, mkexpr(e1) );
   return delta;
}
   20793 
   20794 
/* Lower 64-bit lane only AVX128 binary operation:
   G[63:0]    = V[63:0] `op` E[63:0]
   G[127:64]  = V[127:64]
   G[255:128] = 0.
   The specified op must be of the 64F0x2 kind, so that it
   copies the upper half of the left operand to the result.
   Sets *uses_vvvv and returns the updated instruction-stream offset.
*/
static Long dis_AVX128_E_V_to_G_lo64 ( /*OUT*/Bool* uses_vvvv,
                                       VexAbiInfo* vbi,
                                       Prefix pfx, Long delta,
                                       HChar* opname, IROp op )
{
   HChar   dis_buf[50];
   Int     alen;
   IRTemp  addr;
   UChar   rm    = getUChar(delta);
   UInt    rG    = gregOfRexRM(pfx,rm);
   UInt    rV    = getVexNvvvv(pfx);
   IRExpr* vpart = getXMMReg(rV);      /* left operand, from vvvv */
   if (epartIsReg(rm)) {
      /* Register form: E is a full XMM register. */
      UInt rE = eregOfRexRM(pfx,rm);
      putXMMReg( rG, binop(op, vpart, getXMMReg(rE)) );
      DIP("%s %s,%s,%s\n", opname,
          nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
      delta = delta+1;
   } else {
      /* We can only do a 64-bit memory read, so the upper half of the
         E operand needs to be made simply of zeroes. */
      IRTemp epart = newTemp(Ity_V128);
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( epart, unop( Iop_64UtoV128,
                           loadLE(Ity_I64, mkexpr(addr))) );
      putXMMReg( rG, binop(op, vpart, mkexpr(epart)) );
      DIP("%s %s,%s,%s\n", opname,
          dis_buf, nameXMMReg(rV), nameXMMReg(rG));
      delta = delta+alen;
   }
   /* VEX semantics: zero the upper 128 bits of the YMM register. */
   putYMMRegLane128( rG, 1, mkV128(0) );
   *uses_vvvv = True;
   return delta;
}
   20836 
   20837 
/* Lower 64-bit lane only AVX128 unary operation:
   G[63:0]    = op(E[63:0])
   G[127:64]  = V[127:64]
   G[255:128] = 0
   The specified op must be of the 64F0x2 kind, so that it
   copies the upper half of the operand to the result.
   Sets *uses_vvvv and returns the updated instruction-stream offset.
*/
static Long dis_AVX128_E_V_to_G_lo64_unary ( /*OUT*/Bool* uses_vvvv,
                                             VexAbiInfo* vbi,
                                             Prefix pfx, Long delta,
                                             HChar* opname, IROp op )
{
   HChar   dis_buf[50];
   Int     alen;
   IRTemp  addr;
   UChar   rm  = getUChar(delta);
   UInt    rG  = gregOfRexRM(pfx,rm);
   UInt    rV  = getVexNvvvv(pfx);
   IRTemp  e64 = newTemp(Ity_I64);     /* E's low 64 bits */

   /* Fetch E[63:0] */
   if (epartIsReg(rm)) {
      UInt rE = eregOfRexRM(pfx,rm);
      assign(e64, getXMMRegLane64(rE, 0));
      DIP("%s %s,%s,%s\n", opname,
          nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
      delta += 1;
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign(e64, loadLE(Ity_I64, mkexpr(addr)));
      DIP("%s %s,%s,%s\n", opname,
          dis_buf, nameXMMReg(rV), nameXMMReg(rG));
      delta += alen;
   }

   /* Create a value 'arg' as V[127:64]++E[63:0] */
   IRTemp arg = newTemp(Ity_V128);
   assign(arg,
          binop(Iop_SetV128lo64,
                getXMMReg(rV), mkexpr(e64)));
   /* and apply op to it; the 64F0x2-kind op leaves arg's upper half
      (which came from V) in place. */
   putYMMRegLoAndZU( rG, unop(op, mkexpr(arg)) );
   *uses_vvvv = True;
   return delta;
}
   20883 
   20884 
/* Lower 32-bit lane only AVX128 unary operation:
   G[31:0]    = op(E[31:0])
   G[127:32]  = V[127:32]
   G[255:128] = 0
   The specified op must be of the 32F0x4 kind, so that it
   copies the upper 3/4 of the operand to the result.
   Sets *uses_vvvv and returns the updated instruction-stream offset.
*/
static Long dis_AVX128_E_V_to_G_lo32_unary ( /*OUT*/Bool* uses_vvvv,
                                             VexAbiInfo* vbi,
                                             Prefix pfx, Long delta,
                                             HChar* opname, IROp op )
{
   HChar   dis_buf[50];
   Int     alen;
   IRTemp  addr;
   UChar   rm  = getUChar(delta);
   UInt    rG  = gregOfRexRM(pfx,rm);
   UInt    rV  = getVexNvvvv(pfx);
   IRTemp  e32 = newTemp(Ity_I32);     /* E's low 32 bits */

   /* Fetch E[31:0] */
   if (epartIsReg(rm)) {
      UInt rE = eregOfRexRM(pfx,rm);
      assign(e32, getXMMRegLane32(rE, 0));
      DIP("%s %s,%s,%s\n", opname,
          nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
      delta += 1;
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign(e32, loadLE(Ity_I32, mkexpr(addr)));
      DIP("%s %s,%s,%s\n", opname,
          dis_buf, nameXMMReg(rV), nameXMMReg(rG));
      delta += alen;
   }

   /* Create a value 'arg' as V[127:32]++E[31:0] */
   IRTemp arg = newTemp(Ity_V128);
   assign(arg,
          binop(Iop_SetV128lo32,
                getXMMReg(rV), mkexpr(e32)));
   /* and apply op to it; the 32F0x4-kind op leaves arg's upper 3/4
      (which came from V) in place. */
   putYMMRegLoAndZU( rG, unop(op, mkexpr(arg)) );
   *uses_vvvv = True;
   return delta;
}
   20930 
   20931 
/* Lower 32-bit lane only AVX128 binary operation:
   G[31:0]    = V[31:0] `op` E[31:0]
   G[127:32]  = V[127:32]
   G[255:128] = 0.
   The specified op must be of the 32F0x4 kind, so that it
   copies the upper 3/4 of the left operand to the result.
   Sets *uses_vvvv and returns the updated instruction-stream offset.
*/
static Long dis_AVX128_E_V_to_G_lo32 ( /*OUT*/Bool* uses_vvvv,
                                       VexAbiInfo* vbi,
                                       Prefix pfx, Long delta,
                                       HChar* opname, IROp op )
{
   HChar   dis_buf[50];
   Int     alen;
   IRTemp  addr;
   UChar   rm    = getUChar(delta);
   UInt    rG    = gregOfRexRM(pfx,rm);
   UInt    rV    = getVexNvvvv(pfx);
   IRExpr* vpart = getXMMReg(rV);      /* left operand, from vvvv */
   if (epartIsReg(rm)) {
      /* Register form: E is a full XMM register. */
      UInt rE = eregOfRexRM(pfx,rm);
      putXMMReg( rG, binop(op, vpart, getXMMReg(rE)) );
      DIP("%s %s,%s,%s\n", opname,
          nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
      delta = delta+1;
   } else {
      /* We can only do a 32-bit memory read, so the upper 3/4 of the
         E operand needs to be made simply of zeroes. */
      IRTemp epart = newTemp(Ity_V128);
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( epart, unop( Iop_32UtoV128,
                           loadLE(Ity_I32, mkexpr(addr))) );
      putXMMReg( rG, binop(op, vpart, mkexpr(epart)) );
      DIP("%s %s,%s,%s\n", opname,
          dis_buf, nameXMMReg(rV), nameXMMReg(rG));
      delta = delta+alen;
   }
   /* VEX semantics: zero the upper 128 bits of the YMM register. */
   putYMMRegLane128( rG, 1, mkV128(0) );
   *uses_vvvv = True;
   return delta;
}
   20973 
   20974 
   20975 /* All-lanes AVX128 binary operation:
   20976    G[127:0]   = V[127:0] `op` E[127:0]
   20977    G[255:128] = 0.
   20978 */
   20979 static Long dis_AVX128_E_V_to_G ( /*OUT*/Bool* uses_vvvv,
   20980                                   VexAbiInfo* vbi,
   20981                                   Prefix pfx, Long delta,
   20982                                   HChar* opname, IROp op )
   20983 {
   20984    return dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   20985              uses_vvvv, vbi, pfx, delta, opname, op,
   20986              NULL, False/*!invertLeftArg*/, False/*!swapArgs*/
   20987    );
   20988 }
   20989 
   20990 
/* Handles AVX128 32F/64F comparisons.  A derivative of
   dis_SSEcmp_E_to_G.  It can fail, in which case it returns the
   original delta to indicate failure.
   'all_lanes' selects packed vs lowest-lane-only comparison;
   'sz' is the lane size in bytes (4 = 32F, 8 = 64F).  The imm8
   predicate byte is decoded by findSSECmpOp into an IROp plus
   optional argument pre-swap and result post-negation. */
static
Long dis_AVX128_cmp_V_E_to_G ( /*OUT*/Bool* uses_vvvv,
                               VexAbiInfo* vbi,
                               Prefix pfx, Long delta,
                               HChar* opname, Bool all_lanes, Int sz )
{
   vassert(sz == 4 || sz == 8);
   Long    deltaIN = delta;            /* saved for the failure path */
   HChar   dis_buf[50];
   Int     alen;
   UInt    imm8;
   IRTemp  addr;
   Bool    preSwap = False;
   IROp    op      = Iop_INVALID;
   Bool    postNot = False;
   IRTemp  plain   = newTemp(Ity_V128);  /* raw comparison result */
   UChar   rm      = getUChar(delta);
   UInt    rG      = gregOfRexRM(pfx, rm);
   UInt    rV      = getVexNvvvv(pfx);
   IRTemp argL     = newTemp(Ity_V128);
   IRTemp argR     = newTemp(Ity_V128);

   assign(argL, getXMMReg(rV));
   if (epartIsReg(rm)) {
      /* Register form: imm8 immediately follows the modrm byte. */
      imm8 = getUChar(delta+1);
      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
      if (!ok) return deltaIN; /* FAIL */
      UInt rE = eregOfRexRM(pfx,rm);
      assign(argR, getXMMReg(rE));
      delta += 1+1;
      DIP("%s $%d,%s,%s,%s\n",
          opname, (Int)imm8,
          nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   } else {
      /* Memory form; note the extra byte (imm8) after the amode. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
      imm8 = getUChar(delta+alen);
      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
      if (!ok) return deltaIN; /* FAIL */
      /* For the scalar variants only sz bytes are read from memory;
         the rest of argR is zeroed (it doesn't reach the result). */
      assign(argR,
             all_lanes   ? loadLE(Ity_V128, mkexpr(addr))
             : sz == 8   ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
             : /*sz==4*/   unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr))));
      delta += alen+1;
      DIP("%s $%d,%s,%s,%s\n",
          opname, (Int)imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   }

   assign(plain, preSwap ? binop(op, mkexpr(argR), mkexpr(argL))
                         : binop(op, mkexpr(argL), mkexpr(argR)));

   if (all_lanes) {
      /* This is simple: just invert the result, if necessary, and
         have done. */
      if (postNot) {
         putYMMRegLoAndZU( rG, unop(Iop_NotV128, mkexpr(plain)) );
      } else {
         putYMMRegLoAndZU( rG, mkexpr(plain) );
      }
   }
   else
   if (!preSwap) {
      /* More complex.  It's a one-lane-only, hence need to possibly
         invert only that one lane.  But at least the other lanes are
         correctly "in" the result, having been copied from the left
         operand (argL). */
      if (postNot) {
         /* mkV128's arg is a 16-bit bitmask, one bit per result byte:
            0x000F selects the low 4 bytes, 0x00FF the low 8. */
         IRExpr* mask = mkV128(sz==4 ? 0x000F : 0x00FF);
         putYMMRegLoAndZU( rG, binop(Iop_XorV128, mkexpr(plain),
                                                  mask) );
      } else {
         putYMMRegLoAndZU( rG, mkexpr(plain) );
      }
   }
   else {
      /* This is the most complex case.  One-lane-only, but the args
         were swapped.  So we have to possibly invert the bottom lane,
         and (definitely) we have to copy the upper lane(s) from argL
         since, due to the swapping, what's currently there is from
         argR, which is not correct. */
      IRTemp res     = newTemp(Ity_V128);
      IRTemp mask    = newTemp(Ity_V128);
      IRTemp notMask = newTemp(Ity_V128);
      assign(mask,    mkV128(sz==4 ? 0x000F : 0x00FF));
      assign(notMask, mkV128(sz==4 ? 0xFFF0 : 0xFF00));
      if (postNot) {
         /* res = (~plain & mask) | (argL & ~mask) */
         assign(res,
                binop(Iop_OrV128,
                      binop(Iop_AndV128,
                            unop(Iop_NotV128, mkexpr(plain)),
                            mkexpr(mask)),
                      binop(Iop_AndV128, mkexpr(argL), mkexpr(notMask))));
      } else {
         /* res = (plain & mask) | (argL & ~mask) */
         assign(res,
                binop(Iop_OrV128,
                      binop(Iop_AndV128,
                            mkexpr(plain),
                            mkexpr(mask)),
                      binop(Iop_AndV128, mkexpr(argL), mkexpr(notMask))));
      }
      putYMMRegLoAndZU( rG, mkexpr(res) );
   }

   *uses_vvvv = True;
   return delta;
}
   21099 
   21100 
/* Handles AVX256 32F/64F comparisons.  A derivative of
   dis_SSEcmp_E_to_G.  It can fail, in which case it returns the
   original delta to indicate failure.  Always all-lanes; the
   256-bit comparison is done as two 128-bit halves since the
   comparison IROps are V128-sized. */
static
Long dis_AVX256_cmp_V_E_to_G ( /*OUT*/Bool* uses_vvvv,
                               VexAbiInfo* vbi,
                               Prefix pfx, Long delta,
                               HChar* opname, Int sz )
{
   vassert(sz == 4 || sz == 8);
   Long    deltaIN = delta;            /* saved for the failure path */
   HChar   dis_buf[50];
   Int     alen;
   UInt    imm8;
   IRTemp  addr;
   Bool    preSwap = False;
   IROp    op      = Iop_INVALID;
   Bool    postNot = False;
   IRTemp  plain   = newTemp(Ity_V256);
   UChar   rm      = getUChar(delta);
   UInt    rG      = gregOfRexRM(pfx, rm);
   UInt    rV      = getVexNvvvv(pfx);
   IRTemp argL     = newTemp(Ity_V256);
   IRTemp argR     = newTemp(Ity_V256);
   IRTemp argLhi   = IRTemp_INVALID;
   IRTemp argLlo   = IRTemp_INVALID;
   IRTemp argRhi   = IRTemp_INVALID;
   IRTemp argRlo   = IRTemp_INVALID;

   assign(argL, getYMMReg(rV));
   if (epartIsReg(rm)) {
      /* Register form: imm8 immediately follows the modrm byte. */
      imm8 = getUChar(delta+1);
      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8,
                             True/*all_lanes*/, sz);
      if (!ok) return deltaIN; /* FAIL */
      UInt rE = eregOfRexRM(pfx,rm);
      assign(argR, getYMMReg(rE));
      delta += 1+1;
      DIP("%s $%d,%s,%s,%s\n",
          opname, (Int)imm8,
          nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
   } else {
      /* Memory form; note the extra byte (imm8) after the amode. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
      imm8 = getUChar(delta+alen);
      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8,
                             True/*all_lanes*/, sz);
      if (!ok) return deltaIN; /* FAIL */
      assign(argR, loadLE(Ity_V256, mkexpr(addr)) );
      delta += alen+1;
      DIP("%s $%d,%s,%s,%s\n",
          opname, (Int)imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
   }

   /* NB: despite the names, after a preSwap the argL* temps hold
      halves of argR and vice versa -- the swap is folded into the
      breakup rather than into the binops below. */
   breakupV256toV128s( preSwap ? argR : argL, &argLhi, &argLlo );
   breakupV256toV128s( preSwap ? argL : argR, &argRhi, &argRlo );
   assign(plain, binop( Iop_V128HLtoV256,
                        binop(op, mkexpr(argLhi), mkexpr(argRhi)),
                        binop(op, mkexpr(argLlo), mkexpr(argRlo)) ) );

   /* This is simple: just invert the result, if necessary, and
      have done. */
   if (postNot) {
      putYMMReg( rG, unop(Iop_NotV256, mkexpr(plain)) );
   } else {
      putYMMReg( rG, mkexpr(plain) );
   }

   *uses_vvvv = True;
   return delta;
}
   21171 
   21172 
   21173 /* Handles AVX128 unary E-to-G all-lanes operations. */
   21174 static
   21175 Long dis_AVX128_E_to_G_unary ( /*OUT*/Bool* uses_vvvv,
   21176                                VexAbiInfo* vbi,
   21177                                Prefix pfx, Long delta,
   21178                                HChar* opname,
   21179                                IRTemp (*opFn)(IRTemp) )
   21180 {
   21181    HChar  dis_buf[50];
   21182    Int    alen;
   21183    IRTemp addr;
   21184    IRTemp res  = newTemp(Ity_V128);
   21185    IRTemp arg  = newTemp(Ity_V128);
   21186    UChar  rm   = getUChar(delta);
   21187    UInt   rG   = gregOfRexRM(pfx, rm);
   21188    if (epartIsReg(rm)) {
   21189       UInt rE = eregOfRexRM(pfx,rm);
   21190       assign(arg, getXMMReg(rE));
   21191       delta += 1;
   21192       DIP("%s %s,%s\n", opname, nameXMMReg(rE), nameXMMReg(rG));
   21193    } else {
   21194       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   21195       assign(arg, loadLE(Ity_V128, mkexpr(addr)));
   21196       delta += alen;
   21197       DIP("%s %s,%s\n", opname, dis_buf, nameXMMReg(rG));
   21198    }
   21199    res = opFn(arg);
   21200    putYMMRegLoAndZU( rG, mkexpr(res) );
   21201    *uses_vvvv = False;
   21202    return delta;
   21203 }
   21204 
   21205 
   21206 /* Handles AVX128 unary E-to-G all-lanes operations. */
   21207 static
   21208 Long dis_AVX128_E_to_G_unary_all ( /*OUT*/Bool* uses_vvvv,
   21209                                    VexAbiInfo* vbi,
   21210                                    Prefix pfx, Long delta,
   21211                                    HChar* opname, IROp op )
   21212 {
   21213    HChar  dis_buf[50];
   21214    Int    alen;
   21215    IRTemp addr;
   21216    IRTemp arg  = newTemp(Ity_V128);
   21217    UChar  rm   = getUChar(delta);
   21218    UInt   rG   = gregOfRexRM(pfx, rm);
   21219    if (epartIsReg(rm)) {
   21220       UInt rE = eregOfRexRM(pfx,rm);
   21221       assign(arg, getXMMReg(rE));
   21222       delta += 1;
   21223       DIP("%s %s,%s\n", opname, nameXMMReg(rE), nameXMMReg(rG));
   21224    } else {
   21225       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   21226       assign(arg, loadLE(Ity_V128, mkexpr(addr)));
   21227       delta += alen;
   21228       DIP("%s %s,%s\n", opname, dis_buf, nameXMMReg(rG));
   21229    }
   21230    putYMMRegLoAndZU( rG, unop(op, mkexpr(arg)) );
   21231    *uses_vvvv = False;
   21232    return delta;
   21233 }
   21234 
   21235 
/* FIXME: common up with the _128_ version above? */
/* Generic handler for 3-operand VEX.NDS.256 insns:
   D = SL `op` SR, where SL comes from the vvvv field and SR is E
   (register or memory).  Exactly one of 'op' (a plain binop) or
   'opFn' (an IR-generating callback) must be supplied.  SL may be
   bitwise-inverted first (invertLeftArg) and the operands may be
   exchanged (swapArgs).  Sets *uses_vvvv and returns the updated
   instruction-stream offset. */
static
Long dis_VEX_NDS_256_AnySimdPfx_0F_WIG (
        /*OUT*/Bool* uses_vvvv, VexAbiInfo* vbi,
        Prefix pfx, Long delta, HChar* name,
        /* The actual operation.  Use either 'op' or 'opfn',
           but not both. */
        IROp op, IRTemp(*opFn)(IRTemp,IRTemp),
        Bool invertLeftArg,
        Bool swapArgs
     )
{
   UChar  modrm = getUChar(delta);
   UInt   rD    = gregOfRexRM(pfx, modrm);
   UInt   rSL   = getVexNvvvv(pfx);
   IRTemp tSL   = newTemp(Ity_V256);   /* left operand (vvvv) */
   IRTemp tSR   = newTemp(Ity_V256);   /* right operand (E) */
   IRTemp addr  = IRTemp_INVALID;
   HChar  dis_buf[50];
   Int    alen  = 0;
   /* Caller must only route 256-bit, W-ignored encodings here. */
   vassert(1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*WIG?*/);

   assign(tSL, invertLeftArg ? unop(Iop_NotV256, getYMMReg(rSL))
                             : getYMMReg(rSL));

   if (epartIsReg(modrm)) {
      UInt rSR = eregOfRexRM(pfx, modrm);
      delta += 1;
      assign(tSR, getYMMReg(rSR));
      DIP("%s %s,%s,%s\n",
          name, nameYMMReg(rSR), nameYMMReg(rSL), nameYMMReg(rD));
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      delta += alen;
      assign(tSR, loadLE(Ity_V256, mkexpr(addr)));
      DIP("%s %s,%s,%s\n",
          name, dis_buf, nameYMMReg(rSL), nameYMMReg(rD));
   }

   /* Apply the operation, via whichever of op/opFn was supplied. */
   IRTemp res = IRTemp_INVALID;
   if (op != Iop_INVALID) {
      vassert(opFn == NULL);
      res = newTemp(Ity_V256);
      assign(res, swapArgs ? binop(op, mkexpr(tSR), mkexpr(tSL))
                           : binop(op, mkexpr(tSL), mkexpr(tSR)));
   } else {
      vassert(opFn != NULL);
      res = swapArgs ? opFn(tSR, tSL) : opFn(tSL, tSR);
   }

   putYMMReg(rD, mkexpr(res));

   *uses_vvvv = True;
   return delta;
}
   21291 
   21292 
   21293 /* All-lanes AVX256 binary operation:
   21294    G[255:0] = V[255:0] `op` E[255:0]
   21295 */
   21296 static Long dis_AVX256_E_V_to_G ( /*OUT*/Bool* uses_vvvv,
   21297                                   VexAbiInfo* vbi,
   21298                                   Prefix pfx, Long delta,
   21299                                   HChar* opname, IROp op )
   21300 {
   21301    return dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
   21302              uses_vvvv, vbi, pfx, delta, opname, op,
   21303              NULL, False/*!invertLeftArg*/, False/*!swapArgs*/
   21304    );
   21305 }
   21306 
   21307 
   21308 /* Handle a VEX_NDS_256_66_0F_WIG (3-addr) insn, using the given IR
   21309    generator to compute the result, no inversion of the left
   21310    arg, and no swapping of args. */
   21311 static
   21312 Long dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex (
   21313         /*OUT*/Bool* uses_vvvv, VexAbiInfo* vbi,
   21314         Prefix pfx, Long delta, HChar* name,
   21315         IRTemp(*opFn)(IRTemp,IRTemp)
   21316      )
   21317 {
   21318    return dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
   21319              uses_vvvv, vbi, pfx, delta, name,
   21320              Iop_INVALID, opFn, False, False );
   21321 }
   21322 
   21323 
   21324 /* Handles AVX256 unary E-to-G all-lanes operations. */
   21325 static
   21326 Long dis_AVX256_E_to_G_unary_all ( /*OUT*/Bool* uses_vvvv,
   21327                                    VexAbiInfo* vbi,
   21328                                    Prefix pfx, Long delta,
   21329                                    HChar* opname, IROp op )
   21330 {
   21331    HChar  dis_buf[50];
   21332    Int    alen;
   21333    IRTemp addr;
   21334    IRTemp arg  = newTemp(Ity_V256);
   21335    UChar  rm   = getUChar(delta);
   21336    UInt   rG   = gregOfRexRM(pfx, rm);
   21337    if (epartIsReg(rm)) {
   21338       UInt rE = eregOfRexRM(pfx,rm);
   21339       assign(arg, getYMMReg(rE));
   21340       delta += 1;
   21341       DIP("%s %s,%s\n", opname, nameYMMReg(rE), nameYMMReg(rG));
   21342    } else {
   21343       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   21344       assign(arg, loadLE(Ity_V256, mkexpr(addr)));
   21345       delta += alen;
   21346       DIP("%s %s,%s\n", opname, dis_buf, nameYMMReg(rG));
   21347    }
   21348    putYMMReg( rG, unop(op, mkexpr(arg)) );
   21349    *uses_vvvv = False;
   21350    return delta;
   21351 }
   21352 
   21353 
/* The use of ReinterpF64asI64 is ugly.  Surely could do better if we
   had a variant of Iop_64x4toV256 that took F64s as args instead. */
/* VCVTDQ2PD (256-bit form): converts four I32s from the XMM/mem
   source into four F64s in the YMM destination. */
static Long dis_CVTDQ2PD_256 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp sV    = newTemp(Ity_V128);   /* four I32 source lanes */
   UInt   rG    = gregOfRexRM(pfx,modrm);
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( sV, getXMMReg(rE) );
      delta += 1;
      DIP("vcvtdq2pd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      DIP("vcvtdq2pd %s,%s\n", dis_buf, nameYMMReg(rG) );
   }
   IRTemp s3, s2, s1, s0;
   s3 = s2 = s1 = s0 = IRTemp_INVALID;
   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   /* Widen each I32 to F64 (always exact, so no rounding mode is
      needed), then reassemble the four 64-bit lanes into a V256. */
   IRExpr* res
      = IRExpr_Qop(
           Iop_64x4toV256,
           unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s3))),
           unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s2))),
           unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s1))),
           unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s0)))
        );
   putYMMReg(rG, res);
   return delta;
}
   21390 
   21391 
/* VCVTPD2PS (256-bit form): narrows four F64s from the YMM/mem
   source to four F32s in the low half of the XMM destination,
   rounding per the guest's SSE rounding mode; the destination's
   upper 128 bits are zeroed. */
static Long dis_CVTPD2PS_256 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp argV  = newTemp(Ity_V256);   /* four F64 source lanes */
   IRTemp rmode = newTemp(Ity_I32);    /* SSE rounding mode */
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getYMMReg(rE) );
      delta += 1;
      DIP("vcvtpd2psy %s,%s\n", nameYMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
      delta += alen;
      DIP("vcvtpd2psy %s,%s\n", dis_buf, nameXMMReg(rG) );
   }

   assign( rmode, get_sse_roundingmode() );
   IRTemp t3, t2, t1, t0;
   t3 = t2 = t1 = t0 = IRTemp_INVALID;
   breakupV256to64s( argV, &t3, &t2, &t1, &t0 );
   /* Narrow one 64-bit lane (viewed as F64) to F32, rounding. */
#  define CVT(_t)  binop( Iop_F64toF32, mkexpr(rmode), \
                          unop(Iop_ReinterpI64asF64, mkexpr(_t)) )
   putXMMRegLane32F( rG, 3, CVT(t3) );
   putXMMRegLane32F( rG, 2, CVT(t2) );
   putXMMRegLane32F( rG, 1, CVT(t1) );
   putXMMRegLane32F( rG, 0, CVT(t0) );
#  undef CVT
   /* Zero the upper 128 bits of the YMM register. */
   putYMMRegLane128( rG, 1, mkV128(0) );
   return delta;
}
   21428 
   21429 
   21430 __attribute__((noinline))
   21431 static
   21432 Long dis_ESC_0F__VEX (
   21433         /*MB_OUT*/DisResult* dres,
   21434         /*OUT*/   Bool*      uses_vvvv,
   21435         Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
   21436         Bool         resteerCisOk,
   21437         void*        callback_opaque,
   21438         VexArchInfo* archinfo,
   21439         VexAbiInfo*  vbi,
   21440         Prefix pfx, Int sz, Long deltaIN
   21441      )
   21442 {
   21443    IRTemp addr  = IRTemp_INVALID;
   21444    Int    alen  = 0;
   21445    HChar  dis_buf[50];
   21446    Long   delta = deltaIN;
   21447    UChar  opc   = getUChar(delta);
   21448    delta++;
   21449    *uses_vvvv = False;
   21450 
   21451    switch (opc) {
   21452 
   21453    case 0x10:
   21454       /* VMOVSD m64, xmm1 = VEX.LIG.F2.0F.WIG 10 /r */
   21455       /* Move 64 bits from E (mem only) to G (lo half xmm).
   21456          Bits 255-64 of the dest are zeroed out. */
   21457       if (haveF2no66noF3(pfx) && !epartIsReg(getUChar(delta))) {
   21458          UChar modrm = getUChar(delta);
   21459          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   21460          UInt   rG   = gregOfRexRM(pfx,modrm);
   21461          IRTemp z128 = newTemp(Ity_V128);
   21462          assign(z128, mkV128(0));
   21463          putXMMReg( rG, mkexpr(z128) );
   21464          /* FIXME: ALIGNMENT CHECK? */
   21465          putXMMRegLane64( rG, 0, loadLE(Ity_I64, mkexpr(addr)) );
   21466          putYMMRegLane128( rG, 1, mkexpr(z128) );
   21467          DIP("vmovsd %s,%s\n", dis_buf, nameXMMReg(rG));
   21468          delta += alen;
   21469          goto decode_success;
   21470       }
   21471       /* VMOVSD xmm3, xmm2, xmm1 = VEX.LIG.F2.0F.WIG 10 /r */
   21472       /* Reg form. */
   21473       if (haveF2no66noF3(pfx) && epartIsReg(getUChar(delta))) {
   21474          UChar modrm = getUChar(delta);
   21475          UInt  rG    = gregOfRexRM(pfx, modrm);
   21476          UInt  rE    = eregOfRexRM(pfx, modrm);
   21477          UInt  rV    = getVexNvvvv(pfx);
   21478          delta++;
   21479          DIP("vmovsd %s,%s,%s\n",
   21480              nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   21481          IRTemp res = newTemp(Ity_V128);
   21482          assign(res, binop(Iop_64HLtoV128,
   21483                            getXMMRegLane64(rV, 1),
   21484                            getXMMRegLane64(rE, 0)));
   21485          putYMMRegLoAndZU(rG, mkexpr(res));
   21486          *uses_vvvv = True;
   21487          goto decode_success;
   21488       }
      /* VMOVSS m32, xmm1 = VEX.LIG.F3.0F.WIG 10 /r */
      /* Move 32 bits from E (mem only) to G (lo half xmm).
         Bits 255-32 of the dest are zeroed out. */
      if (haveF3no66noF2(pfx) && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         UInt   rG   = gregOfRexRM(pfx,modrm);
         /* Zero the full low 128 bits first, then drop the loaded 32
            bits into lane 0 and zero the upper 128 bits of the YMM. */
         IRTemp z128 = newTemp(Ity_V128);
         assign(z128, mkV128(0));
         putXMMReg( rG, mkexpr(z128) );
         /* FIXME: ALIGNMENT CHECK? */
         putXMMRegLane32( rG, 0, loadLE(Ity_I32, mkexpr(addr)) );
         putYMMRegLane128( rG, 1, mkexpr(z128) );
         DIP("vmovss %s,%s\n", dis_buf, nameXMMReg(rG));
         delta += alen;
         goto decode_success;
      }
      /* VMOVSS xmm3, xmm2, xmm1 = VEX.LIG.F3.0F.WIG 10 /r */
      /* Reg form. */
      if (haveF3no66noF2(pfx) && epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         UInt  rE    = eregOfRexRM(pfx, modrm);
         UInt  rV    = getVexNvvvv(pfx);
         delta++;
         DIP("vmovss %s,%s,%s\n",
             nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
         /* Result = rV with its low 32 bits replaced by rE's low 32
            bits; the upper 128 bits of the YMM dest are zeroed. */
         IRTemp res = newTemp(Ity_V128);
         assign( res, binop( Iop_64HLtoV128,
                             getXMMRegLane64(rV, 1),
                             binop(Iop_32HLto64,
                                   getXMMRegLane32(rV, 1),
                                   getXMMRegLane32(rE, 0)) ) );
         putYMMRegLoAndZU(rG, mkexpr(res));
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VMOVUPD xmm2/m128, xmm1 = VEX.128.66.0F.WIG 10 /r */
      /* Unaligned 128-bit load/move; no alignment check is needed. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            putYMMRegLoAndZU( rG, getXMMReg( rE ));
            DIP("vmovupd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
            DIP("vmovupd %s,%s\n", dis_buf, nameXMMReg(rG));
            delta += alen;
         }
         goto decode_success;
      }
      /* VMOVUPD ymm2/m256, ymm1 = VEX.256.66.0F.WIG 10 /r */
      /* Unaligned 256-bit load/move. */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            putYMMReg( rG, getYMMReg( rE ));
            DIP("vmovupd %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
            DIP("vmovupd %s,%s\n", dis_buf, nameYMMReg(rG));
            delta += alen;
         }
         goto decode_success;
      }
      /* VMOVUPS xmm2/m128, xmm1 = VEX.128.0F.WIG 10 /r */
      /* Same as VMOVUPD 128 but with no prefix (PS variant). */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            putYMMRegLoAndZU( rG, getXMMReg( rE ));
            DIP("vmovups %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
            DIP("vmovups %s,%s\n", dis_buf, nameXMMReg(rG));
            delta += alen;
         }
         goto decode_success;
      }
      /* VMOVUPS ymm2/m256, ymm1 = VEX.256.0F.WIG 10 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            putYMMReg( rG, getYMMReg( rE ));
            DIP("vmovups %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
            DIP("vmovups %s,%s\n", dis_buf, nameYMMReg(rG));
            delta += alen;
         }
         goto decode_success;
      }
      break;
   21595 
   case 0x11:
      /* Opcode 0x11: the store-direction counterparts of 0x10
         (G -> E instead of E -> G). */
      /* VMOVSD xmm1, m64 = VEX.LIG.F2.0F.WIG 11 /r */
      /* Move 64 bits from G (low half xmm) to mem only. */
      if (haveF2no66noF3(pfx) && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         UInt   rG   = gregOfRexRM(pfx,modrm);
         /* FIXME: ALIGNMENT CHECK? */
         storeLE( mkexpr(addr), getXMMRegLane64(rG, 0));
         DIP("vmovsd %s,%s\n", nameXMMReg(rG), dis_buf);
         delta += alen;
         goto decode_success;
      }
      /* VMOVSD xmm3, xmm2, xmm1 = VEX.LIG.F2.0F.WIG 11 /r */
      /* Reg form. */
      if (haveF2no66noF3(pfx) && epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         UInt  rE    = eregOfRexRM(pfx, modrm);
         UInt  rV    = getVexNvvvv(pfx);
         delta++;
         DIP("vmovsd %s,%s,%s\n",
             nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
         /* Result = rV's hi 64 over rE's lo 64; upper 128 of the
            YMM dest are zeroed. */
         IRTemp res = newTemp(Ity_V128);
         assign(res, binop(Iop_64HLtoV128,
                           getXMMRegLane64(rV, 1),
                           getXMMRegLane64(rE, 0)));
         putYMMRegLoAndZU(rG, mkexpr(res));
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VMOVSS xmm1, m32 = VEX.LIG.F3.0F.WIG 11 /r */
      /* Move 32 bits from G (low 1/4 xmm) to mem only. */
      if (haveF3no66noF2(pfx) && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         UInt   rG   = gregOfRexRM(pfx,modrm);
         /* FIXME: ALIGNMENT CHECK? */
         storeLE( mkexpr(addr), getXMMRegLane32(rG, 0));
         DIP("vmovss %s,%s\n", nameXMMReg(rG), dis_buf);
         delta += alen;
         goto decode_success;
      }
      /* VMOVSS xmm3, xmm2, xmm1 = VEX.LIG.F3.0F.WIG 11 /r */
      /* Reg form. */
      if (haveF3no66noF2(pfx) && epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         UInt  rE    = eregOfRexRM(pfx, modrm);
         UInt  rV    = getVexNvvvv(pfx);
         delta++;
         DIP("vmovss %s,%s,%s\n",
             nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
         /* Result = rV with its low 32 bits replaced by rE's low
            32 bits; upper 128 of the YMM dest are zeroed. */
         IRTemp res = newTemp(Ity_V128);
         assign( res, binop( Iop_64HLtoV128,
                             getXMMRegLane64(rV, 1),
                             binop(Iop_32HLto64,
                                   getXMMRegLane32(rV, 1),
                                   getXMMRegLane32(rE, 0)) ) );
         putYMMRegLoAndZU(rG, mkexpr(res));
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VMOVUPD xmm1, xmm2/m128 = VEX.128.66.0F.WIG 11 /r */
      /* Unaligned 128-bit store/move; no alignment check. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            putYMMRegLoAndZU( rE, getXMMReg(rG) );
            DIP("vmovupd %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            storeLE( mkexpr(addr), getXMMReg(rG) );
            DIP("vmovupd %s,%s\n", nameXMMReg(rG), dis_buf);
            delta += alen;
         }
         goto decode_success;
      }
      /* VMOVUPD ymm1, ymm2/m256 = VEX.256.66.0F.WIG 11 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            putYMMReg( rE, getYMMReg(rG) );
            DIP("vmovupd %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            storeLE( mkexpr(addr), getYMMReg(rG) );
            DIP("vmovupd %s,%s\n", nameYMMReg(rG), dis_buf);
            delta += alen;
         }
         goto decode_success;
      }
      /* VMOVUPS xmm1, xmm2/m128 = VEX.128.0F.WIG 11 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            putYMMRegLoAndZU( rE, getXMMReg(rG) );
            DIP("vmovups %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            storeLE( mkexpr(addr), getXMMReg(rG) );
            DIP("vmovups %s,%s\n", nameXMMReg(rG), dis_buf);
            delta += alen;
         }
         goto decode_success;
      }
      /* VMOVUPS ymm1, ymm2/m256 = VEX.256.0F.WIG 11 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            putYMMReg( rE, getYMMReg(rG) );
            DIP("vmovups %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            storeLE( mkexpr(addr), getYMMReg(rG) );
            DIP("vmovups %s,%s\n", nameYMMReg(rG), dis_buf);
            delta += alen;
         }
         goto decode_success;
      }
      break;
   21728 
   21729    case 0x12:
   21730       /* VMOVDDUP xmm2/m64, xmm1 = VEX.128.F2.0F.WIG /12 r */
   21731       if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   21732          delta = dis_MOVDDUP_128( vbi, pfx, delta, True/*isAvx*/ );
   21733          goto decode_success;
   21734       }
   21735       /* VMOVDDUP ymm2/m256, ymm1 = VEX.256.F2.0F.WIG /12 r */
   21736       if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   21737          delta = dis_MOVDDUP_256( vbi, pfx, delta );
   21738          goto decode_success;
   21739       }
   21740       /* VMOVHLPS xmm3, xmm2, xmm1 = VEX.NDS.128.0F.WIG 12 /r */
   21741       /* Insn only exists in reg form */
   21742       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
   21743           && epartIsReg(getUChar(delta))) {
   21744          UChar modrm = getUChar(delta);
   21745          UInt  rG    = gregOfRexRM(pfx, modrm);
   21746          UInt  rE    = eregOfRexRM(pfx, modrm);
   21747          UInt  rV    = getVexNvvvv(pfx);
   21748          delta++;
   21749          DIP("vmovhlps %s,%s,%s\n",
   21750              nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   21751          IRTemp res = newTemp(Ity_V128);
   21752          assign(res, binop(Iop_64HLtoV128,
   21753                            getXMMRegLane64(rV, 1),
   21754                            getXMMRegLane64(rE, 1)));
   21755          putYMMRegLoAndZU(rG, mkexpr(res));
   21756          *uses_vvvv = True;
   21757          goto decode_success;
   21758       }
   21759       /* VMOVLPS m64, xmm1, xmm2 = VEX.NDS.128.0F.WIG 12 /r */
   21760       /* Insn exists only in mem form, it appears. */
   21761       /* VMOVLPD m64, xmm1, xmm2 = VEX.NDS.128.66.0F.WIG 12 /r */
   21762       /* Insn exists only in mem form, it appears. */
   21763       if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
   21764           && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
   21765          UChar modrm = getUChar(delta);
   21766          UInt  rG    = gregOfRexRM(pfx, modrm);
   21767          UInt  rV    = getVexNvvvv(pfx);
   21768          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   21769          delta += alen;
   21770          DIP("vmovlpd %s,%s,%s\n",
   21771              dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   21772          IRTemp res = newTemp(Ity_V128);
   21773          assign(res, binop(Iop_64HLtoV128,
   21774                            getXMMRegLane64(rV, 1),
   21775                            loadLE(Ity_I64, mkexpr(addr))));
   21776          putYMMRegLoAndZU(rG, mkexpr(res));
   21777          *uses_vvvv = True;
   21778          goto decode_success;
   21779       }
   21780       /* VMOVSLDUP xmm2/m128, xmm1 = VEX.NDS.128.F3.0F.WIG 12 /r */
   21781       if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
   21782          delta = dis_MOVSxDUP_128( vbi, pfx, delta, True/*isAvx*/,
   21783                                    True/*isL*/ );
   21784          goto decode_success;
   21785       }
   21786       /* VMOVSLDUP ymm2/m256, ymm1 = VEX.NDS.256.F3.0F.WIG 12 /r */
   21787       if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
   21788          delta = dis_MOVSxDUP_256( vbi, pfx, delta, True/*isL*/ );
   21789          goto decode_success;
   21790       }
   21791       break;
   21792 
   21793    case 0x13:
   21794       /* VMOVLPS xmm1, m64 = VEX.128.0F.WIG 13 /r */
   21795       /* Insn exists only in mem form, it appears. */
   21796       /* VMOVLPD xmm1, m64 = VEX.128.66.0F.WIG 13 /r */
   21797       /* Insn exists only in mem form, it appears. */
   21798       if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
   21799           && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
   21800          UChar modrm = getUChar(delta);
   21801          UInt  rG    = gregOfRexRM(pfx, modrm);
   21802          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   21803          delta += alen;
   21804          storeLE( mkexpr(addr), getXMMRegLane64( rG, 0));
   21805          DIP("vmovlpd %s,%s\n", nameXMMReg(rG), dis_buf);
   21806          goto decode_success;
   21807       }
   21808       break;
   21809 
   case 0x14:
   case 0x15:
      /* VUNPCK{L,H}P{S,D}: opcode 0x15 selects the "high" variant;
         the actual interleaving is done by the math_UNPCKxP{S,D}
         helpers.  128-bit forms zero the upper half of the YMM dest. */
      /* VUNPCKLPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 14 /r */
      /* VUNPCKHPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 15 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Bool   hi    = opc == 0x15;
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx,modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp eV    = newTemp(Ity_V128);
         IRTemp vV    = newTemp(Ity_V128);
         assign( vV, getXMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getXMMReg(rE) );
            delta += 1;
            DIP("vunpck%sps %s,%s\n", hi ? "h" : "l",
                nameXMMReg(rE), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
            delta += alen;
            DIP("vunpck%sps %s,%s\n", hi ? "h" : "l",
                dis_buf, nameXMMReg(rG));
         }
         IRTemp res = math_UNPCKxPS_128( eV, vV, hi );
         putYMMRegLoAndZU( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VUNPCKLPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 14 /r */
      /* VUNPCKHPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 15 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         Bool   hi    = opc == 0x15;
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx,modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp eV    = newTemp(Ity_V256);
         IRTemp vV    = newTemp(Ity_V256);
         assign( vV, getYMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getYMMReg(rE) );
            delta += 1;
            DIP("vunpck%sps %s,%s\n", hi ? "h" : "l",
                nameYMMReg(rE), nameYMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
            delta += alen;
            DIP("vunpck%sps %s,%s\n", hi ? "h" : "l",
                dis_buf, nameYMMReg(rG));
         }
         IRTemp res = math_UNPCKxPS_256( eV, vV, hi );
         putYMMReg( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VUNPCKLPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 14 /r */
      /* VUNPCKHPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 15 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Bool   hi    = opc == 0x15;
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx,modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp eV    = newTemp(Ity_V128);
         IRTemp vV    = newTemp(Ity_V128);
         assign( vV, getXMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getXMMReg(rE) );
            delta += 1;
            DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
                nameXMMReg(rE), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
            delta += alen;
            DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
                dis_buf, nameXMMReg(rG));
         }
         IRTemp res = math_UNPCKxPD_128( eV, vV, hi );
         putYMMRegLoAndZU( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VUNPCKLPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 14 /r */
      /* VUNPCKHPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 15 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         Bool   hi    = opc == 0x15;
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx,modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp eV    = newTemp(Ity_V256);
         IRTemp vV    = newTemp(Ity_V256);
         assign( vV, getYMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getYMMReg(rE) );
            delta += 1;
            DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
                nameYMMReg(rE), nameYMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
            delta += alen;
            DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
                dis_buf, nameYMMReg(rG));
         }
         IRTemp res = math_UNPCKxPD_256( eV, vV, hi );
         putYMMReg( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   21925 
   case 0x16:
      /* VMOVLHPS xmm3, xmm2, xmm1 = VEX.NDS.128.0F.WIG 16 /r */
      /* Insn only exists in reg form */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         UInt  rE    = eregOfRexRM(pfx, modrm);
         UInt  rV    = getVexNvvvv(pfx);
         delta++;
         DIP("vmovlhps %s,%s,%s\n",
             nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
         /* dest lo64 = rV's lo64, dest hi64 = rE's lo64; upper 128
            of the YMM dest are zeroed. */
         IRTemp res = newTemp(Ity_V128);
         assign(res, binop(Iop_64HLtoV128,
                           getXMMRegLane64(rE, 0),
                           getXMMRegLane64(rV, 0)));
         putYMMRegLoAndZU(rG, mkexpr(res));
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VMOVHPS m64, xmm1, xmm2 = VEX.NDS.128.0F.WIG 16 /r */
      /* Insn exists only in mem form, it appears. */
      /* VMOVHPD m64, xmm1, xmm2 = VEX.NDS.128.66.0F.WIG 16 /r */
      /* Insn exists only in mem form, it appears. */
      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
          && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         UInt  rV    = getVexNvvvv(pfx);
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         DIP("vmovhp%c %s,%s,%s\n", have66(pfx) ? 'd' : 's',
             dis_buf, nameXMMReg(rV), nameXMMReg(rG));
         /* dest lo64 = rV's lo64, dest hi64 = loaded m64. */
         IRTemp res = newTemp(Ity_V128);
         assign(res, binop(Iop_64HLtoV128,
                           loadLE(Ity_I64, mkexpr(addr)),
                           getXMMRegLane64(rV, 0)));
         putYMMRegLoAndZU(rG, mkexpr(res));
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VMOVSHDUP xmm2/m128, xmm1 = VEX.NDS.128.F3.0F.WIG 16 /r */
      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_MOVSxDUP_128( vbi, pfx, delta, True/*isAvx*/,
                                   False/*!isL*/ );
         goto decode_success;
      }
      /* VMOVSHDUP ymm2/m256, ymm1 = VEX.NDS.256.F3.0F.WIG 16 /r */
      if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_MOVSxDUP_256( vbi, pfx, delta, False/*!isL*/ );
         goto decode_success;
      }
      break;
   21979 
   21980    case 0x17:
   21981       /* VMOVHPS xmm1, m64 = VEX.128.0F.WIG 17 /r */
   21982       /* Insn exists only in mem form, it appears. */
   21983       /* VMOVHPD xmm1, m64 = VEX.128.66.0F.WIG 17 /r */
   21984       /* Insn exists only in mem form, it appears. */
   21985       if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
   21986           && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
   21987          UChar modrm = getUChar(delta);
   21988          UInt  rG    = gregOfRexRM(pfx, modrm);
   21989          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   21990          delta += alen;
   21991          storeLE( mkexpr(addr), getXMMRegLane64( rG, 1));
   21992          DIP("vmovhp%c %s,%s\n", have66(pfx) ? 'd' : 's',
   21993              nameXMMReg(rG), dis_buf);
   21994          goto decode_success;
   21995       }
   21996       break;
   21997 
   case 0x28:
      /* Aligned load/move forms, E -> G.  The memory forms generate
         a SEGV if the address is not 16- (128-bit) or 32- (256-bit)
         byte aligned; the reg-reg forms need no check. */
      /* VMOVAPD xmm2/m128, xmm1 = VEX.128.66.0F.WIG 28 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            putYMMRegLoAndZU( rG, getXMMReg( rE ));
            DIP("vmovapd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
            DIP("vmovapd %s,%s\n", dis_buf, nameXMMReg(rG));
            delta += alen;
         }
         goto decode_success;
      }
      /* VMOVAPD ymm2/m256, ymm1 = VEX.256.66.0F.WIG 28 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            putYMMReg( rG, getYMMReg( rE ));
            DIP("vmovapd %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_32_aligned( addr );
            putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
            DIP("vmovapd %s,%s\n", dis_buf, nameYMMReg(rG));
            delta += alen;
         }
         goto decode_success;
      }
      /* VMOVAPS xmm2/m128, xmm1 = VEX.128.0F.WIG 28 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            putYMMRegLoAndZU( rG, getXMMReg( rE ));
            DIP("vmovaps %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
            DIP("vmovaps %s,%s\n", dis_buf, nameXMMReg(rG));
            delta += alen;
         }
         goto decode_success;
      }
      /* VMOVAPS ymm2/m256, ymm1 = VEX.256.0F.WIG 28 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            putYMMReg( rG, getYMMReg( rE ));
            DIP("vmovaps %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_32_aligned( addr );
            putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
            DIP("vmovaps %s,%s\n", dis_buf, nameYMMReg(rG));
            delta += alen;
         }
         goto decode_success;
      }
      break;
   22072 
   22073    case 0x29:
   22074       /* VMOVAPD xmm1, xmm2/m128 = VEX.128.66.0F.WIG 29 /r */
   22075       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   22076          UChar modrm = getUChar(delta);
   22077          UInt  rG    = gregOfRexRM(pfx,modrm);
   22078          if (epartIsReg(modrm)) {
   22079             UInt rE = eregOfRexRM(pfx,modrm);
   22080             putYMMRegLoAndZU( rE, getXMMReg(rG) );
   22081             DIP("vmovapd %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
   22082             delta += 1;
   22083          } else {
   22084             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   22085             gen_SEGV_if_not_16_aligned( addr );
   22086             storeLE( mkexpr(addr), getXMMReg(rG) );
   22087             DIP("vmovapd %s,%s\n", nameXMMReg(rG), dis_buf );
   22088             delta += alen;
   22089          }
   22090          goto decode_success;
   22091       }
   22092       /* VMOVAPD ymm1, ymm2/m256 = VEX.256.66.0F.WIG 29 /r */
   22093       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   22094          UChar modrm = getUChar(delta);
   22095          UInt  rG    = gregOfRexRM(pfx,modrm);
   22096          if (epartIsReg(modrm)) {
   22097             UInt rE = eregOfRexRM(pfx,modrm);
   22098             putYMMReg( rE, getYMMReg(rG) );
   22099             DIP("vmovapd %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
   22100             delta += 1;
   22101          } else {
   22102             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   22103             gen_SEGV_if_not_32_aligned( addr );
   22104             storeLE( mkexpr(addr), getYMMReg(rG) );
   22105             DIP("vmovapd %s,%s\n", nameYMMReg(rG), dis_buf );
   22106             delta += alen;
   22107          }
   22108          goto decode_success;
   22109       }
   22110       /* VMOVAPS xmm1, xmm2/m128 = VEX.128.0F.WIG 29 /r */
   22111       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   22112          UChar modrm = getUChar(delta);
   22113          UInt  rG    = gregOfRexRM(pfx,modrm);
   22114          if (epartIsReg(modrm)) {
   22115             UInt rE = eregOfRexRM(pfx,modrm);
   22116             putYMMRegLoAndZU( rE, getXMMReg(rG) );
   22117             DIP("vmovaps %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
   22118             delta += 1;
   22119             goto decode_success;
   22120          } else {
   22121             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   22122             gen_SEGV_if_not_16_aligned( addr );
   22123             storeLE( mkexpr(addr), getXMMReg(rG) );
   22124             DIP("vmovaps %s,%s\n", nameXMMReg(rG), dis_buf );
   22125             delta += alen;
   22126             goto decode_success;
   22127          }
   22128       }
   22129       /* VMOVAPS ymm1, ymm2/m256 = VEX.256.0F.WIG 29 /r */
   22130       if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   22131          UChar modrm = getUChar(delta);
   22132          UInt  rG    = gregOfRexRM(pfx,modrm);
   22133          if (epartIsReg(modrm)) {
   22134             UInt rE = eregOfRexRM(pfx,modrm);
   22135             putYMMReg( rE, getYMMReg(rG) );
   22136             DIP("vmovaps %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
   22137             delta += 1;
   22138             goto decode_success;
   22139          } else {
   22140             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   22141             gen_SEGV_if_not_32_aligned( addr );
   22142             storeLE( mkexpr(addr), getYMMReg(rG) );
   22143             DIP("vmovaps %s,%s\n", nameYMMReg(rG), dis_buf );
   22144             delta += alen;
   22145             goto decode_success;
   22146          }
   22147       }
   22148       break;
   22149 
   case 0x2A: {
      /* VCVTSI2SD / VCVTSI2SS: convert a 32- or 64-bit signed integer
         (reg or mem) to a scalar double/single in the low lane of the
         destination; upper source lanes come from vvvv, and the upper
         128 bits of the destination YMM are zeroed. */
      IRTemp rmode = newTemp(Ity_I32);
      assign( rmode, get_sse_roundingmode() );
      /* VCVTSI2SD r/m32, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.W0 2A /r */
      if (haveF2no66noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   rV    = getVexNvvvv(pfx);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp arg32 = newTemp(Ity_I32);
         if (epartIsReg(modrm)) {
            UInt rS = eregOfRexRM(pfx,modrm);
            assign( arg32, getIReg32(rS) );
            delta += 1;
            DIP("vcvtsi2sdl %s,%s,%s\n",
                nameIReg32(rS), nameXMMReg(rV), nameXMMReg(rD));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
            delta += alen;
            DIP("vcvtsi2sdl %s,%s,%s\n",
                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
         }
         /* I32 -> F64 is exact, hence no rounding mode is needed. */
         putXMMRegLane64F( rD, 0,
                           unop(Iop_I32StoF64, mkexpr(arg32)));
         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
         putYMMRegLane128( rD, 1, mkV128(0) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VCVTSI2SD r/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.W1 2A /r */
      if (haveF2no66noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
         UChar  modrm = getUChar(delta);
         UInt   rV    = getVexNvvvv(pfx);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp arg64 = newTemp(Ity_I64);
         if (epartIsReg(modrm)) {
            UInt rS = eregOfRexRM(pfx,modrm);
            assign( arg64, getIReg64(rS) );
            delta += 1;
            DIP("vcvtsi2sdq %s,%s,%s\n",
                nameIReg64(rS), nameXMMReg(rV), nameXMMReg(rD));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
            delta += alen;
            DIP("vcvtsi2sdq %s,%s,%s\n",
                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
         }
         /* I64 -> F64 can be inexact, so the SSE rounding mode is
            supplied.  NOTE(review): this re-reads the rounding mode
            rather than using mkexpr(rmode); equivalent here since
            nothing intervening writes guest state. */
         putXMMRegLane64F( rD, 0,
                           binop( Iop_I64StoF64,
                                  get_sse_roundingmode(),
                                  mkexpr(arg64)) );
         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
         putYMMRegLane128( rD, 1, mkV128(0) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VCVTSI2SS r/m64, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.W1 2A /r */
      if (haveF3no66noF2(pfx) && 1==getRexW(pfx)/*W1*/) {
         UChar  modrm = getUChar(delta);
         UInt   rV    = getVexNvvvv(pfx);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp arg64 = newTemp(Ity_I64);
         if (epartIsReg(modrm)) {
            UInt rS = eregOfRexRM(pfx,modrm);
            assign( arg64, getIReg64(rS) );
            delta += 1;
            DIP("vcvtsi2ssq %s,%s,%s\n",
                nameIReg64(rS), nameXMMReg(rV), nameXMMReg(rD));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
            delta += alen;
            DIP("vcvtsi2ssq %s,%s,%s\n",
                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
         }
         /* Convert I64 -> F64 -> F32, rounding at both steps. */
         putXMMRegLane32F( rD, 0,
                           binop(Iop_F64toF32,
                                 mkexpr(rmode),
                                 binop(Iop_I64StoF64, mkexpr(rmode),
                                                      mkexpr(arg64)) ) );
         putXMMRegLane32( rD, 1, getXMMRegLane32( rV, 1 ));
         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
         putYMMRegLane128( rD, 1, mkV128(0) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VCVTSI2SS r/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.W0 2A /r */
      if (haveF3no66noF2(pfx) && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   rV    = getVexNvvvv(pfx);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp arg32 = newTemp(Ity_I32);
         if (epartIsReg(modrm)) {
            UInt rS = eregOfRexRM(pfx,modrm);
            assign( arg32, getIReg32(rS) );
            delta += 1;
            DIP("vcvtsi2ssl %s,%s,%s\n",
                nameIReg32(rS), nameXMMReg(rV), nameXMMReg(rD));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
            delta += alen;
            DIP("vcvtsi2ssl %s,%s,%s\n",
                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
         }
         /* I32 -> F64 is exact; only the final F64 -> F32 narrowing
            needs the rounding mode. */
         putXMMRegLane32F( rD, 0,
                           binop(Iop_F64toF32,
                                 mkexpr(rmode),
                                 unop(Iop_I32StoF64, mkexpr(arg32)) ) );
         putXMMRegLane32( rD, 1, getXMMRegLane32( rV, 1 ));
         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
         putYMMRegLane128( rD, 1, mkV128(0) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   }
   22268 
   case 0x2B:
      /* Non-temporal packed stores.  The non-temporal (cache-bypass)
         hint is not modelled; a plain aligned store is generated. */
      /* VMOVNTPD xmm1, m128 = VEX.128.66.0F.WIG 2B /r */
      /* VMOVNTPS xmm1, m128 = VEX.128.0F.WIG 2B /r */
      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
          && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
         UChar  modrm = getUChar(delta);
         UInt   rS    = gregOfRexRM(pfx, modrm);
         IRTemp tS    = newTemp(Ity_V128);
         assign(tS, getXMMReg(rS));
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         gen_SEGV_if_not_16_aligned(addr);
         storeLE(mkexpr(addr), mkexpr(tS));
         /* The 66 prefix selects the PD variant, otherwise PS. */
         DIP("vmovntp%c %s,%s\n", have66(pfx) ? 'd' : 's',
             nameXMMReg(rS), dis_buf);
         goto decode_success;
      }
      /* VMOVNTPD ymm1, m256 = VEX.256.66.0F.WIG 2B /r */
      /* VMOVNTPS ymm1, m256 = VEX.256.0F.WIG 2B /r */
      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
          && 1==getVexL(pfx)/*256*/ && !epartIsReg(getUChar(delta))) {
         UChar  modrm = getUChar(delta);
         UInt   rS    = gregOfRexRM(pfx, modrm);
         IRTemp tS    = newTemp(Ity_V256);
         assign(tS, getYMMReg(rS));
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         gen_SEGV_if_not_32_aligned(addr);
         storeLE(mkexpr(addr), mkexpr(tS));
         DIP("vmovntp%c %s,%s\n", have66(pfx) ? 'd' : 's',
             nameYMMReg(rS), dis_buf);
         goto decode_success;
      }
      break;
   22303 
   case 0x2C:
      /* Truncating scalar FP -> signed integer conversions.  REX.W
         selects the destination GPR width (final argument: 4 or 8
         bytes); the memory source is always m64 for SD and m32 for
         SS. */
      /* VCVTTSD2SI xmm1/m64, r32 = VEX.LIG.F2.0F.W0 2C /r */
      if (haveF2no66noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
         delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
         goto decode_success;
      }
      /* VCVTTSD2SI xmm1/m64, r64 = VEX.LIG.F2.0F.W1 2C /r */
      if (haveF2no66noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
         delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
         goto decode_success;
      }
      /* VCVTTSS2SI xmm1/m32, r32 = VEX.LIG.F3.0F.W0 2C /r */
      if (haveF3no66noF2(pfx) && 0==getRexW(pfx)/*W0*/) {
         delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
         goto decode_success;
      }
      /* VCVTTSS2SI xmm1/m32, r64 = VEX.LIG.F3.0F.W1 2C /r */
      if (haveF3no66noF2(pfx) && 1==getRexW(pfx)/*W1*/) {
         delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
         goto decode_success;
      }
      break;
   22326 
   case 0x2D:
      /* Rounding scalar FP -> signed integer conversions (same shape
         as opcode 2C; the helpers dispatch on opc for truncate vs
         round).  Memory source is always m64 for SD, m32 for SS. */
      /* VCVTSD2SI xmm1/m64, r32 = VEX.LIG.F2.0F.W0 2D /r */
      if (haveF2no66noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
         delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
         goto decode_success;
      }
      /* VCVTSD2SI xmm1/m64, r64 = VEX.LIG.F2.0F.W1 2D /r */
      if (haveF2no66noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
         delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
         goto decode_success;
      }
      /* VCVTSS2SI xmm1/m32, r32 = VEX.LIG.F3.0F.W0 2D /r */
      if (haveF3no66noF2(pfx) && 0==getRexW(pfx)/*W0*/) {
         delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
         goto decode_success;
      }
      /* VCVTSS2SI xmm1/m32, r64 = VEX.LIG.F3.0F.W1 2D /r */
      if (haveF3no66noF2(pfx) && 1==getRexW(pfx)/*W1*/) {
         delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
         goto decode_success;
      }
      break;
   22349 
   case 0x2E:
   case 0x2F:
      /* Scalar ordered/unordered compares setting EFLAGS; the helpers
         dispatch on opc (2E = unordered, 2F = ordered). */
      /* VUCOMISD xmm2/m64, xmm1 = VEX.LIG.66.0F.WIG 2E /r */
      /* VCOMISD  xmm2/m64, xmm1 = VEX.LIG.66.0F.WIG 2F /r */
      if (have66noF2noF3(pfx)) {
         delta = dis_COMISD( vbi, pfx, delta, True/*isAvx*/, opc );
         goto decode_success;
      }
      /* VUCOMISS xmm2/m32, xmm1 = VEX.LIG.0F.WIG 2E /r */
      /* VCOMISS xmm2/m32, xmm1  = VEX.LIG.0F.WIG 2F /r */
      if (haveNo66noF2noF3(pfx)) {
         delta = dis_COMISS( vbi, pfx, delta, True/*isAvx*/, opc );
         goto decode_success;
      }
      break;
   22365 
   case 0x50:
      /* Extract FP sign-bit masks into a GPR.  The 66 prefix selects
         the PD (64-bit lane) variant, no prefix the PS variant; VEX.L
         selects 128- vs 256-bit sources. */
      /* VMOVMSKPD xmm2, r32 = VEX.128.66.0F.WIG 50 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_MOVMSKPD_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VMOVMSKPD ymm2, r32 = VEX.256.66.0F.WIG 50 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_MOVMSKPD_256( vbi, pfx, delta );
         goto decode_success;
      }
      /* VMOVMSKPS xmm2, r32 = VEX.128.0F.WIG 50 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_MOVMSKPS_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VMOVMSKPS ymm2, r32 = VEX.256.0F.WIG 50 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_MOVMSKPS_256( vbi, pfx, delta );
         goto decode_success;
      }
      break;
   22388 
   case 0x51:
      /* Square root: scalar (SS/SD, merging with vvvv) and packed
         (PS/PD, 128/256-bit) forms. */
      /* VSQRTSS xmm3/m32(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F3.0F.WIG 51 /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32_unary(
                    uses_vvvv, vbi, pfx, delta, "vsqrtss", Iop_Sqrt32F0x4 );
         goto decode_success;
      }
      /* VSQRTPS xmm2/m128(E), xmm1(G) = VEX.NDS.128.0F.WIG 51 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vsqrtps", Iop_Sqrt32Fx4 );
         goto decode_success;
      }
      /* VSQRTPS ymm2/m256(E), ymm1(G) = VEX.NDS.256.0F.WIG 51 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vsqrtps", Iop_Sqrt32Fx8 );
         goto decode_success;
      }
      /* VSQRTSD xmm3/m64(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F2.0F.WIG 51 /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64_unary(
                    uses_vvvv, vbi, pfx, delta, "vsqrtsd", Iop_Sqrt64F0x2 );
         goto decode_success;
      }
      /* VSQRTPD xmm2/m128(E), xmm1(G) = VEX.NDS.128.66.0F.WIG 51 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vsqrtpd", Iop_Sqrt64Fx2 );
         goto decode_success;
      }
      /* VSQRTPD ymm2/m256(E), ymm1(G) = VEX.NDS.256.66.0F.WIG 51 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vsqrtpd", Iop_Sqrt64Fx4 );
         goto decode_success;
      }
      break;
   22427 
   case 0x52:
      /* Approximate reciprocal square root (single precision only). */
      /* VRSQRTSS xmm3/m32(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F3.0F.WIG 52 /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32_unary(
                    uses_vvvv, vbi, pfx, delta, "vrsqrtss", Iop_RSqrt32F0x4 );
         goto decode_success;
      }
      /* VRSQRTPS xmm2/m128(E), xmm1(G) = VEX.NDS.128.0F.WIG 52 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vrsqrtps", Iop_RSqrt32Fx4 );
         goto decode_success;
      }
      /* VRSQRTPS ymm2/m256(E), ymm1(G) = VEX.NDS.256.0F.WIG 52 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vrsqrtps", Iop_RSqrt32Fx8 );
         goto decode_success;
      }
      break;
   22448 
   case 0x53:
      /* Approximate reciprocal (single precision only). */
      /* VRCPSS xmm3/m32(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F3.0F.WIG 53 /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32_unary(
                    uses_vvvv, vbi, pfx, delta, "vrcpss", Iop_Recip32F0x4 );
         goto decode_success;
      }
      /* VRCPPS xmm2/m128(E), xmm1(G) = VEX.NDS.128.0F.WIG 53 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vrcpps", Iop_Recip32Fx4 );
         goto decode_success;
      }
      /* VRCPPS ymm2/m256(E), ymm1(G) = VEX.NDS.256.0F.WIG 53 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vrcpps", Iop_Recip32Fx8 );
         goto decode_success;
      }
      break;
   22469 
   case 0x54:
      /* Bitwise AND of packed FP data.  PS and PD variants are
         bit-identical operations, hence the shared Iop_AndV128/256. */
      /* VANDPD r/m, rV, r ::: r = rV & r/m */
      /* VANDPD = VEX.NDS.128.66.0F.WIG 54 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV128 );
         goto decode_success;
      }
      /* VANDPD r/m, rV, r ::: r = rV & r/m */
      /* VANDPD = VEX.NDS.256.66.0F.WIG 54 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV256 );
         goto decode_success;
      }
      /* VANDPS = VEX.NDS.128.0F.WIG 54 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV128 );
         goto decode_success;
      }
      /* VANDPS = VEX.NDS.256.0F.WIG 54 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV256 );
         goto decode_success;
      }
      break;
   22498 
   22499    case 0x55:
   22500       /* VANDNPD r/m, rV, r ::: r = (not rV) & r/m */
   22501       /* VANDNPD = VEX.NDS.128.66.0F.WIG 55 /r */
   22502       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   22503          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   22504                     uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV128,
   22505                     NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
   22506          goto decode_success;
   22507       }
   22508       /* VANDNPD = VEX.NDS.256.66.0F.WIG 55 /r */
   22509       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   22510          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
   22511                     uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV256,
   22512                     NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
   22513          goto decode_success;
   22514       }
   22515       /* VANDNPS = VEX.NDS.128.0F.WIG 55 /r */
   22516       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   22517          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   22518                     uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV128,
   22519                     NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
   22520          goto decode_success;
   22521       }
   22522       /* VANDNPS = VEX.NDS.256.0F.WIG 55 /r */
   22523       if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   22524          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
   22525                     uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV256,
   22526                     NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
   22527          goto decode_success;
   22528       }
   22529       break;
   22530 
   case 0x56:
      /* Bitwise OR of packed FP data (PS/PD share the same IR op). */
      /* VORPD r/m, rV, r ::: r = rV | r/m */
      /* VORPD = VEX.NDS.128.66.0F.WIG 56 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vorpd", Iop_OrV128 );
         goto decode_success;
      }
      /* VORPD r/m, rV, r ::: r = rV | r/m */
      /* VORPD = VEX.NDS.256.66.0F.WIG 56 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vorpd", Iop_OrV256 );
         goto decode_success;
      }
      /* VORPS r/m, rV, r ::: r = rV | r/m */
      /* VORPS = VEX.NDS.128.0F.WIG 56 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vorps", Iop_OrV128 );
         goto decode_success;
      }
      /* VORPS r/m, rV, r ::: r = rV | r/m */
      /* VORPS = VEX.NDS.256.0F.WIG 56 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vorps", Iop_OrV256 );
         goto decode_success;
      }
      break;
   22561 
   case 0x57:
      /* Bitwise XOR of packed FP data (PS/PD share the same IR op). */
      /* VXORPD r/m, rV, r ::: r = rV ^ r/m */
      /* VXORPD = VEX.NDS.128.66.0F.WIG 57 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vxorpd", Iop_XorV128 );
         goto decode_success;
      }
      /* VXORPD r/m, rV, r ::: r = rV ^ r/m */
      /* VXORPD = VEX.NDS.256.66.0F.WIG 57 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vxorpd", Iop_XorV256 );
         goto decode_success;
      }
      /* VXORPS r/m, rV, r ::: r = rV ^ r/m */
      /* VXORPS = VEX.NDS.128.0F.WIG 57 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vxorps", Iop_XorV128 );
         goto decode_success;
      }
      /* VXORPS r/m, rV, r ::: r = rV ^ r/m */
      /* VXORPS = VEX.NDS.256.0F.WIG 57 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vxorps", Iop_XorV256 );
         goto decode_success;
      }
      break;
   22592 
   case 0x58:
      /* FP addition: scalar (SD/SS) and packed (PS/PD, 128/256). */
      /* VADDSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 58 /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vaddsd", Iop_Add64F0x2 );
         goto decode_success;
      }
      /* VADDSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 58 /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vaddss", Iop_Add32F0x4 );
         goto decode_success;
      }
      /* VADDPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 58 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vaddps", Iop_Add32Fx4 );
         goto decode_success;
      }
      /* VADDPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 58 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vaddps", Iop_Add32Fx8 );
         goto decode_success;
      }
      /* VADDPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 58 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vaddpd", Iop_Add64Fx2 );
         goto decode_success;
      }
      /* VADDPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 58 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vaddpd", Iop_Add64Fx4 );
         goto decode_success;
      }
      break;
   22631 
   case 0x59:
      /* FP multiplication: scalar (SD/SS) and packed (PS/PD,
         128/256). */
      /* VMULSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 59 /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vmulsd", Iop_Mul64F0x2 );
         goto decode_success;
      }
      /* VMULSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 59 /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vmulss", Iop_Mul32F0x4 );
         goto decode_success;
      }
      /* VMULPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 59 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmulps", Iop_Mul32Fx4 );
         goto decode_success;
      }
      /* VMULPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 59 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmulps", Iop_Mul32Fx8 );
         goto decode_success;
      }
      /* VMULPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 59 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmulpd", Iop_Mul64Fx2 );
         goto decode_success;
      }
      /* VMULPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 59 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmulpd", Iop_Mul64Fx4 );
         goto decode_success;
      }
      break;
   22670 
   case 0x5A:
      /* Precision conversions: packed PS<->PD via helpers, scalar
         SD->SS and SS->SD handled inline below. */
      /* VCVTPS2PD xmm2/m64, xmm1 = VEX.128.0F.WIG 5A /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTPS2PD_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VCVTPS2PD xmm2/m128, ymm1 = VEX.256.0F.WIG 5A /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTPS2PD_256( vbi, pfx, delta );
         goto decode_success;
      }
      /* VCVTPD2PS xmm2/m128, xmm1 = VEX.128.66.0F.WIG 5A /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTPD2PS_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VCVTPD2PS ymm2/m256, xmm1 = VEX.256.66.0F.WIG 5A /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTPD2PS_256( vbi, pfx, delta );
         goto decode_success;
      }
      /* VCVTSD2SS xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5A /r */
      if (haveF2no66noF3(pfx)) {
         UChar  modrm = getUChar(delta);
         UInt   rV    = getVexNvvvv(pfx);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp f64lo = newTemp(Ity_F64);
         IRTemp rmode = newTemp(Ity_I32);
         assign( rmode, get_sse_roundingmode() );
         if (epartIsReg(modrm)) {
            UInt rS = eregOfRexRM(pfx,modrm);
            assign(f64lo, getXMMRegLane64F(rS, 0));
            delta += 1;
            DIP("vcvtsd2ss %s,%s,%s\n",
                nameXMMReg(rS), nameXMMReg(rV), nameXMMReg(rD));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign(f64lo, loadLE(Ity_F64, mkexpr(addr)) );
            delta += alen;
            DIP("vcvtsd2ss %s,%s,%s\n",
                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
         }
         /* Narrowing F64 -> F32 can round, so the SSE rounding mode
            is supplied.  Remaining lanes merge from vvvv; upper 128
            bits of the destination YMM are zeroed. */
         putXMMRegLane32F( rD, 0,
                           binop( Iop_F64toF32, mkexpr(rmode),
                                                mkexpr(f64lo)) );
         putXMMRegLane32( rD, 1, getXMMRegLane32( rV, 1 ));
         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
         putYMMRegLane128( rD, 1, mkV128(0) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VCVTSS2SD xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5A /r */
      if (haveF3no66noF2(pfx)) {
         UChar  modrm = getUChar(delta);
         UInt   rV    = getVexNvvvv(pfx);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp f32lo = newTemp(Ity_F32);
         if (epartIsReg(modrm)) {
            UInt rS = eregOfRexRM(pfx,modrm);
            assign(f32lo, getXMMRegLane32F(rS, 0));
            delta += 1;
            DIP("vcvtss2sd %s,%s,%s\n",
                nameXMMReg(rS), nameXMMReg(rV), nameXMMReg(rD));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign(f32lo, loadLE(Ity_F32, mkexpr(addr)) );
            delta += alen;
            DIP("vcvtss2sd %s,%s,%s\n",
                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
         }
         /* Widening F32 -> F64 is exact: no rounding mode needed. */
         putXMMRegLane64F( rD, 0,
                           unop( Iop_F32toF64, mkexpr(f32lo)) );
         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
         putYMMRegLane128( rD, 1, mkV128(0) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   22749 
   case 0x5B:
      /* Packed int<->FP conversions; the r2zero flag selects
         truncating (VCVTT...) versus rounding behaviour. */
      /* VCVTPS2DQ xmm2/m128, xmm1 = VEX.128.66.0F.WIG 5B /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTxPS2DQ_128( vbi, pfx, delta,
                                    True/*isAvx*/, False/*!r2zero*/ );
         goto decode_success;
      }
      /* VCVTPS2DQ ymm2/m256, ymm1 = VEX.256.66.0F.WIG 5B /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTxPS2DQ_256( vbi, pfx, delta,
                                    False/*!r2zero*/ );
         goto decode_success;
      }
      /* VCVTTPS2DQ xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 5B /r */
      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTxPS2DQ_128( vbi, pfx, delta,
                                    True/*isAvx*/, True/*r2zero*/ );
         goto decode_success;
      }
      /* VCVTTPS2DQ ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 5B /r */
      if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTxPS2DQ_256( vbi, pfx, delta,
                                    True/*r2zero*/ );
         goto decode_success;
      }
      /* VCVTDQ2PS xmm2/m128, xmm1 = VEX.128.0F.WIG 5B /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTDQ2PS_128 ( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VCVTDQ2PS ymm2/m256, ymm1 = VEX.256.0F.WIG 5B /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTDQ2PS_256 ( vbi, pfx, delta );
         goto decode_success;
      }
      break;
   22786 
   case 0x5C:
      /* 0F 5C: subtract -- scalar (SD/SS) and packed (PS/PD),
         128- and 256-bit forms, dispatched on SIMD prefix and VEX.L. */
      /* VSUBSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5C /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vsubsd", Iop_Sub64F0x2 );
         goto decode_success;
      }
      /* VSUBSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5C /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vsubss", Iop_Sub32F0x4 );
         goto decode_success;
      }
      /* VSUBPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5C /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vsubps", Iop_Sub32Fx4 );
         goto decode_success;
      }
      /* VSUBPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5C /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vsubps", Iop_Sub32Fx8 );
         goto decode_success;
      }
      /* VSUBPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5C /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vsubpd", Iop_Sub64Fx2 );
         goto decode_success;
      }
      /* VSUBPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5C /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vsubpd", Iop_Sub64Fx4 );
         goto decode_success;
      }
      break;
   22825 
   case 0x5D:
      /* 0F 5D: minimum -- scalar (SD/SS) and packed (PS/PD), 128- and
         256-bit forms; same prefix/VEX.L dispatch shape as 0x5C. */
      /* VMINSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5D /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vminsd", Iop_Min64F0x2 );
         goto decode_success;
      }
      /* VMINSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5D /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vminss", Iop_Min32F0x4 );
         goto decode_success;
      }
      /* VMINPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5D /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vminps", Iop_Min32Fx4 );
         goto decode_success;
      }
      /* VMINPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5D /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vminps", Iop_Min32Fx8 );
         goto decode_success;
      }
      /* VMINPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5D /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vminpd", Iop_Min64Fx2 );
         goto decode_success;
      }
      /* VMINPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5D /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vminpd", Iop_Min64Fx4 );
         goto decode_success;
      }
      break;
   22864 
   case 0x5E:
      /* 0F 5E: divide -- scalar (SD/SS) and packed (PS/PD), 128- and
         256-bit forms; same prefix/VEX.L dispatch shape as 0x5C. */
      /* VDIVSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5E /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vdivsd", Iop_Div64F0x2 );
         goto decode_success;
      }
      /* VDIVSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5E /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vdivss", Iop_Div32F0x4 );
         goto decode_success;
      }
      /* VDIVPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5E /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vdivps", Iop_Div32Fx4 );
         goto decode_success;
      }
      /* VDIVPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5E /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vdivps", Iop_Div32Fx8 );
         goto decode_success;
      }
      /* VDIVPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5E /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vdivpd", Iop_Div64Fx2 );
         goto decode_success;
      }
      /* VDIVPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5E /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vdivpd", Iop_Div64Fx4 );
         goto decode_success;
      }
      break;
   22903 
   case 0x5F:
      /* 0F 5F: maximum -- scalar (SD/SS) and packed (PS/PD), 128- and
         256-bit forms; same prefix/VEX.L dispatch shape as 0x5C. */
      /* VMAXSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5F /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vmaxsd", Iop_Max64F0x2 );
         goto decode_success;
      }
      /* VMAXSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5F /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vmaxss", Iop_Max32F0x4 );
         goto decode_success;
      }
      /* VMAXPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5F /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmaxps", Iop_Max32Fx4 );
         goto decode_success;
      }
      /* VMAXPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5F /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmaxps", Iop_Max32Fx8 );
         goto decode_success;
      }
      /* VMAXPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5F /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmaxpd", Iop_Max64Fx2 );
         goto decode_success;
      }
      /* VMAXPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5F /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmaxpd", Iop_Max64Fx4 );
         goto decode_success;
      }
      break;
   22942 
   case 0x60:
      /* VPUNPCKLBW r/m, rV, r ::: r = interleave-lo-bytes(rV, r/m) */
      /* VPUNPCKLBW = VEX.NDS.128.66.0F.WIG 60 /r */
      /* swapArgs=True: the generic helper is told to reverse its two
         vector operands so the result matches the operand order in
         the pseudo-op comment above. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpcklbw",
                    Iop_InterleaveLO8x16, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      break;
   22954 
   case 0x61:
      /* VPUNPCKLWD r/m, rV, r ::: r = interleave-lo-words(rV, r/m) */
      /* VPUNPCKLWD = VEX.NDS.128.66.0F.WIG 61 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpcklwd",
                    Iop_InterleaveLO16x8, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      break;
   22966 
   case 0x62:
      /* VPUNPCKLDQ r/m, rV, r ::: r = interleave-lo-dwords(rV, r/m) */
      /* VPUNPCKLDQ = VEX.NDS.128.66.0F.WIG 62 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpckldq",
                    Iop_InterleaveLO32x4, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      break;
   22978 
   case 0x63:
      /* VPACKSSWB r/m, rV, r ::: r = QNarrowBin16Sto8Sx16(rV, r/m) */
      /* VPACKSSWB = VEX.NDS.128.66.0F.WIG 63 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpacksswb",
                    Iop_QNarrowBin16Sto8Sx16, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      break;
   22990 
   case 0x64:
      /* VPCMPGTB r/m, rV, r ::: r = rV `>s-by-8s` r/m */
      /* VPCMPGTB = VEX.NDS.128.66.0F.WIG 64 /r */
      /* "_simple" helper variant: a plain lanewise binop with no
         swap/invert options. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpgtb", Iop_CmpGT8Sx16 );
         goto decode_success;
      }
      break;
   23000 
   case 0x65:
      /* VPCMPGTW r/m, rV, r ::: r = rV `>s-by-16s` r/m */
      /* VPCMPGTW = VEX.NDS.128.66.0F.WIG 65 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpgtw", Iop_CmpGT16Sx8 );
         goto decode_success;
      }
      break;
   23010 
   case 0x66:
      /* VPCMPGTD r/m, rV, r ::: r = rV `>s-by-32s` r/m */
      /* VPCMPGTD = VEX.NDS.128.66.0F.WIG 66 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpgtd", Iop_CmpGT32Sx4 );
         goto decode_success;
      }
      break;
   23020 
   case 0x67:
      /* VPACKUSWB r/m, rV, r ::: r = QNarrowBin16Sto8Ux16(rV, r/m) */
      /* VPACKUSWB = VEX.NDS.128.66.0F.WIG 67 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpackuswb",
                    Iop_QNarrowBin16Sto8Ux16, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      break;
   23032 
   case 0x68:
      /* VPUNPCKHBW r/m, rV, r ::: r = interleave-hi-bytes(rV, r/m) */
      /* VPUNPCKHBW = VEX.NDS.128.66.0F.WIG 68 /r */
      /* (comment corrected: the 66 prefix is mandatory, as the
         have66noF2noF3 check below requires.) */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpckhbw",
                    Iop_InterleaveHI8x16, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      break;
   23044 
   case 0x69:
      /* VPUNPCKHWD r/m, rV, r ::: r = interleave-hi-words(rV, r/m) */
      /* VPUNPCKHWD = VEX.NDS.128.66.0F.WIG 69 /r */
      /* (comment corrected: the 66 prefix is mandatory, as the
         have66noF2noF3 check below requires.) */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpckhwd",
                    Iop_InterleaveHI16x8, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      break;
   23056 
   case 0x6A:
      /* VPUNPCKHDQ r/m, rV, r ::: r = interleave-hi-dwords(rV, r/m) */
      /* VPUNPCKHDQ = VEX.NDS.128.66.0F.WIG 6A /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpckhdq",
                    Iop_InterleaveHI32x4, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      break;
   23068 
   case 0x6B:
      /* VPACKSSDW r/m, rV, r ::: r = QNarrowBin32Sto16Sx8(rV, r/m) */
      /* VPACKSSDW = VEX.NDS.128.66.0F.WIG 6B /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpackssdw",
                    Iop_QNarrowBin32Sto16Sx8, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      break;
   23080 
   case 0x6C:
      /* VPUNPCKLQDQ r/m, rV, r ::: r = interleave-lo-64bitses(rV, r/m) */
      /* VPUNPCKLQDQ = VEX.NDS.128.66.0F.WIG 6C /r */
      /* (comment corrected: the 66 prefix is mandatory, as the
         have66noF2noF3 check below requires.) */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpcklqdq",
                    Iop_InterleaveLO64x2, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      break;
   23092 
   case 0x6D:
      /* VPUNPCKHQDQ r/m, rV, r ::: r = interleave-hi-64bitses(rV, r/m) */
      /* VPUNPCKHQDQ = VEX.NDS.128.66.0F.WIG 6D /r */
      /* (comment corrected: the 66 prefix is mandatory, as the
         have66noF2noF3 check below requires.) */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpckhqdq",
                    Iop_InterleaveHI64x2, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      break;
   23104 
   case 0x6E:
      /* 0F 6E: GPR/memory -> low XMM lane move; REX.W selects the
         32-bit (VMOVD) vs 64-bit (VMOVQ) form.  putYMMRegLoAndZU
         writes the low 128 bits and zeroes the upper half of the
         destination YMM register. */
      /* VMOVD r32/m32, xmm1 = VEX.128.66.0F.W0 6E */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         vassert(sz == 2); /* even tho we are transferring 4, not 2. */
         UChar modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta += 1;
            putYMMRegLoAndZU(
               gregOfRexRM(pfx,modrm),
               unop( Iop_32UtoV128, getIReg32(eregOfRexRM(pfx,modrm)) )
            );
            DIP("vmovd %s, %s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
        } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            putYMMRegLoAndZU(
               gregOfRexRM(pfx,modrm),
               unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)))
                             );
            DIP("vmovd %s, %s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
         }
         goto decode_success;
      }
      /* VMOVQ r64/m64, xmm1 = VEX.128.66.0F.W1 6E */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
         vassert(sz == 2); /* even tho we are transferring 8, not 2. */
         UChar modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta += 1;
            putYMMRegLoAndZU(
               gregOfRexRM(pfx,modrm),
               unop( Iop_64UtoV128, getIReg64(eregOfRexRM(pfx,modrm)) )
            );
            DIP("vmovq %s, %s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
        } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            putYMMRegLoAndZU(
               gregOfRexRM(pfx,modrm),
               unop( Iop_64UtoV128,loadLE(Ity_I64, mkexpr(addr)))
                             );
            DIP("vmovq %s, %s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
         }
         goto decode_success;
      }
      break;
   23157 
   case 0x6F:
      /* 0F 6F: vector loads/reg moves.  66 prefix = VMOVDQA (aligned,
         faults via gen_SEGV_if_not_*_aligned on misaligned memory
         operands); F3 prefix = VMOVDQU (unaligned, no check). */
      /* VMOVDQA ymm2/m256, ymm1 = VEX.256.66.0F.WIG 6F */
      /* VMOVDQU ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 6F */
      if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
          && 1==getVexL(pfx)/*256*/) {
         UChar  modrm = getUChar(delta);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp tD    = newTemp(Ity_V256);
         Bool   isA   = have66noF2noF3(pfx);
         UChar  ch    = isA ? 'a' : 'u';
         if (epartIsReg(modrm)) {
            UInt rS = eregOfRexRM(pfx, modrm);
            delta += 1;
            assign(tD, getYMMReg(rS));
            DIP("vmovdq%c %s,%s\n", ch, nameYMMReg(rS), nameYMMReg(rD));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            if (isA)
               gen_SEGV_if_not_32_aligned(addr);
            assign(tD, loadLE(Ity_V256, mkexpr(addr)));
            DIP("vmovdq%c %s,%s\n", ch, dis_buf, nameYMMReg(rD));
         }
         putYMMReg(rD, mkexpr(tD));
         goto decode_success;
      }
      /* VMOVDQA xmm2/m128, xmm1 = VEX.128.66.0F.WIG 6F */
      /* VMOVDQU xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 6F */
      if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
          && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp tD    = newTemp(Ity_V128);
         Bool   isA   = have66noF2noF3(pfx);
         UChar  ch    = isA ? 'a' : 'u';
         if (epartIsReg(modrm)) {
            UInt rS = eregOfRexRM(pfx, modrm);
            delta += 1;
            assign(tD, getXMMReg(rS));
            DIP("vmovdq%c %s,%s\n", ch, nameXMMReg(rS), nameXMMReg(rD));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            if (isA)
               gen_SEGV_if_not_16_aligned(addr);
            assign(tD, loadLE(Ity_V128, mkexpr(addr)));
            DIP("vmovdq%c %s,%s\n", ch, dis_buf, nameXMMReg(rD));
         }
         putYMMRegLoAndZU(rD, mkexpr(tD));
         goto decode_success;
      }
      break;
   23210 
   case 0x70:
      /* 0F 70: shuffles with an imm8 control byte, dispatched on the
         SIMD prefix (66 = dwords, F2 = low words, F3 = high words). */
      /* VPSHUFD imm8, xmm2/m128, xmm1 = VEX.128.66.0F.WIG 70 /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PSHUFD_32x4( vbi, pfx, delta, True/*writesYmm*/);
         goto decode_success;
      }
      /* VPSHUFLW imm8, xmm2/m128, xmm1 = VEX.128.F2.0F.WIG 70 /r ib */
      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PSHUFxW_128( vbi, pfx, delta,
                                  True/*isAvx*/, False/*!xIsH*/ );
         goto decode_success;
      }
      /* VPSHUFHW imm8, xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 70 /r ib */
      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PSHUFxW_128( vbi, pfx, delta,
                                  True/*isAvx*/, True/*xIsH*/ );
         goto decode_success;
      }
      break;
   23230 
   case 0x71:
      /* 0F 71: 16-bit immediate shifts, selected by the reg field of
         the modrm byte (/2, /4, /6).  A memory E operand or any other
         reg value falls through to decode failure. */
      /* VPSRLW imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 71 /2 ib */
      /* VPSRAW imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 71 /4 ib */
      /* VPSLLW imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 71 /6 ib */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/
          && epartIsReg(getUChar(delta))) {
         if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpsrlw", Iop_ShrN16x8 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpsraw", Iop_SarN16x8 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpsllw", Iop_ShlN16x8 );
            *uses_vvvv = True;
            goto decode_success;
         }
         /* else fall through */
      }
      break;
   23259 
   case 0x72:
      /* 0F 72: 32-bit immediate shifts; same /2,/4,/6 reg-field
         dispatch as case 0x71. */
      /* VPSRLD imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 72 /2 ib */
      /* VPSRAD imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 72 /4 ib */
      /* VPSLLD imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 72 /6 ib */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/
          && epartIsReg(getUChar(delta))) {
         if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpsrld", Iop_ShrN32x4 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpsrad", Iop_SarN32x4 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpslld", Iop_ShlN32x4 );
            *uses_vvvv = True;
            goto decode_success;
         }
         /* else fall through */
      }
      break;
   23288 
   case 0x73:
      /* 0F 73: 64-bit and whole-register byte shifts, selected by the
         reg field (/2, /3, /6, /7); register E operand only.  The /3
         and /7 (byte-shift) forms are decoded inline; /2 and /6 go
         through the generic imm-shift helper. */
      /* VPSRLDQ imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /3 ib */
      /* VPSLLDQ imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /7 ib */
      /* VPSRLQ  imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /2 ib */
      /* VPSLLQ  imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /6 ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && epartIsReg(getUChar(delta))) {
         Int    rS   = eregOfRexRM(pfx,getUChar(delta));
         Int    rD   = getVexNvvvv(pfx);   /* NDD: dest is in vvvv */
         IRTemp vecS = newTemp(Ity_V128);
         if (gregLO3ofRM(getUChar(delta)) == 3) {
            Int imm = (Int)getUChar(delta+1);
            DIP("vpsrldq $%d,%s,%s\n", imm, nameXMMReg(rS), nameXMMReg(rD));
            delta += 2;
            assign( vecS, getXMMReg(rS) );
            putYMMRegLoAndZU(rD, mkexpr(math_PSRLDQ( vecS, imm )));
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 7) {
            Int imm = (Int)getUChar(delta+1);
            DIP("vpslldq $%d,%s,%s\n", imm, nameXMMReg(rS), nameXMMReg(rD));
            delta += 2;
            assign( vecS, getXMMReg(rS) );
            putYMMRegLoAndZU(rD, mkexpr(math_PSLLDQ( vecS, imm )));
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 2) {
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpsrlq", Iop_ShrN64x2 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 6) {
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpsllq", Iop_ShlN64x2 );
            *uses_vvvv = True;
            goto decode_success;
         }
         /* else fall through */
      }
      break;
   23332 
   case 0x74:
      /* VPCMPEQB r/m, rV, r ::: r = rV `eq-by-8s` r/m */
      /* VPCMPEQB = VEX.NDS.128.66.0F.WIG 74 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpeqb", Iop_CmpEQ8x16 );
         goto decode_success;
      }
      break;
   23342 
   case 0x75:
      /* VPCMPEQW r/m, rV, r ::: r = rV `eq-by-16s` r/m */
      /* VPCMPEQW = VEX.NDS.128.66.0F.WIG 75 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpeqw", Iop_CmpEQ16x8 );
         goto decode_success;
      }
      break;
   23352 
   case 0x76:
      /* VPCMPEQD r/m, rV, r ::: r = rV `eq-by-32s` r/m */
      /* VPCMPEQD = VEX.NDS.128.66.0F.WIG 76 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpeqd", Iop_CmpEQ32x4 );
         goto decode_success;
      }
      break;
   23362 
   case 0x77:
      /* 0F 77: VEX.L selects which instruction this is.  Both loop
         over all 16 YMM registers: VZEROUPPER zeroes only the upper
         128-bit lane of each, VZEROALL zeroes the whole register. */
      /* VZEROUPPER = VEX.128.0F.WIG 77 */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Int i;
         IRTemp zero128 = newTemp(Ity_V128);
         assign(zero128, mkV128(0));
         for (i = 0; i < 16; i++) {
            putYMMRegLane128(i, 1, mkexpr(zero128));
         }
         DIP("vzeroupper\n");
         goto decode_success;
      }
      /* VZEROALL = VEX.256.0F.WIG 77 */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         Int i;
         IRTemp zero128 = newTemp(Ity_V128);
         assign(zero128, mkV128(0));
         for (i = 0; i < 16; i++) {
            putYMMRegLoAndZU(i, mkexpr(zero128));
         }
         DIP("vzeroall\n");
         goto decode_success;
      }
      break;
   23387 
   23388    case 0x7C:
   23389    case 0x7D:
   23390       /* VHADDPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.F2.0F.WIG 7C /r */
   23391       /* VHSUBPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.F2.0F.WIG 7D /r */
   23392       if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   23393          IRTemp sV     = newTemp(Ity_V128);
   23394          IRTemp dV     = newTemp(Ity_V128);
   23395          Bool   isAdd  = opc == 0x7C;
   23396          HChar* str    = isAdd ? "add" : "sub";
   23397          UChar modrm   = getUChar(delta);
   23398          UInt   rG     = gregOfRexRM(pfx,modrm);
   23399          UInt   rV     = getVexNvvvv(pfx);
   23400          if (epartIsReg(modrm)) {
   23401             UInt rE = eregOfRexRM(pfx,modrm);
   23402             assign( sV, getXMMReg(rE) );
   23403             DIP("vh%spd %s,%s,%s\n", str, nameXMMReg(rE),
   23404                 nameXMMReg(rV), nameXMMReg(rG));
   23405             delta += 1;
   23406          } else {
   23407             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23408             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   23409             DIP("vh%spd %s,%s,%s\n", str, dis_buf,
   23410                 nameXMMReg(rV), nameXMMReg(rG));
   23411             delta += alen;
   23412          }
   23413          assign( dV, getXMMReg(rV) );
   23414          putYMMRegLoAndZU( rG, mkexpr( math_HADDPS_128 ( dV, sV, isAdd ) ) );
   23415          *uses_vvvv = True;
   23416          goto decode_success;
   23417       }
   23418       /* VHADDPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.F2.0F.WIG 7C /r */
   23419       /* VHSUBPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.F2.0F.WIG 7D /r */
   23420       if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   23421          IRTemp sV     = newTemp(Ity_V256);
   23422          IRTemp dV     = newTemp(Ity_V256);
   23423          IRTemp s1, s0, d1, d0;
   23424          Bool   isAdd  = opc == 0x7C;
   23425          HChar* str    = isAdd ? "add" : "sub";
   23426          UChar modrm   = getUChar(delta);
   23427          UInt   rG     = gregOfRexRM(pfx,modrm);
   23428          UInt   rV     = getVexNvvvv(pfx);
   23429          s1 = s0 = d1 = d0 = IRTemp_INVALID;
   23430          if (epartIsReg(modrm)) {
   23431             UInt rE = eregOfRexRM(pfx,modrm);
   23432             assign( sV, getYMMReg(rE) );
   23433             DIP("vh%spd %s,%s,%s\n", str, nameYMMReg(rE),
   23434                 nameYMMReg(rV), nameYMMReg(rG));
   23435             delta += 1;
   23436          } else {
   23437             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23438             assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
   23439             DIP("vh%spd %s,%s,%s\n", str, dis_buf,
   23440                 nameYMMReg(rV), nameYMMReg(rG));
   23441             delta += alen;
   23442          }
   23443          assign( dV, getYMMReg(rV) );
   23444          breakupV256toV128s( dV, &d1, &d0 );
   23445          breakupV256toV128s( sV, &s1, &s0 );
   23446          putYMMReg( rG, binop(Iop_V128HLtoV256,
   23447                               mkexpr( math_HADDPS_128 ( d1, s1, isAdd ) ),
   23448                               mkexpr( math_HADDPS_128 ( d0, s0, isAdd ) ) ) );
   23449          *uses_vvvv = True;
   23450          goto decode_success;
   23451       }
      /* VHADDPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 7C /r */
      /* VHSUBPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 7D /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         IRTemp sV     = newTemp(Ity_V128);
         IRTemp dV     = newTemp(Ity_V128);
         Bool   isAdd  = opc == 0x7C;   /* 0x7C = HADD, 0x7D = HSUB */
         HChar* str    = isAdd ? "add" : "sub";
         UChar modrm   = getUChar(delta);
         UInt   rG     = gregOfRexRM(pfx,modrm);
         UInt   rV     = getVexNvvvv(pfx);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( sV, getXMMReg(rE) );
            DIP("vh%spd %s,%s,%s\n", str, nameXMMReg(rE),
                nameXMMReg(rV), nameXMMReg(rG));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            DIP("vh%spd %s,%s,%s\n", str, dis_buf,
                nameXMMReg(rV), nameXMMReg(rG));
            delta += alen;
         }
         assign( dV, getXMMReg(rV) );
         /* vvvv (dV) supplies the lower result lane; E (sV) the upper.
            Writing via putYMMRegLoAndZU zeroes ymm bits 255:128. */
         putYMMRegLoAndZU( rG, mkexpr( math_HADDPD_128 ( dV, sV, isAdd ) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VHADDPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 7C /r */
      /* VHSUBPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 7D /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         IRTemp sV     = newTemp(Ity_V256);
         IRTemp dV     = newTemp(Ity_V256);
         IRTemp s1, s0, d1, d0;
         Bool   isAdd  = opc == 0x7C;   /* 0x7C = HADD, 0x7D = HSUB */
         HChar* str    = isAdd ? "add" : "sub";
         UChar modrm   = getUChar(delta);
         UInt   rG     = gregOfRexRM(pfx,modrm);
         UInt   rV     = getVexNvvvv(pfx);
         s1 = s0 = d1 = d0 = IRTemp_INVALID;
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( sV, getYMMReg(rE) );
            DIP("vh%spd %s,%s,%s\n", str, nameYMMReg(rE),
                nameYMMReg(rV), nameYMMReg(rG));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
            DIP("vh%spd %s,%s,%s\n", str, dis_buf,
                nameYMMReg(rV), nameYMMReg(rG));
            delta += alen;
         }
         assign( dV, getYMMReg(rV) );
         /* The 256-bit op is two independent 128-bit ops, one per lane. */
         breakupV256toV128s( dV, &d1, &d0 );
         breakupV256toV128s( sV, &s1, &s0 );
         putYMMReg( rG, binop(Iop_V128HLtoV256,
                              mkexpr( math_HADDPD_128 ( d1, s1, isAdd ) ),
                              mkexpr( math_HADDPD_128 ( d0, s0, isAdd ) ) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   23515 
   23516    case 0x7E:
   23517       /* Note the Intel docs don't make sense for this.  I think they
   23518          are wrong.  They seem to imply it is a store when in fact I
   23519          think it is a load.  Also it's unclear whether this is W0, W1
   23520          or WIG. */
   23521       /* VMOVQ xmm2/m64, xmm1 = VEX.128.F3.0F.W0 7E /r */
   23522       if (haveF3no66noF2(pfx)
   23523           && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
   23524          vassert(sz == 4); /* even tho we are transferring 8, not 4. */
   23525          UChar modrm = getUChar(delta);
   23526          UInt  rG    = gregOfRexRM(pfx,modrm);
   23527          if (epartIsReg(modrm)) {
   23528             UInt rE = eregOfRexRM(pfx,modrm);
   23529             putXMMRegLane64( rG, 0, getXMMRegLane64( rE, 0 ));
   23530             DIP("vmovq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   23531             delta += 1;
   23532          } else {
   23533             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23534             putXMMRegLane64( rG, 0, loadLE(Ity_I64, mkexpr(addr)) );
   23535             DIP("vmovq %s,%s\n", dis_buf, nameXMMReg(rG));
   23536             delta += alen;
   23537          }
   23538          /* zero bits 255:64 */
   23539          putXMMRegLane64( rG, 1, mkU64(0) );
   23540          putYMMRegLane128( rG, 1, mkV128(0) );
   23541          goto decode_success;
   23542       }
   23543       /* VMOVQ xmm1, r64 = VEX.128.66.0F.W1 7E /r (reg case only) */
   23544       /* Moves from G to E, so is a store-form insn */
   23545       /* Intel docs list this in the VMOVD entry for some reason. */
   23546       if (have66noF2noF3(pfx)
   23547           && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
   23548          UChar modrm = getUChar(delta);
   23549          UInt  rG    = gregOfRexRM(pfx,modrm);
   23550          if (epartIsReg(modrm)) {
   23551             UInt rE = eregOfRexRM(pfx,modrm);
   23552             DIP("vmovq %s,%s\n", nameXMMReg(rG), nameIReg64(rE));
   23553             putIReg64(rE, getXMMRegLane64(rG, 0));
   23554             delta += 1;
   23555          } else {
   23556             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23557             storeLE( mkexpr(addr), getXMMRegLane64(rG, 0) );
   23558             DIP("vmovq %s,%s\n", dis_buf, nameXMMReg(rG));
   23559             delta += alen;
   23560          }
   23561          goto decode_success;
   23562       }
   23563       /* VMOVD xmm1, m32/r32 = VEX.128.66.0F.W0 7E /r (reg case only) */
   23564       /* Moves from G to E, so is a store-form insn */
   23565       if (have66noF2noF3(pfx)
   23566           && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
   23567          UChar modrm = getUChar(delta);
   23568          UInt  rG    = gregOfRexRM(pfx,modrm);
   23569          if (epartIsReg(modrm)) {
   23570             UInt rE = eregOfRexRM(pfx,modrm);
   23571             DIP("vmovd %s,%s\n", nameXMMReg(rG), nameIReg32(rE));
   23572             putIReg32(rE, getXMMRegLane32(rG, 0));
   23573             delta += 1;
   23574          } else {
   23575             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23576             storeLE( mkexpr(addr), getXMMRegLane32(rG, 0) );
   23577             DIP("vmovd %s,%s\n", dis_buf, nameXMMReg(rG));
   23578             delta += alen;
   23579          }
   23580          goto decode_success;
   23581       }
   23582       break;
   23583 
   case 0x7F:
      /* Store forms of VMOVDQA/VMOVDQU: G -> E. */
      /* VMOVDQA ymm1, ymm2/m256 = VEX.256.66.0F.WIG 7F */
      /* VMOVDQU ymm1, ymm2/m256 = VEX.256.F3.0F.WIG 7F */
      if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
          && 1==getVexL(pfx)/*256*/) {
         UChar  modrm = getUChar(delta);
         UInt   rS    = gregOfRexRM(pfx, modrm);
         IRTemp tS    = newTemp(Ity_V256);
         Bool   isA   = have66noF2noF3(pfx);   /* 66 => aligned (A) form */
         UChar  ch    = isA ? 'a' : 'u';
         assign(tS, getYMMReg(rS));
         if (epartIsReg(modrm)) {
            UInt rD = eregOfRexRM(pfx, modrm);
            delta += 1;
            putYMMReg(rD, mkexpr(tS));
            DIP("vmovdq%c %s,%s\n", ch, nameYMMReg(rS), nameYMMReg(rD));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            /* Only the aligned variant faults on misalignment. */
            if (isA)
               gen_SEGV_if_not_32_aligned(addr);
            storeLE(mkexpr(addr), mkexpr(tS));
            DIP("vmovdq%c %s,%s\n", ch, nameYMMReg(rS), dis_buf);
         }
         goto decode_success;
      }
      /* VMOVDQA xmm1, xmm2/m128 = VEX.128.66.0F.WIG 7F */
      /* VMOVDQU xmm1, xmm2/m128 = VEX.128.F3.0F.WIG 7F */
      if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
          && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   rS    = gregOfRexRM(pfx, modrm);
         IRTemp tS    = newTemp(Ity_V128);
         Bool   isA   = have66noF2noF3(pfx);   /* 66 => aligned (A) form */
         UChar  ch    = isA ? 'a' : 'u';
         assign(tS, getXMMReg(rS));
         if (epartIsReg(modrm)) {
            UInt rD = eregOfRexRM(pfx, modrm);
            delta += 1;
            /* Register destination: zero the upper ymm lane too. */
            putYMMRegLoAndZU(rD, mkexpr(tS));
            DIP("vmovdq%c %s,%s\n", ch, nameXMMReg(rS), nameXMMReg(rD));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            if (isA)
               gen_SEGV_if_not_16_aligned(addr);
            storeLE(mkexpr(addr), mkexpr(tS));
            DIP("vmovdq%c %s,%s\n", ch, nameXMMReg(rS), dis_buf);
         }
         goto decode_success;
      }
      break;
   23636 
   case 0xAE:
      /* 0F AE is a group opcode; /2 and /3 (memory forms only) are the
         MXCSR load/store.  Decode is delegated to the shared SSE
         helpers with isAvx set. */
      /* VSTMXCSR m32 = VEX.LZ.0F.WIG AE /3 */
      if (haveNo66noF2noF3(pfx)
          && 0==getVexL(pfx)/*LZ*/
          && 0==getRexW(pfx) /* be paranoid -- Intel docs don't require this */
          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 3
          && sz == 4) {
         delta = dis_STMXCSR(vbi, pfx, delta, True/*isAvx*/);
         goto decode_success;
      }
      /* VLDMXCSR m32 = VEX.LZ.0F.WIG AE /2 */
      if (haveNo66noF2noF3(pfx)
          && 0==getVexL(pfx)/*LZ*/
          && 0==getRexW(pfx) /* be paranoid -- Intel docs don't require this */
          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 2
          && sz == 4) {
         delta = dis_LDMXCSR(vbi, pfx, delta, True/*isAvx*/);
         goto decode_success;
      }
      break;
   23657 
   case 0xC2:
      /* VCMPxx family.  The trailing imm8 condition byte is consumed
         inside the dis_AVX*_cmp_V_E_to_G helpers; a helper signals
         decode failure by returning delta unchanged, in which case we
         fall through to try the next prefix combination. */
      /* VCMPSD xmm3/m64(E=argL), xmm2(V=argR), xmm1(G) */
      /* = VEX.NDS.LIG.F2.0F.WIG C2 /r ib */
      if (haveF2no66noF3(pfx)) {
         Long delta0 = delta;
         delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
                                          "vcmpsd", False/*!all_lanes*/,
                                          8/*sz*/);
         if (delta > delta0) goto decode_success;
         /* else fall through -- decoding has failed */
      }
      /* VCMPSS xmm3/m32(E=argL), xmm2(V=argR), xmm1(G) */
      /* = VEX.NDS.LIG.F3.0F.WIG C2 /r ib */
      if (haveF3no66noF2(pfx)) {
         Long delta0 = delta;
         delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
                                          "vcmpss", False/*!all_lanes*/,
                                          4/*sz*/);
         if (delta > delta0) goto decode_success;
         /* else fall through -- decoding has failed */
      }
      /* VCMPPD xmm3/m128(E=argL), xmm2(V=argR), xmm1(G) */
      /* = VEX.NDS.128.66.0F.WIG C2 /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Long delta0 = delta;
         delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
                                          "vcmppd", True/*all_lanes*/,
                                          8/*sz*/);
         if (delta > delta0) goto decode_success;
         /* else fall through -- decoding has failed */
      }
      /* VCMPPD ymm3/m256(E=argL), ymm2(V=argR), ymm1(G) */
      /* = VEX.NDS.256.66.0F.WIG C2 /r ib */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         Long delta0 = delta;
         delta = dis_AVX256_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
                                          "vcmppd", 8/*sz*/);
         if (delta > delta0) goto decode_success;
         /* else fall through -- decoding has failed */
      }
      /* VCMPPS xmm3/m128(E=argL), xmm2(V=argR), xmm1(G) */
      /* = VEX.NDS.128.0F.WIG C2 /r ib */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Long delta0 = delta;
         delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
                                          "vcmpps", True/*all_lanes*/,
                                          4/*sz*/);
         if (delta > delta0) goto decode_success;
         /* else fall through -- decoding has failed */
      }
      /* VCMPPS ymm3/m256(E=argL), ymm2(V=argR), ymm1(G) */
      /* = VEX.NDS.256.0F.WIG C2 /r ib */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         Long delta0 = delta;
         delta = dis_AVX256_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
                                          "vcmpps", 4/*sz*/);
         if (delta > delta0) goto decode_success;
         /* else fall through -- decoding has failed */
      }
      break;
   23718 
   case 0xC4:
      /* VPINSRW r32/m16, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG C4 /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         Int    imm8;
         IRTemp new16 = newTemp(Ity_I16);

         if ( epartIsReg( modrm ) ) {
            /* & 7: only word lanes 0..7 exist in a 128-bit reg. */
            imm8 = (Int)(getUChar(delta+1) & 7);
            assign( new16, unop(Iop_32to16,
                                getIReg32(eregOfRexRM(pfx,modrm))) );
            delta += 1+1;   /* modrm byte + imm8 */
            DIP( "vpinsrw $%d,%s,%s\n", imm8,
                 nameIReg32( eregOfRexRM(pfx, modrm) ), nameXMMReg(rG) );
         } else {
            /* Final '1' tells disAMode an imm8 follows the amode. */
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            imm8 = (Int)(getUChar(delta+alen) & 7);
            assign( new16, loadLE( Ity_I16, mkexpr(addr) ));
            delta += alen+1;
            DIP( "vpinsrw $%d,%s,%s\n",
                 imm8, dis_buf, nameXMMReg(rG) );
         }

         IRTemp src_vec = newTemp(Ity_V128);
         assign(src_vec, getXMMReg( rV ));
         IRTemp res_vec = math_PINSRW_128( src_vec, new16, imm8 );
         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   23752 
   case 0xC5:
      /* VPEXTRW imm8, xmm1, reg32 = VEX.128.66.0F.W0 C5 /r ib */
      if (have66noF2noF3(pfx)
         && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         Long delta0 = delta;
         /* Helper handles only the register E case; an unchanged delta
            means it declined to decode. */
         delta = dis_PEXTRW_128_EregOnly_toG( vbi, pfx, delta,
                                              True/*isAvx*/ );
         if (delta > delta0) goto decode_success;
         /* else fall through -- decoding has failed */
      }
      break;
   23764 
   case 0xC6:
      /* VSHUFPS imm8, xmm3/m128, xmm2, xmm1 */
      /* = VEX.NDS.128.0F.WIG C6 /r ib */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Int    imm8 = 0;
         IRTemp eV   = newTemp(Ity_V128);
         IRTemp vV   = newTemp(Ity_V128);
         UInt  modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         UInt  rV    = getVexNvvvv(pfx);
         assign( vV, getXMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getXMMReg(rE) );
            imm8 = (Int)getUChar(delta+1);
            delta += 1+1;   /* modrm byte + imm8 */
            DIP("vshufps $%d,%s,%s,%s\n",
                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
         } else {
            /* Final '1' tells disAMode an imm8 follows the amode. */
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
            imm8 = (Int)getUChar(delta+alen);
            delta += 1+alen;
            DIP("vshufps $%d,%s,%s,%s\n",
                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
         }
         IRTemp res = math_SHUFPS_128( eV, vV, imm8 );
         putYMMRegLoAndZU( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VSHUFPS imm8, ymm3/m256, ymm2, ymm1 */
      /* = VEX.NDS.256.0F.WIG C6 /r ib */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         Int    imm8 = 0;
         IRTemp eV   = newTemp(Ity_V256);
         IRTemp vV   = newTemp(Ity_V256);
         UInt  modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         UInt  rV    = getVexNvvvv(pfx);
         assign( vV, getYMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getYMMReg(rE) );
            imm8 = (Int)getUChar(delta+1);
            delta += 1+1;
            DIP("vshufps $%d,%s,%s,%s\n",
                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
            imm8 = (Int)getUChar(delta+alen);
            delta += 1+alen;
            DIP("vshufps $%d,%s,%s,%s\n",
                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
         }
         IRTemp res = math_SHUFPS_256( eV, vV, imm8 );
         putYMMReg( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VSHUFPD imm8, xmm3/m128, xmm2, xmm1 */
      /* = VEX.NDS.128.66.0F.WIG C6 /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Int    imm8 = 0;
         IRTemp eV   = newTemp(Ity_V128);
         IRTemp vV   = newTemp(Ity_V128);
         UInt  modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         UInt  rV    = getVexNvvvv(pfx);
         assign( vV, getXMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getXMMReg(rE) );
            imm8 = (Int)getUChar(delta+1);
            delta += 1+1;
            DIP("vshufpd $%d,%s,%s,%s\n",
                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
            imm8 = (Int)getUChar(delta+alen);
            delta += 1+alen;
            DIP("vshufpd $%d,%s,%s,%s\n",
                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
         }
         IRTemp res = math_SHUFPD_128( eV, vV, imm8 );
         putYMMRegLoAndZU( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VSHUFPD imm8, ymm3/m256, ymm2, ymm1 */
      /* = VEX.NDS.256.66.0F.WIG C6 /r ib */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         Int    imm8 = 0;
         IRTemp eV   = newTemp(Ity_V256);
         IRTemp vV   = newTemp(Ity_V256);
         UInt  modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         UInt  rV    = getVexNvvvv(pfx);
         assign( vV, getYMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getYMMReg(rE) );
            imm8 = (Int)getUChar(delta+1);
            delta += 1+1;
            DIP("vshufpd $%d,%s,%s,%s\n",
                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
            imm8 = (Int)getUChar(delta+alen);
            delta += 1+alen;
            DIP("vshufpd $%d,%s,%s,%s\n",
                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
         }
         IRTemp res = math_SHUFPD_256( eV, vV, imm8 );
         putYMMReg( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   23887 
   case 0xD0:
      /* ADDSUB: subtract in even lanes, add in odd lanes; the per-lane
         arithmetic lives in the math_ADDSUB* helpers. */
      /* VADDSUBPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D0 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vaddsubpd", math_ADDSUBPD_128 );
         goto decode_success;
      }
      /* VADDSUBPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D0 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vaddsubpd", math_ADDSUBPD_256 );
         goto decode_success;
      }
      /* VADDSUBPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.F2.0F.WIG D0 /r */
      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vaddsubps", math_ADDSUBPS_128 );
         goto decode_success;
      }
      /* VADDSUBPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.F2.0F.WIG D0 /r */
      if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vaddsubps", math_ADDSUBPS_256 );
         goto decode_success;
      }
      break;
   23918 
   case 0xD1:
      /* Logical right shifts; the shift amount is taken from the low
         64 bits of the E operand (dis_AVX128_shiftV_byE). */
      /* VPSRLW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D1 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
                                        "vpsrlw", Iop_ShrN16x8 );
         *uses_vvvv = True;
         goto decode_success;

      }
      break;

   case 0xD2:
      /* VPSRLD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D2 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
                                        "vpsrld", Iop_ShrN32x4 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0xD3:
      /* VPSRLQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D3 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
                                        "vpsrlq", Iop_ShrN64x2 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   23949 
   case 0xD4:
      /* VPADDQ r/m, rV, r ::: r = rV + r/m */
      /* VPADDQ = VEX.NDS.128.66.0F.WIG D4 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpaddq", Iop_Add64x2 );
         goto decode_success;
      }
      break;

   case 0xD5:
      /* VPMULLW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D5 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpmullw", Iop_Mul16x8 );
         goto decode_success;
      }
      break;
   23968 
   case 0xD6:
      /* I can't even find any Intel docs for this one. */
      /* Basically: 66 0F D6 = MOVQ -- move 64 bits from G (lo half
         xmm) to E (mem or lo half xmm).  Looks like L==0(128), W==0
         (WIG, maybe?) */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0==getRexW(pfx)/*this might be redundant, dunno*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         if (epartIsReg(modrm)) {
            /* fall through, awaiting test case */
            /* dst: lo half copied, hi half zeroed */
            /* NOTE: reg-reg form deliberately left undecoded (falls
               out of the case, i.e. decode failure) until a test case
               turns up. */
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            storeLE( mkexpr(addr), getXMMRegLane64( rG, 0 ));
            DIP("vmovq %s,%s\n", nameXMMReg(rG), dis_buf );
            delta += alen;
            goto decode_success;
         }
      }
      break;
   23990 
   case 0xD7:
      /* VEX.128.66.0F.WIG D7 /r = VPMOVMSKB xmm1, r32 */
      /* Gathers the top bit of each byte lane into a 16-bit mask. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVMSKB_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      break;
   23998 
   23999    case 0xD8:
   24000       /* VPSUBUSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D8 /r */
   24001       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24002          delta = dis_AVX128_E_V_to_G(
   24003                     uses_vvvv, vbi, pfx, delta, "vpsubusb", Iop_QSub8Ux16 );
   24004          goto decode_success;
   24005       }
   24006      break;
   24007 
   case 0xD9:
      /* VPSUBUSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D9 /r */
      /* Saturating unsigned word subtract. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpsubusw", Iop_QSub16Ux8 );
         goto decode_success;
      }
      break;

   case 0xDA:
      /* VPMINUB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DA /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpminub", Iop_Min8Ux16 );
         goto decode_success;
      }
      break;

   case 0xDB:
      /* VPAND r/m, rV, r ::: r = rV & r/m */
      /* VEX.NDS.128.66.0F.WIG DB /r = VPAND xmm3/m128, xmm2, xmm1 */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpand", Iop_AndV128 );
         goto decode_success;
      }
      break;

   case 0xDC:
      /* VPADDUSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DC /r */
      /* Saturating unsigned byte add. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpaddusb", Iop_QAdd8Ux16 );
         goto decode_success;
      }
      break;

   case 0xDD:
      /* VPADDUSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DD /r */
      /* Saturating unsigned word add. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpaddusw", Iop_QAdd16Ux8 );
         goto decode_success;
      }
      break;

   case 0xDE:
      /* VPMAXUB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DE /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpmaxub", Iop_Max8Ux16 );
         goto decode_success;
      }
      break;

   case 0xDF:
      /* VPANDN r/m, rV, r ::: r = rV & ~r/m (is that correct, re the ~ ?) */
      /* VEX.NDS.128.66.0F.WIG DF /r = VPANDN xmm3/m128, xmm2, xmm1 */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpandn", Iop_AndV128,
                    NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
         goto decode_success;
      }
      break;

   case 0xE0:
      /* VPAVGB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E0 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpavgb", Iop_Avg8Ux16 );
         goto decode_success;
      }
      break;

   case 0xE1:
      /* Arithmetic right shifts; amount from low 64 bits of E. */
      /* VPSRAW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E1 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
                                        "vpsraw", Iop_SarN16x8 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0xE2:
      /* VPSRAD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E2 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
                                        "vpsrad", Iop_SarN32x4 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0xE3:
      /* VPAVGW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E3 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpavgw", Iop_Avg16Ux8 );
         goto decode_success;
      }
      break;

   case 0xE4:
      /* VPMULHUW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E4 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpmulhuw", Iop_MulHi16Ux8 );
         goto decode_success;
      }
      break;
   24120 
   24121    case 0xE5:
   24122       /* VPMULHW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E5 /r */
   24123       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24124          delta = dis_AVX128_E_V_to_G(
   24125                     uses_vvvv, vbi, pfx, delta, "vpmulhw", Iop_MulHi16Sx8 );
   24126          goto decode_success;
   24127       }
   24128       break;
   24129 
   24130    case 0xE6:
   24131       /* VCVTDQ2PD xmm2/m64, xmm1 = VEX.128.F3.0F.WIG E6 /r */
   24132       if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
   24133          delta = dis_CVTDQ2PD_128(vbi, pfx, delta, True/*isAvx*/);
   24134          goto decode_success;
   24135       }
   24136       /* VCVTDQ2PD xmm2/m128, ymm1 = VEX.256.F3.0F.WIG E6 /r */
   24137       if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
   24138          delta = dis_CVTDQ2PD_256(vbi, pfx, delta);
   24139          goto decode_success;
   24140       }
   24141       /* VCVTTPD2DQ xmm2/m128, xmm1 = VEX.128.66.0F.WIG E6 /r */
   24142       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24143          delta = dis_CVTxPD2DQ_128(vbi, pfx, delta, True/*isAvx*/,
   24144                                    True/*r2zero*/);
   24145          goto decode_success;
   24146       }
   24147       /* VCVTTPD2DQ ymm2/m256, xmm1 = VEX.256.66.0F.WIG E6 /r */
   24148       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24149          delta = dis_CVTxPD2DQ_256(vbi, pfx, delta, True/*r2zero*/);
   24150          goto decode_success;
   24151       }
   24152       /* VCVTPD2DQ xmm2/m128, xmm1 = VEX.128.F2.0F.WIG E6 /r */
   24153       if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24154          delta = dis_CVTxPD2DQ_128(vbi, pfx, delta, True/*isAvx*/,
   24155                                    False/*!r2zero*/);
   24156          goto decode_success;
   24157       }
   24158       /* VCVTPD2DQ ymm2/m256, xmm1 = VEX.256.F2.0F.WIG E6 /r */
   24159       if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24160          delta = dis_CVTxPD2DQ_256(vbi, pfx, delta, False/*!r2zero*/);
   24161          goto decode_success;
   24162       }
   24163       break;
   24164 
   24165    case 0xE7:
   24166       /* VMOVNTDQ xmm1, m128 = VEX.128.66.0F.WIG E7 /r */
   24167       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24168          UChar modrm = getUChar(delta);
   24169          UInt rG     = gregOfRexRM(pfx,modrm);
   24170          if (!epartIsReg(modrm)) {
   24171             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24172             gen_SEGV_if_not_16_aligned( addr );
   24173             storeLE( mkexpr(addr), getXMMReg(rG) );
   24174             DIP("vmovntdq %s,%s\n", dis_buf, nameXMMReg(rG));
   24175             delta += alen;
   24176             goto decode_success;
   24177          }
   24178          /* else fall through */
   24179       }
   24180       /* VMOVNTDQ ymm1, m256 = VEX.256.66.0F.WIG E7 /r */
   24181       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24182          UChar modrm = getUChar(delta);
   24183          UInt rG     = gregOfRexRM(pfx,modrm);
   24184          if (!epartIsReg(modrm)) {
   24185             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24186             gen_SEGV_if_not_32_aligned( addr );
   24187             storeLE( mkexpr(addr), getYMMReg(rG) );
   24188             DIP("vmovntdq %s,%s\n", dis_buf, nameYMMReg(rG));
   24189             delta += alen;
   24190             goto decode_success;
   24191          }
   24192          /* else fall through */
   24193       }
   24194       break;
   24195 
   24196    case 0xE8:
   24197       /* VPSUBSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E8 /r */
   24198       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24199          delta = dis_AVX128_E_V_to_G(
   24200                     uses_vvvv, vbi, pfx, delta, "vpsubsb", Iop_QSub8Sx16 );
   24201          goto decode_success;
   24202       }
   24203       break;
   24204 
   24205    case 0xE9:
   24206       /* VPSUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E9 /r */
   24207       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24208          delta = dis_AVX128_E_V_to_G(
   24209                     uses_vvvv, vbi, pfx, delta, "vpsubsw", Iop_QSub16Sx8 );
   24210          goto decode_success;
   24211       }
   24212       break;
   24213 
   24214    case 0xEA:
   24215       /* VPMINSW r/m, rV, r ::: r = min-signed16s(rV, r/m) */
   24216       /* VPMINSW = VEX.NDS.128.66.0F.WIG EA /r */
   24217       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24218          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   24219                     uses_vvvv, vbi, pfx, delta, "vpminsw", Iop_Min16Sx8 );
   24220          goto decode_success;
   24221       }
   24222       break;
   24223 
   24224    case 0xEB:
   24225       /* VPOR r/m, rV, r ::: r = rV | r/m */
   24226       /* VPOR = VEX.NDS.128.66.0F.WIG EB /r */
   24227       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24228          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   24229                     uses_vvvv, vbi, pfx, delta, "vpor", Iop_OrV128 );
   24230          goto decode_success;
   24231       }
   24232       break;
   24233 
   24234    case 0xEC:
   24235       /* VPADDSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG EC /r */
   24236       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24237          delta = dis_AVX128_E_V_to_G(
   24238                     uses_vvvv, vbi, pfx, delta, "vpaddsb", Iop_QAdd8Sx16 );
   24239          goto decode_success;
   24240       }
   24241       break;
   24242 
   24243    case 0xED:
   24244       /* VPADDSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG ED /r */
   24245       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24246          delta = dis_AVX128_E_V_to_G(
   24247                     uses_vvvv, vbi, pfx, delta, "vpaddsw", Iop_QAdd16Sx8 );
   24248          goto decode_success;
   24249       }
   24250       break;
   24251 
   24252    case 0xEE:
   24253       /* VPMAXSW r/m, rV, r ::: r = max-signed16s(rV, r/m) */
   24254       /* VPMAXSW = VEX.NDS.128.66.0F.WIG EE /r */
   24255       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24256          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   24257                     uses_vvvv, vbi, pfx, delta, "vpmaxsw", Iop_Max16Sx8 );
   24258          goto decode_success;
   24259       }
   24260       break;
   24261 
   24262    case 0xEF:
   24263       /* VPXOR r/m, rV, r ::: r = rV ^ r/m */
   24264       /* VPXOR = VEX.NDS.128.66.0F.WIG EF /r */
   24265       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24266          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   24267                     uses_vvvv, vbi, pfx, delta, "vpxor", Iop_XorV128 );
   24268          goto decode_success;
   24269       }
   24270       break;
   24271 
   24272    case 0xF0:
   24273       /* VLDDQU m256, ymm1 = VEX.256.F2.0F.WIG F0 /r */
   24274       if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24275          UChar  modrm = getUChar(delta);
   24276          UInt   rD    = gregOfRexRM(pfx, modrm);
   24277          IRTemp tD    = newTemp(Ity_V256);
   24278          if (epartIsReg(modrm)) break;
   24279          addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   24280          delta += alen;
   24281          assign(tD, loadLE(Ity_V256, mkexpr(addr)));
   24282          DIP("vlddqu %s,%s\n", dis_buf, nameYMMReg(rD));
   24283          putYMMReg(rD, mkexpr(tD));
   24284          goto decode_success;
   24285       }
   24286       /* VLDDQU m128, xmm1 = VEX.128.F2.0F.WIG F0 /r */
   24287       if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24288          UChar  modrm = getUChar(delta);
   24289          UInt   rD    = gregOfRexRM(pfx, modrm);
   24290          IRTemp tD    = newTemp(Ity_V128);
   24291          if (epartIsReg(modrm)) break;
   24292          addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   24293          delta += alen;
   24294          assign(tD, loadLE(Ity_V128, mkexpr(addr)));
   24295          DIP("vlddqu %s,%s\n", dis_buf, nameXMMReg(rD));
   24296          putYMMRegLoAndZU(rD, mkexpr(tD));
   24297          goto decode_success;
   24298       }
   24299       break;
   24300 
   24301    case 0xF1:
   24302       /* VPSLLW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F1 /r */
   24303       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24304          delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
   24305                                         "vpsllw", Iop_ShlN16x8 );
   24306          *uses_vvvv = True;
   24307          goto decode_success;
   24308 
   24309       }
   24310       break;
   24311 
   24312    case 0xF2:
   24313       /* VPSLLD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F2 /r */
   24314       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24315          delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
   24316                                         "vpslld", Iop_ShlN32x4 );
   24317          *uses_vvvv = True;
   24318          goto decode_success;
   24319       }
   24320       break;
   24321 
   24322    case 0xF3:
   24323       /* VPSLLQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F3 /r */
   24324       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24325          delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
   24326                                         "vpsllq", Iop_ShlN64x2 );
   24327          *uses_vvvv = True;
   24328          goto decode_success;
   24329       }
   24330       break;
   24331 
   24332    case 0xF4:
   24333       /* VPMULUDQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F4 /r */
   24334       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24335          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
   24336                     uses_vvvv, vbi, pfx, delta,
   24337                     "vpmuludq", math_PMULUDQ_128 );
   24338          goto decode_success;
   24339       }
   24340       break;
   24341 
   24342    case 0xF5:
   24343       /* VPMADDWD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F5 /r */
   24344       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24345          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
   24346                     uses_vvvv, vbi, pfx, delta,
   24347                     "vpmaddwd", math_PMADDWD_128 );
   24348          goto decode_success;
   24349       }
   24350       break;
   24351 
   24352    case 0xF6:
   24353       /* VPSADBW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F6 /r */
   24354       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24355          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
   24356                     uses_vvvv, vbi, pfx, delta,
   24357                     "vpsadbw", math_PSADBW_128 );
   24358          goto decode_success;
   24359       }
   24360       break;
   24361 
   24362    case 0xF7:
   24363       /* VMASKMOVDQU xmm2, xmm1 = VEX.128.66.0F.WIG F7 /r */
   24364       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
   24365           && epartIsReg(getUChar(delta))) {
   24366          delta = dis_MASKMOVDQU( vbi, pfx, delta, True/*isAvx*/ );
   24367          goto decode_success;
   24368       }
   24369       break;
   24370 
   24371    case 0xF8:
   24372       /* VPSUBB r/m, rV, r ::: r = rV - r/m */
   24373       /* VPSUBB = VEX.NDS.128.66.0F.WIG F8 /r */
   24374       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24375          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   24376                     uses_vvvv, vbi, pfx, delta, "vpsubb", Iop_Sub8x16 );
   24377          goto decode_success;
   24378       }
   24379       break;
   24380 
   24381    case 0xF9:
   24382       /* VPSUBW r/m, rV, r ::: r = rV - r/m */
   24383       /* VPSUBW = VEX.NDS.128.66.0F.WIG F9 /r */
   24384       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24385          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   24386                     uses_vvvv, vbi, pfx, delta, "vpsubw", Iop_Sub16x8 );
   24387          goto decode_success;
   24388       }
   24389       break;
   24390 
   24391    case 0xFA:
   24392       /* VPSUBD r/m, rV, r ::: r = rV - r/m */
   24393       /* VPSUBD = VEX.NDS.128.66.0F.WIG FA /r */
   24394       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24395          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   24396                     uses_vvvv, vbi, pfx, delta, "vpsubd", Iop_Sub32x4 );
   24397          goto decode_success;
   24398       }
   24399       break;
   24400 
   24401    case 0xFB:
   24402       /* VPSUBQ r/m, rV, r ::: r = rV - r/m */
   24403       /* VPSUBQ = VEX.NDS.128.66.0F.WIG FB /r */
   24404       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24405          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   24406                     uses_vvvv, vbi, pfx, delta, "vpsubq", Iop_Sub64x2 );
   24407          goto decode_success;
   24408       }
   24409       break;
   24410 
   24411    case 0xFC:
   24412       /* VPADDB r/m, rV, r ::: r = rV + r/m */
   24413       /* VPADDB = VEX.NDS.128.66.0F.WIG FC /r */
   24414       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24415          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   24416                     uses_vvvv, vbi, pfx, delta, "vpaddb", Iop_Add8x16 );
   24417          goto decode_success;
   24418       }
   24419       break;
   24420 
   24421    case 0xFD:
   24422       /* VPADDW r/m, rV, r ::: r = rV + r/m */
   24423       /* VPADDW = VEX.NDS.128.66.0F.WIG FD /r */
   24424       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24425          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   24426                     uses_vvvv, vbi, pfx, delta, "vpaddw", Iop_Add16x8 );
   24427          goto decode_success;
   24428       }
   24429       break;
   24430 
   24431    case 0xFE:
   24432       /* VPADDD r/m, rV, r ::: r = rV + r/m */
   24433       /* VPADDD = VEX.NDS.128.66.0F.WIG FE /r */
   24434       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24435          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   24436                     uses_vvvv, vbi, pfx, delta, "vpaddd", Iop_Add32x4 );
   24437          goto decode_success;
   24438       }
   24439       break;
   24440 
   24441    default:
   24442       break;
   24443 
   24444    }
   24445 
   24446   //decode_failure:
   24447    return deltaIN;
   24448 
   24449   decode_success:
   24450    return delta;
   24451 }
   24452 
   24453 
   24454 /*------------------------------------------------------------*/
   24455 /*---                                                      ---*/
   24456 /*--- Top-level post-escape decoders: dis_ESC_0F38__VEX    ---*/
   24457 /*---                                                      ---*/
   24458 /*------------------------------------------------------------*/
   24459 
   24460 static IRTemp math_PERMILPS_VAR_128 ( IRTemp dataV, IRTemp ctrlV )
   24461 {
   24462    /* In the control vector, zero out all but the bottom two bits of
   24463       each 32-bit lane. */
   24464    IRExpr* cv1 = binop(Iop_ShrN32x4,
   24465                        binop(Iop_ShlN32x4, mkexpr(ctrlV), mkU8(30)),
   24466                        mkU8(30));
   24467    /* And use the resulting cleaned-up control vector as steering
   24468       in a Perm operation. */
   24469    IRTemp res = newTemp(Ity_V128);
   24470    assign(res, binop(Iop_Perm32x4, mkexpr(dataV), cv1));
   24471    return res;
   24472 }
   24473 
   24474 static IRTemp math_PERMILPS_VAR_256 ( IRTemp dataV, IRTemp ctrlV )
   24475 {
   24476    IRTemp dHi, dLo, cHi, cLo;
   24477    dHi = dLo = cHi = cLo = IRTemp_INVALID;
   24478    breakupV256toV128s( dataV, &dHi, &dLo );
   24479    breakupV256toV128s( ctrlV, &cHi, &cLo );
   24480    IRTemp rHi = math_PERMILPS_VAR_128( dHi, cHi );
   24481    IRTemp rLo = math_PERMILPS_VAR_128( dLo, cLo );
   24482    IRTemp res = newTemp(Ity_V256);
   24483    assign(res, binop(Iop_V128HLtoV256, mkexpr(rHi), mkexpr(rLo)));
   24484    return res;
   24485 }
   24486 
   24487 static IRTemp math_PERMILPD_VAR_128 ( IRTemp dataV, IRTemp ctrlV )
   24488 {
   24489    /* No cleverness here .. */
   24490    IRTemp dHi, dLo, cHi, cLo;
   24491    dHi = dLo = cHi = cLo = IRTemp_INVALID;
   24492    breakupV128to64s( dataV, &dHi, &dLo );
   24493    breakupV128to64s( ctrlV, &cHi, &cLo );
   24494    IRExpr* rHi
   24495       = IRExpr_Mux0X( unop(Iop_64to8,
   24496                            binop(Iop_And64, mkexpr(cHi), mkU64(2))),
   24497                       mkexpr(dLo), mkexpr(dHi) );
   24498    IRExpr* rLo
   24499       = IRExpr_Mux0X( unop(Iop_64to8,
   24500                            binop(Iop_And64, mkexpr(cLo), mkU64(2))),
   24501                       mkexpr(dLo), mkexpr(dHi) );
   24502    IRTemp res = newTemp(Ity_V128);
   24503    assign(res, binop(Iop_64HLtoV128, rHi, rLo));
   24504    return res;
   24505 }
   24506 
   24507 static IRTemp math_PERMILPD_VAR_256 ( IRTemp dataV, IRTemp ctrlV )
   24508 {
   24509    IRTemp dHi, dLo, cHi, cLo;
   24510    dHi = dLo = cHi = cLo = IRTemp_INVALID;
   24511    breakupV256toV128s( dataV, &dHi, &dLo );
   24512    breakupV256toV128s( ctrlV, &cHi, &cLo );
   24513    IRTemp rHi = math_PERMILPD_VAR_128( dHi, cHi );
   24514    IRTemp rLo = math_PERMILPD_VAR_128( dLo, cLo );
   24515    IRTemp res = newTemp(Ity_V256);
   24516    assign(res, binop(Iop_V128HLtoV256, mkexpr(rHi), mkexpr(rLo)));
   24517    return res;
   24518 }
   24519 
   24520 __attribute__((noinline))
   24521 static
   24522 Long dis_ESC_0F38__VEX (
   24523         /*MB_OUT*/DisResult* dres,
   24524         /*OUT*/   Bool*      uses_vvvv,
   24525         Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
   24526         Bool         resteerCisOk,
   24527         void*        callback_opaque,
   24528         VexArchInfo* archinfo,
   24529         VexAbiInfo*  vbi,
   24530         Prefix pfx, Int sz, Long deltaIN
   24531      )
   24532 {
   24533    IRTemp addr  = IRTemp_INVALID;
   24534    Int    alen  = 0;
   24535    HChar  dis_buf[50];
   24536    Long   delta = deltaIN;
   24537    UChar  opc   = getUChar(delta);
   24538    delta++;
   24539    *uses_vvvv = False;
   24540 
   24541    switch (opc) {
   24542 
   24543    case 0x00:
   24544       /* VPSHUFB r/m, rV, r ::: r = shuf(rV, r/m) */
   24545       /* VPSHUFB = VEX.NDS.128.66.0F38.WIG 00 /r */
   24546       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24547          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
   24548                     uses_vvvv, vbi, pfx, delta, "vpshufb", math_PSHUFB_XMM );
   24549          goto decode_success;
   24550       }
   24551       break;
   24552 
   24553    case 0x01:
   24554    case 0x02:
   24555    case 0x03:
   24556       /* VPHADDW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 01 /r */
   24557       /* VPHADDD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 02 /r */
   24558       /* VPHADDSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 03 /r */
   24559       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24560          delta = dis_PHADD_128( vbi, pfx, delta, True/*isAvx*/, opc );
   24561          *uses_vvvv = True;
   24562          goto decode_success;
   24563       }
   24564       break;
   24565 
   24566    case 0x04:
   24567       /* VPMADDUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 04 /r */
   24568       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24569          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
   24570                     uses_vvvv, vbi, pfx, delta, "vpmaddubsw",
   24571                     math_PMADDUBSW_128 );
   24572          goto decode_success;
   24573       }
   24574       break;
   24575 
   24576    case 0x05:
   24577    case 0x06:
   24578    case 0x07:
   24579       /* VPHSUBW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 05 /r */
   24580       /* VPHSUBD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 06 /r */
   24581       /* VPHSUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 07 /r */
   24582       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24583          delta = dis_PHADD_128( vbi, pfx, delta, True/*isAvx*/, opc );
   24584          *uses_vvvv = True;
   24585          goto decode_success;
   24586       }
   24587       break;
   24588 
   24589    case 0x08:
   24590    case 0x09:
   24591    case 0x0A:
   24592       /* VPSIGNB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 08 /r */
   24593       /* VPSIGNW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 09 /r */
   24594       /* VPSIGND xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 0A /r */
   24595       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24596          IRTemp sV      = newTemp(Ity_V128);
   24597          IRTemp dV      = newTemp(Ity_V128);
   24598          IRTemp sHi, sLo, dHi, dLo;
   24599          sHi = sLo = dHi = dLo = IRTemp_INVALID;
   24600          UChar  ch      = '?';
   24601          Int    laneszB = 0;
   24602          UChar  modrm   = getUChar(delta);
   24603          UInt   rG      = gregOfRexRM(pfx,modrm);
   24604          UInt   rV      = getVexNvvvv(pfx);
   24605 
   24606          switch (opc) {
   24607             case 0x08: laneszB = 1; ch = 'b'; break;
   24608             case 0x09: laneszB = 2; ch = 'w'; break;
   24609             case 0x0A: laneszB = 4; ch = 'd'; break;
   24610             default: vassert(0);
   24611          }
   24612 
   24613          assign( dV, getXMMReg(rV) );
   24614 
   24615          if (epartIsReg(modrm)) {
   24616             UInt rE = eregOfRexRM(pfx,modrm);
   24617             assign( sV, getXMMReg(rE) );
   24618             delta += 1;
   24619             DIP("vpsign%c %s,%s,%s\n", ch, nameXMMReg(rE),
   24620                 nameXMMReg(rV), nameXMMReg(rG));
   24621          } else {
   24622             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24623             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   24624             delta += alen;
   24625             DIP("vpsign%c %s,%s,%s\n", ch, dis_buf,
   24626                 nameXMMReg(rV), nameXMMReg(rG));
   24627          }
   24628 
   24629          breakupV128to64s( dV, &dHi, &dLo );
   24630          breakupV128to64s( sV, &sHi, &sLo );
   24631 
   24632          putYMMRegLoAndZU(
   24633             rG,
   24634             binop(Iop_64HLtoV128,
   24635                   dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
   24636                   dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
   24637             )
   24638          );
   24639          *uses_vvvv = True;
   24640          goto decode_success;
   24641       }
   24642       break;
   24643 
   24644    case 0x0B:
   24645       /* VPMULHRSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 0B /r */
   24646       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24647          IRTemp sV      = newTemp(Ity_V128);
   24648          IRTemp dV      = newTemp(Ity_V128);
   24649          IRTemp sHi, sLo, dHi, dLo;
   24650          sHi = sLo = dHi = dLo = IRTemp_INVALID;
   24651          UChar  modrm   = getUChar(delta);
   24652          UInt   rG      = gregOfRexRM(pfx,modrm);
   24653          UInt   rV      = getVexNvvvv(pfx);
   24654 
   24655          assign( dV, getXMMReg(rV) );
   24656 
   24657          if (epartIsReg(modrm)) {
   24658             UInt rE = eregOfRexRM(pfx,modrm);
   24659             assign( sV, getXMMReg(rE) );
   24660             delta += 1;
   24661             DIP("vpmulhrsw %s,%s,%s\n", nameXMMReg(rE),
   24662                 nameXMMReg(rV), nameXMMReg(rG));
   24663          } else {
   24664             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24665             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   24666             delta += alen;
   24667             DIP("vpmulhrsw %s,%s,%s\n", dis_buf,
   24668                 nameXMMReg(rV), nameXMMReg(rG));
   24669          }
   24670 
   24671          breakupV128to64s( dV, &dHi, &dLo );
   24672          breakupV128to64s( sV, &sHi, &sLo );
   24673 
   24674          putYMMRegLoAndZU(
   24675             rG,
   24676             binop(Iop_64HLtoV128,
   24677                   dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
   24678                   dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
   24679             )
   24680          );
   24681          *uses_vvvv = True;
   24682          goto decode_success;
   24683       }
   24684       break;
   24685 
   24686    case 0x0C:
   24687       /* VPERMILPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 0C /r */
   24688       if (have66noF2noF3(pfx)
   24689           && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
   24690          UChar  modrm = getUChar(delta);
   24691          UInt   rG    = gregOfRexRM(pfx, modrm);
   24692          UInt   rV    = getVexNvvvv(pfx);
   24693          IRTemp ctrlV = newTemp(Ity_V128);
   24694          if (epartIsReg(modrm)) {
   24695             UInt rE = eregOfRexRM(pfx, modrm);
   24696             delta += 1;
   24697             DIP("vpermilps %s,%s,%s\n",
   24698                 nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   24699             assign(ctrlV, getXMMReg(rE));
   24700          } else {
   24701             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   24702             delta += alen;
   24703             DIP("vpermilps %s,%s,%s\n",
   24704                 dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   24705             assign(ctrlV, loadLE(Ity_V128, mkexpr(addr)));
   24706          }
   24707          IRTemp dataV = newTemp(Ity_V128);
   24708          assign(dataV, getXMMReg(rV));
   24709          IRTemp resV = math_PERMILPS_VAR_128(dataV, ctrlV);
   24710          putYMMRegLoAndZU(rG, mkexpr(resV));
   24711          *uses_vvvv = True;
   24712          goto decode_success;
   24713       }
   24714       /* VPERMILPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 0C /r */
   24715       if (have66noF2noF3(pfx)
   24716           && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
   24717          UChar  modrm = getUChar(delta);
   24718          UInt   rG    = gregOfRexRM(pfx, modrm);
   24719          UInt   rV    = getVexNvvvv(pfx);
   24720          IRTemp ctrlV = newTemp(Ity_V256);
   24721          if (epartIsReg(modrm)) {
   24722             UInt rE = eregOfRexRM(pfx, modrm);
   24723             delta += 1;
   24724             DIP("vpermilps %s,%s,%s\n",
   24725                 nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
   24726             assign(ctrlV, getYMMReg(rE));
   24727          } else {
   24728             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   24729             delta += alen;
   24730             DIP("vpermilps %s,%s,%s\n",
   24731                 dis_buf, nameYMMReg(rV), nameYMMReg(rG));
   24732             assign(ctrlV, loadLE(Ity_V256, mkexpr(addr)));
   24733          }
   24734          IRTemp dataV = newTemp(Ity_V256);
   24735          assign(dataV, getYMMReg(rV));
   24736          IRTemp resV = math_PERMILPS_VAR_256(dataV, ctrlV);
   24737          putYMMReg(rG, mkexpr(resV));
   24738          *uses_vvvv = True;
   24739          goto decode_success;
   24740       }
   24741       break;
   24742 
   24743    case 0x0D:
   24744       /* VPERMILPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 0D /r */
   24745       if (have66noF2noF3(pfx)
   24746           && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
   24747          UChar  modrm = getUChar(delta);
   24748          UInt   rG    = gregOfRexRM(pfx, modrm);
   24749          UInt   rV    = getVexNvvvv(pfx);
   24750          IRTemp ctrlV = newTemp(Ity_V128);
   24751          if (epartIsReg(modrm)) {
   24752             UInt rE = eregOfRexRM(pfx, modrm);
   24753             delta += 1;
   24754             DIP("vpermilpd %s,%s,%s\n",
   24755                 nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   24756             assign(ctrlV, getXMMReg(rE));
   24757          } else {
   24758             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   24759             delta += alen;
   24760             DIP("vpermilpd %s,%s,%s\n",
   24761                 dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   24762             assign(ctrlV, loadLE(Ity_V128, mkexpr(addr)));
   24763          }
   24764          IRTemp dataV = newTemp(Ity_V128);
   24765          assign(dataV, getXMMReg(rV));
   24766          IRTemp resV = math_PERMILPD_VAR_128(dataV, ctrlV);
   24767          putYMMRegLoAndZU(rG, mkexpr(resV));
   24768          *uses_vvvv = True;
   24769          goto decode_success;
   24770       }
   24771       /* VPERMILPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 0D /r */
   24772       if (have66noF2noF3(pfx)
   24773           && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
   24774          UChar  modrm = getUChar(delta);
   24775          UInt   rG    = gregOfRexRM(pfx, modrm);
   24776          UInt   rV    = getVexNvvvv(pfx);
   24777          IRTemp ctrlV = newTemp(Ity_V256);
   24778          if (epartIsReg(modrm)) {
   24779             UInt rE = eregOfRexRM(pfx, modrm);
   24780             delta += 1;
   24781             DIP("vpermilpd %s,%s,%s\n",
   24782                 nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
   24783             assign(ctrlV, getYMMReg(rE));
   24784          } else {
   24785             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   24786             delta += alen;
   24787             DIP("vpermilpd %s,%s,%s\n",
   24788                 dis_buf, nameYMMReg(rV), nameYMMReg(rG));
   24789             assign(ctrlV, loadLE(Ity_V256, mkexpr(addr)));
   24790          }
   24791          IRTemp dataV = newTemp(Ity_V256);
   24792          assign(dataV, getYMMReg(rV));
   24793          IRTemp resV = math_PERMILPD_VAR_256(dataV, ctrlV);
   24794          putYMMReg(rG, mkexpr(resV));
   24795          *uses_vvvv = True;
   24796          goto decode_success;
   24797       }
   24798       break;
   24799 
   24800    case 0x0E:
   24801       /* VTESTPS xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 0E /r */
   24802       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24803          delta = dis_xTESTy_128( vbi, pfx, delta, True/*isAvx*/, 32 );
   24804          goto decode_success;
   24805       }
   24806       /* VTESTPS ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 0E /r */
   24807       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24808          delta = dis_xTESTy_256( vbi, pfx, delta, 32 );
   24809          goto decode_success;
   24810       }
   24811       break;
   24812 
   24813    case 0x0F:
   24814       /* VTESTPD xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 0F /r */
   24815       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24816          delta = dis_xTESTy_128( vbi, pfx, delta, True/*isAvx*/, 64 );
   24817          goto decode_success;
   24818       }
   24819       /* VTESTPD ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 0F /r */
   24820       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24821          delta = dis_xTESTy_256( vbi, pfx, delta, 64 );
   24822          goto decode_success;
   24823       }
   24824       break;
   24825 
   24826    case 0x17:
   24827       /* VPTEST xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 17 /r */
   24828       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24829          delta = dis_xTESTy_128( vbi, pfx, delta, True/*isAvx*/, 0 );
   24830          goto decode_success;
   24831       }
   24832       /* VPTEST ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 17 /r */
   24833       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24834          delta = dis_xTESTy_256( vbi, pfx, delta, 0 );
   24835          goto decode_success;
   24836       }
   24837       break;
   24838 
   24839    case 0x18:
   24840       /* VBROADCASTSS m32, xmm1 = VEX.128.66.0F38.WIG 18 /r */
   24841       if (have66noF2noF3(pfx)
   24842           && 0==getVexL(pfx)/*128*/
   24843           && !epartIsReg(getUChar(delta))) {
   24844          UChar modrm = getUChar(delta);
   24845          UInt  rG    = gregOfRexRM(pfx, modrm);
   24846          addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   24847          delta += alen;
   24848          DIP("vbroadcastss %s,%s\n", dis_buf, nameXMMReg(rG));
   24849          IRTemp t32 = newTemp(Ity_I32);
   24850          assign(t32, loadLE(Ity_I32, mkexpr(addr)));
   24851          IRTemp t64 = newTemp(Ity_I64);
   24852          assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
   24853          IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
   24854          putYMMRegLoAndZU(rG, res);
   24855          goto decode_success;
   24856       }
   24857       /* VBROADCASTSS m32, ymm1 = VEX.256.66.0F38.WIG 18 /r */
   24858       if (have66noF2noF3(pfx)
   24859           && 1==getVexL(pfx)/*256*/
   24860           && !epartIsReg(getUChar(delta))) {
   24861          UChar modrm = getUChar(delta);
   24862          UInt  rG    = gregOfRexRM(pfx, modrm);
   24863          addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   24864          delta += alen;
   24865          DIP("vbroadcastss %s,%s\n", dis_buf, nameYMMReg(rG));
   24866          IRTemp t32 = newTemp(Ity_I32);
   24867          assign(t32, loadLE(Ity_I32, mkexpr(addr)));
   24868          IRTemp t64 = newTemp(Ity_I64);
   24869          assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
   24870          IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
   24871                                                   mkexpr(t64), mkexpr(t64));
   24872          putYMMReg(rG, res);
   24873          goto decode_success;
   24874       }
   24875       break;
   24876 
   24877    case 0x19:
   24878       /* VBROADCASTSD m64, ymm1 = VEX.256.66.0F38.WIG 19 /r */
   24879       if (have66noF2noF3(pfx)
   24880           && 1==getVexL(pfx)/*256*/
   24881           && !epartIsReg(getUChar(delta))) {
   24882          UChar modrm = getUChar(delta);
   24883          UInt  rG    = gregOfRexRM(pfx, modrm);
   24884          addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   24885          delta += alen;
   24886          DIP("vbroadcastsd %s,%s\n", dis_buf, nameYMMReg(rG));
   24887          IRTemp t64 = newTemp(Ity_I64);
   24888          assign(t64, loadLE(Ity_I64, mkexpr(addr)));
   24889          IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
   24890                                                   mkexpr(t64), mkexpr(t64));
   24891          putYMMReg(rG, res);
   24892          goto decode_success;
   24893       }
   24894       break;
   24895 
   24896    case 0x1A:
   24897       /* VBROADCASTF128 m128, ymm1 = VEX.256.66.0F38.WIG 1A /r */
   24898       if (have66noF2noF3(pfx)
   24899           && 1==getVexL(pfx)/*256*/
   24900           && !epartIsReg(getUChar(delta))) {
   24901          UChar modrm = getUChar(delta);
   24902          UInt  rG    = gregOfRexRM(pfx, modrm);
   24903          addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   24904          delta += alen;
   24905          DIP("vbroadcastf128 %s,%s\n", dis_buf, nameYMMReg(rG));
   24906          IRTemp t128 = newTemp(Ity_V128);
   24907          assign(t128, loadLE(Ity_V128, mkexpr(addr)));
   24908          putYMMReg( rG, binop(Iop_V128HLtoV256, mkexpr(t128), mkexpr(t128)) );
   24909          goto decode_success;
   24910       }
   24911       break;
   24912 
   24913    case 0x1C:
   24914       /* VPABSB xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 1C /r */
   24915       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24916          delta = dis_AVX128_E_to_G_unary(
   24917                     uses_vvvv, vbi, pfx, delta,
   24918                     "vpabsb", math_PABS_XMM_pap1 );
   24919          goto decode_success;
   24920       }
   24921       break;
   24922 
   24923    case 0x1D:
   24924       /* VPABSW xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 1D /r */
   24925       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24926          delta = dis_AVX128_E_to_G_unary(
   24927                     uses_vvvv, vbi, pfx, delta,
   24928                     "vpabsw", math_PABS_XMM_pap2 );
   24929          goto decode_success;
   24930       }
   24931       break;
   24932 
   24933    case 0x1E:
   24934       /* VPABSD xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 1E /r */
   24935       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24936          delta = dis_AVX128_E_to_G_unary(
   24937                     uses_vvvv, vbi, pfx, delta,
   24938                     "vpabsd", math_PABS_XMM_pap4 );
   24939          goto decode_success;
   24940       }
   24941       break;
   24942 
   24943    case 0x20:
   24944       /* VPMOVSXBW xmm2/m64, xmm1 */
   24945       /* VPMOVSXBW = VEX.128.66.0F38.WIG 20 /r */
   24946       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24947          delta = dis_PMOVxXBW_128( vbi, pfx, delta,
   24948                                    True/*isAvx*/, False/*!xIsZ*/ );
   24949          goto decode_success;
   24950       }
   24951       break;
   24952 
   24953    case 0x21:
   24954       /* VPMOVSXBD xmm2/m32, xmm1 */
   24955       /* VPMOVSXBD = VEX.128.66.0F38.WIG 21 /r */
   24956       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24957          delta = dis_PMOVxXBD_128( vbi, pfx, delta,
   24958                                    True/*isAvx*/, False/*!xIsZ*/ );
   24959          goto decode_success;
   24960       }
   24961       break;
   24962 
   24963    case 0x22:
   24964       /* VPMOVSXBQ xmm2/m16, xmm1 */
   24965       /* VPMOVSXBQ = VEX.128.66.0F38.WIG 22 /r */
   24966       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24967          delta = dis_PMOVSXBQ_128( vbi, pfx, delta, True/*isAvx*/ );
   24968          goto decode_success;
   24969       }
   24970       break;
   24971 
   24972    case 0x23:
   24973       /* VPMOVSXWD xmm2/m64, xmm1 = VEX.128.66.0F38.WIG 23 /r */
   24974       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24975          delta = dis_PMOVxXWD_128( vbi, pfx, delta,
   24976                                    True/*isAvx*/, False/*!xIsZ*/ );
   24977          goto decode_success;
   24978       }
   24979       break;
   24980 
   24981    case 0x24:
   24982       /* VPMOVSXWQ xmm2/m32, xmm1 = VEX.128.66.0F38.WIG 24 /r */
   24983       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24984          delta = dis_PMOVSXWQ_128( vbi, pfx, delta, True/*isAvx*/ );
   24985          goto decode_success;
   24986       }
   24987       break;
   24988 
   24989    case 0x25:
   24990       /* VPMOVSXDQ xmm2/m64, xmm1 = VEX.128.66.0F38.WIG 25 /r */
   24991       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24992          delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
   24993                                    True/*isAvx*/, False/*!xIsZ*/ );
   24994          goto decode_success;
   24995       }
   24996       break;
   24997 
   24998    case 0x28:
   24999       /* VPMULDQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 28 /r */
   25000       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25001          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
   25002                     uses_vvvv, vbi, pfx, delta,
   25003                     "vpmuldq", math_PMULDQ_128 );
   25004          goto decode_success;
   25005       }
   25006       break;
   25007 
   25008    case 0x29:
   25009       /* VPCMPEQQ r/m, rV, r ::: r = rV `eq-by-64s` r/m */
   25010       /* VPCMPEQQ = VEX.NDS.128.66.0F38.WIG 29 /r */
   25011       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25012          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   25013                     uses_vvvv, vbi, pfx, delta, "vpcmpeqq", Iop_CmpEQ64x2 );
   25014          goto decode_success;
   25015       }
   25016       break;
   25017 
   25018    case 0x2A:
   25019       /* VMOVNTDQA m128, xmm1 = VEX.128.66.0F38.WIG 2A /r */
   25020       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
   25021           && !epartIsReg(getUChar(delta))) {
   25022          UChar  modrm = getUChar(delta);
   25023          UInt   rD    = gregOfRexRM(pfx, modrm);
   25024          IRTemp tD    = newTemp(Ity_V128);
   25025          addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   25026          delta += alen;
   25027          gen_SEGV_if_not_16_aligned(addr);
   25028          assign(tD, loadLE(Ity_V128, mkexpr(addr)));
   25029          DIP("vmovntdqa %s,%s\n", dis_buf, nameXMMReg(rD));
   25030          putYMMRegLoAndZU(rD, mkexpr(tD));
   25031          goto decode_success;
   25032       }
   25033       break;
   25034 
   25035    case 0x2B:
   25036       /* VPACKUSDW r/m, rV, r ::: r = QNarrowBin32Sto16Ux8(rV, r/m) */
   25037       /* VPACKUSDW = VEX.NDS.128.66.0F38.WIG 2B /r */
   25038       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25039          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   25040                     uses_vvvv, vbi, pfx, delta, "vpackusdw",
   25041                     Iop_QNarrowBin32Sto16Ux8, NULL,
   25042                     False/*!invertLeftArg*/, True/*swapArgs*/ );
   25043          goto decode_success;
   25044       }
   25045       break;
   25046 
   25047    case 0x30:
   25048       /* VPMOVZXBW xmm2/m64, xmm1 */
   25049       /* VPMOVZXBW = VEX.128.66.0F38.WIG 30 /r */
   25050       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25051          delta = dis_PMOVxXBW_128( vbi, pfx, delta,
   25052                                    True/*isAvx*/, True/*xIsZ*/ );
   25053          goto decode_success;
   25054       }
   25055       break;
   25056 
   25057    case 0x31:
   25058       /* VPMOVZXBD xmm2/m32, xmm1 */
   25059       /* VPMOVZXBD = VEX.128.66.0F38.WIG 31 /r */
   25060       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25061          delta = dis_PMOVxXBD_128( vbi, pfx, delta,
   25062                                    True/*isAvx*/, True/*xIsZ*/ );
   25063          goto decode_success;
   25064       }
   25065       break;
   25066 
   25067    case 0x32:
   25068       /* VPMOVZXBQ xmm2/m16, xmm1 */
   25069       /* VPMOVZXBQ = VEX.128.66.0F38.WIG 32 /r */
   25070       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25071          delta = dis_PMOVZXBQ_128( vbi, pfx, delta, True/*isAvx*/ );
   25072          goto decode_success;
   25073       }
   25074       break;
   25075 
   25076    case 0x33:
   25077       /* VPMOVZXWD xmm2/m64, xmm1 */
   25078       /* VPMOVZXWD = VEX.128.66.0F38.WIG 33 /r */
   25079       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25080          delta = dis_PMOVxXWD_128( vbi, pfx, delta,
   25081                                    True/*isAvx*/, True/*xIsZ*/ );
   25082          goto decode_success;
   25083       }
   25084       break;
   25085 
   25086    case 0x34:
   25087       /* VPMOVZXWQ xmm2/m32, xmm1 = VEX.128.66.0F38.WIG 34 /r */
   25088       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25089          delta = dis_PMOVZXWQ_128( vbi, pfx, delta, True/*isAvx*/ );
   25090          goto decode_success;
   25091       }
   25092       break;
   25093 
   25094    case 0x35:
   25095       /* VPMOVZXDQ xmm2/m64, xmm1 = VEX.128.66.0F38.WIG 35 /r */
   25096       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25097          delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
   25098                                    True/*isAvx*/, True/*xIsZ*/ );
   25099          goto decode_success;
   25100       }
   25101       break;
   25102 
   25103    case 0x37:
   25104       /* VPCMPGTQ r/m, rV, r ::: r = rV `>s-by-64s` r/m */
   25105       /* VPCMPGTQ = VEX.NDS.128.66.0F38.WIG 37 /r */
   25106       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25107          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   25108                     uses_vvvv, vbi, pfx, delta, "vpcmpgtq", Iop_CmpGT64Sx2 );
   25109          goto decode_success;
   25110       }
   25111       break;
   25112 
   25113    case 0x38:
   25114       /* VPMINSB r/m, rV, r ::: r = min-signed-8s(rV, r/m) */
   25115       /* VPMINSB = VEX.NDS.128.66.0F38.WIG 38 /r */
   25116       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25117          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   25118                     uses_vvvv, vbi, pfx, delta, "vpminsb", Iop_Min8Sx16 );
   25119          goto decode_success;
   25120       }
   25121       break;
   25122 
   25123    case 0x39:
   25124       /* VPMINSD r/m, rV, r ::: r = min-signed-32s(rV, r/m) */
   25125       /* VPMINSD = VEX.NDS.128.66.0F38.WIG 39 /r */
   25126       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25127          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   25128                     uses_vvvv, vbi, pfx, delta, "vpminsd", Iop_Min32Sx4 );
   25129          goto decode_success;
   25130       }
   25131       break;
   25132 
   25133    case 0x3A:
   25134       /* VPMINUW r/m, rV, r ::: r = min-unsigned-16s(rV, r/m) */
   25135       /* VPMINUW = VEX.NDS.128.66.0F38.WIG 3A /r */
   25136       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25137          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   25138                     uses_vvvv, vbi, pfx, delta, "vpminuw", Iop_Min16Ux8 );
   25139          goto decode_success;
   25140       }
   25141       break;
   25142 
   25143    case 0x3B:
   25144       /* VPMINUD r/m, rV, r ::: r = min-unsigned-32s(rV, r/m) */
   25145       /* VPMINUD = VEX.NDS.128.66.0F38.WIG 3B /r */
   25146       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25147          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   25148                     uses_vvvv, vbi, pfx, delta, "vpminud", Iop_Min32Ux4 );
   25149          goto decode_success;
   25150       }
   25151       break;
   25152 
   25153    case 0x3C:
   25154       /* VPMAXSB r/m, rV, r ::: r = max-signed-8s(rV, r/m) */
   25155       /* VPMAXSB = VEX.NDS.128.66.0F38.WIG 3C /r */
   25156       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25157          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   25158                     uses_vvvv, vbi, pfx, delta, "vpmaxsb", Iop_Max8Sx16 );
   25159          goto decode_success;
   25160       }
   25161       break;
   25162 
   25163    case 0x3D:
   25164       /* VPMAXSD r/m, rV, r ::: r = max-signed-32s(rV, r/m) */
   25165       /* VPMAXSD = VEX.NDS.128.66.0F38.WIG 3D /r */
   25166       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25167          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   25168                     uses_vvvv, vbi, pfx, delta, "vpmaxsd", Iop_Max32Sx4 );
   25169          goto decode_success;
   25170       }
   25171       break;
   25172 
   25173    case 0x3E:
   25174       /* VPMAXUW r/m, rV, r ::: r = max-unsigned-16s(rV, r/m) */
   25175       /* VPMAXUW = VEX.NDS.128.66.0F38.WIG 3E /r */
   25176       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25177          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   25178                     uses_vvvv, vbi, pfx, delta, "vpmaxuw", Iop_Max16Ux8 );
   25179          goto decode_success;
   25180       }
   25181       break;
   25182 
   25183    case 0x3F:
   25184       /* VPMAXUD r/m, rV, r ::: r = max-unsigned-32s(rV, r/m) */
   25185       /* VPMAXUD = VEX.NDS.128.66.0F38.WIG 3F /r */
   25186       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25187          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   25188                     uses_vvvv, vbi, pfx, delta, "vpmaxud", Iop_Max32Ux4 );
   25189          goto decode_success;
   25190       }
   25191       break;
   25192 
   25193    case 0x40:
   25194       /* VPMULLD r/m, rV, r ::: r = mul-32s(rV, r/m) */
   25195       /* VPMULLD = VEX.NDS.128.66.0F38.WIG 40 /r */
   25196       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25197          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   25198                     uses_vvvv, vbi, pfx, delta, "vpmulld", Iop_Mul32x4 );
   25199          goto decode_success;
   25200       }
   25201       break;
   25202 
   25203    case 0x41:
   25204       /* VPHMINPOSUW xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 41 /r */
   25205       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25206          delta = dis_PHMINPOSUW_128( vbi, pfx, delta, True/*isAvx*/ );
   25207          goto decode_success;
   25208       }
   25209       break;
   25210 
   25211    case 0xDB:
   25212    case 0xDC:
   25213    case 0xDD:
   25214    case 0xDE:
   25215    case 0xDF:
   25216       /* VAESIMC xmm2/m128, xmm1 = VEX.128.66.0F38.WIG DB /r */
   25217       /* VAESENC xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DC /r */
   25218       /* VAESENCLAST xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DD /r */
   25219       /* VAESDEC xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DE /r */
   25220       /* VAESDECLAST xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DF /r */
   25221       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AESx( vbi, pfx, delta, True/*isAvx*/, opc );
   25223          if (opc != 0xDB) *uses_vvvv = True;
   25224          goto decode_success;
   25225       }
   25226       break;
   25227 
   25228    default:
   25229       break;
   25230 
   25231    }
   25232 
   25233   //decode_failure:
   25234    return deltaIN;
   25235 
   25236   decode_success:
   25237    return delta;
   25238 }
   25239 
   25240 
   25241 /*------------------------------------------------------------*/
   25242 /*---                                                      ---*/
   25243 /*--- Top-level post-escape decoders: dis_ESC_0F3A__VEX    ---*/
   25244 /*---                                                      ---*/
   25245 /*------------------------------------------------------------*/
   25246 
   25247 static IRTemp math_VPERMILPS_128 ( IRTemp sV, UInt imm8 )
   25248 {
   25249    vassert(imm8 < 256);
   25250    IRTemp s3, s2, s1, s0;
   25251    s3 = s2 = s1 = s0 = IRTemp_INVALID;
   25252    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   25253 #  define SEL(_nn) (((_nn)==0) ? s0 : ((_nn)==1) ? s1 \
   25254                                     : ((_nn)==2) ? s2 : s3)
   25255    IRTemp res = newTemp(Ity_V128);
   25256    assign(res, mkV128from32s( SEL((imm8 >> 6) & 3),
   25257                               SEL((imm8 >> 4) & 3),
   25258                               SEL((imm8 >> 2) & 3),
   25259                               SEL((imm8 >> 0) & 3) ));
   25260 #  undef SEL
   25261    return res;
   25262 }
   25263 
   25264 __attribute__((noinline))
   25265 static
   25266 Long dis_ESC_0F3A__VEX (
   25267         /*MB_OUT*/DisResult* dres,
   25268         /*OUT*/   Bool*      uses_vvvv,
   25269         Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
   25270         Bool         resteerCisOk,
   25271         void*        callback_opaque,
   25272         VexArchInfo* archinfo,
   25273         VexAbiInfo*  vbi,
   25274         Prefix pfx, Int sz, Long deltaIN
   25275      )
   25276 {
   25277    IRTemp addr  = IRTemp_INVALID;
   25278    Int    alen  = 0;
   25279    HChar  dis_buf[50];
   25280    Long   delta = deltaIN;
   25281    UChar  opc   = getUChar(delta);
   25282    delta++;
   25283    *uses_vvvv = False;
   25284 
   25285    switch (opc) {
   25286 
   25287    case 0x04:
   25288       /* VPERMILPS imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.WIG 04 /r ib */
   25289       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25290          UChar  modrm = getUChar(delta);
   25291          UInt   imm8  = 0;
   25292          UInt   rG    = gregOfRexRM(pfx, modrm);
   25293          IRTemp sV    = newTemp(Ity_V256);
   25294          if (epartIsReg(modrm)) {
   25295             UInt rE = eregOfRexRM(pfx, modrm);
   25296             delta += 1;
   25297             imm8 = getUChar(delta);
   25298             DIP("vpermilps $%u,%s,%s\n",
   25299                 imm8, nameYMMReg(rE), nameYMMReg(rG));
   25300             assign(sV, getYMMReg(rE));
   25301          } else {
   25302             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   25303             delta += alen;
   25304             imm8 = getUChar(delta);
   25305             DIP("vpermilps $%u,%s,%s\n",
   25306                 imm8, dis_buf, nameYMMReg(rG));
   25307             assign(sV, loadLE(Ity_V256, mkexpr(addr)));
   25308          }
   25309          delta++;
   25310          IRTemp  sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   25311          breakupV256toV128s( sV, &sVhi, &sVlo );
   25312          IRTemp  dVhi = math_VPERMILPS_128( sVhi, imm8 );
   25313          IRTemp  dVlo = math_VPERMILPS_128( sVlo, imm8 );
   25314          IRExpr* res  = binop(Iop_V128HLtoV256, mkexpr(dVhi), mkexpr(dVlo));
   25315          putYMMReg(rG, res);
   25316          goto decode_success;
   25317       }
   25318       /* VPERMILPS imm8, xmm2/m128, xmm1 = VEX.128.66.0F3A.WIG 04 /r ib */
   25319       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25320          UChar  modrm = getUChar(delta);
   25321          UInt   imm8  = 0;
   25322          UInt   rG    = gregOfRexRM(pfx, modrm);
   25323          IRTemp sV    = newTemp(Ity_V128);
   25324          if (epartIsReg(modrm)) {
   25325             UInt rE = eregOfRexRM(pfx, modrm);
   25326             delta += 1;
   25327             imm8 = getUChar(delta);
   25328             DIP("vpermilps $%u,%s,%s\n",
   25329                 imm8, nameXMMReg(rE), nameXMMReg(rG));
   25330             assign(sV, getXMMReg(rE));
   25331          } else {
   25332             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   25333             delta += alen;
   25334             imm8 = getUChar(delta);
   25335             DIP("vpermilps $%u,%s,%s\n",
   25336                 imm8, dis_buf, nameXMMReg(rG));
   25337             assign(sV, loadLE(Ity_V128, mkexpr(addr)));
   25338          }
   25339          delta++;
   25340          putYMMRegLoAndZU(rG, mkexpr ( math_VPERMILPS_128 ( sV, imm8 ) ) );
   25341          goto decode_success;
   25342       }
   25343       break;
   25344 
   25345    case 0x05:
   25346       /* VPERMILPD imm8, xmm2/m128, xmm1 = VEX.128.66.0F3A.WIG 05 /r ib */
   25347       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25348          UChar  modrm = getUChar(delta);
   25349          UInt   imm8  = 0;
   25350          UInt   rG    = gregOfRexRM(pfx, modrm);
   25351          IRTemp sV    = newTemp(Ity_V128);
   25352          if (epartIsReg(modrm)) {
   25353             UInt rE = eregOfRexRM(pfx, modrm);
   25354             delta += 1;
   25355             imm8 = getUChar(delta);
   25356             DIP("vpermilpd $%u,%s,%s\n",
   25357                 imm8, nameXMMReg(rE), nameXMMReg(rG));
   25358             assign(sV, getXMMReg(rE));
   25359          } else {
   25360             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   25361             delta += alen;
   25362             imm8 = getUChar(delta);
   25363             DIP("vpermilpd $%u,%s,%s\n",
   25364                 imm8, dis_buf, nameXMMReg(rG));
   25365             assign(sV, loadLE(Ity_V128, mkexpr(addr)));
   25366          }
   25367          delta++;
   25368          IRTemp s1 = newTemp(Ity_I64);
   25369          IRTemp s0 = newTemp(Ity_I64);
   25370          assign(s1, unop(Iop_V128HIto64, mkexpr(sV)));
   25371          assign(s0, unop(Iop_V128to64,   mkexpr(sV)));
   25372          IRTemp dV = newTemp(Ity_V128);
   25373          assign(dV, binop(Iop_64HLtoV128,
   25374                                mkexpr((imm8 & (1<<1)) ? s1 : s0),
   25375                                mkexpr((imm8 & (1<<0)) ? s1 : s0)));
   25376          putYMMRegLoAndZU(rG, mkexpr(dV));
   25377          goto decode_success;
   25378       }
   25379       /* VPERMILPD imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.WIG 05 /r ib */
   25380       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25381          UChar  modrm = getUChar(delta);
   25382          UInt   imm8  = 0;
   25383          UInt   rG    = gregOfRexRM(pfx, modrm);
   25384          IRTemp sV    = newTemp(Ity_V256);
   25385          if (epartIsReg(modrm)) {
   25386             UInt rE = eregOfRexRM(pfx, modrm);
   25387             delta += 1;
   25388             imm8 = getUChar(delta);
   25389             DIP("vpermilpd $%u,%s,%s\n",
   25390                 imm8, nameYMMReg(rE), nameYMMReg(rG));
   25391             assign(sV, getYMMReg(rE));
   25392          } else {
   25393             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   25394             delta += alen;
   25395             imm8 = getUChar(delta);
   25396             DIP("vpermilpd $%u,%s,%s\n",
   25397                 imm8, dis_buf, nameYMMReg(rG));
   25398             assign(sV, loadLE(Ity_V256, mkexpr(addr)));
   25399          }
   25400          delta++;
   25401          IRTemp s3, s2, s1, s0;
   25402          s3 = s2 = s1 = s0 = IRTemp_INVALID;
   25403          breakupV256to64s(sV, &s3, &s2, &s1, &s0);
   25404          IRTemp dV = newTemp(Ity_V256);
   25405          assign(dV, IRExpr_Qop(Iop_64x4toV256,
   25406                                mkexpr((imm8 & (1<<3)) ? s3 : s2),
   25407                                mkexpr((imm8 & (1<<2)) ? s3 : s2),
   25408                                mkexpr((imm8 & (1<<1)) ? s1 : s0),
   25409                                mkexpr((imm8 & (1<<0)) ? s1 : s0)));
   25410          putYMMReg(rG, mkexpr(dV));
   25411          goto decode_success;
   25412       }
   25413       break;
   25414 
   25415    case 0x06:
   25416       /* VPERM2F128 imm8, ymm3/m256, ymm2, ymm1 = VEX.NDS.66.0F3A.W0 06 /r ib */
   25417       if (have66noF2noF3(pfx)
   25418           && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
   25419          UChar  modrm = getUChar(delta);
   25420          UInt   imm8  = 0;
   25421          UInt   rG    = gregOfRexRM(pfx, modrm);
   25422          UInt   rV    = getVexNvvvv(pfx);
   25423          IRTemp s00   = newTemp(Ity_V128);
   25424          IRTemp s01   = newTemp(Ity_V128);
   25425          IRTemp s10   = newTemp(Ity_V128);
   25426          IRTemp s11   = newTemp(Ity_V128);
   25427          assign(s00, getYMMRegLane128(rV, 0));
   25428          assign(s01, getYMMRegLane128(rV, 1));
   25429          if (epartIsReg(modrm)) {
   25430             UInt rE = eregOfRexRM(pfx, modrm);
   25431             delta += 1;
   25432             imm8 = getUChar(delta);
   25433             DIP("vperm2f128 $%u,%s,%s,%s\n",
   25434                 imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
   25435             assign(s10, getYMMRegLane128(rE, 0));
   25436             assign(s11, getYMMRegLane128(rE, 1));
   25437          } else {
   25438             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   25439             delta += alen;
   25440             imm8 = getUChar(delta);
   25441             DIP("vperm2f128 $%u,%s,%s,%s\n",
   25442                 imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
   25443             assign(s10, loadLE(Ity_V128, binop(Iop_Add64,
   25444                                                mkexpr(addr), mkU64(0))));
   25445             assign(s11, loadLE(Ity_V128, binop(Iop_Add64,
   25446                                                mkexpr(addr), mkU64(16))));
   25447          }
   25448          delta++;
   25449 #        define SEL(_nn) (((_nn)==0) ? s00 : ((_nn)==1) ? s01 \
   25450                                            : ((_nn)==2) ? s10 : s11)
   25451          putYMMRegLane128(rG, 0, mkexpr(SEL((imm8 >> 0) & 3)));
   25452          putYMMRegLane128(rG, 1, mkexpr(SEL((imm8 >> 4) & 3)));
   25453 #        undef SEL
   25454          if (imm8 & (1<<3)) putYMMRegLane128(rG, 0, mkV128(0));
   25455          if (imm8 & (1<<7)) putYMMRegLane128(rG, 1, mkV128(0));
   25456          *uses_vvvv = True;
   25457          goto decode_success;
   25458       }
   25459       break;
   25460 
   25461    case 0x08:
   25462       /* VROUNDPS imm8, xmm2/m128, xmm1 */
   25463       /* VROUNDPS = VEX.NDS.128.66.0F3A.WIG 08 ib */
   25464       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25465          UChar  modrm = getUChar(delta);
   25466          UInt   rG    = gregOfRexRM(pfx, modrm);
   25467          IRTemp src   = newTemp(Ity_V128);
   25468          IRTemp s0    = IRTemp_INVALID;
   25469          IRTemp s1    = IRTemp_INVALID;
   25470          IRTemp s2    = IRTemp_INVALID;
   25471          IRTemp s3    = IRTemp_INVALID;
   25472          IRTemp rm    = newTemp(Ity_I32);
   25473          Int    imm   = 0;
   25474 
   25475          modrm = getUChar(delta);
   25476 
   25477          if (epartIsReg(modrm)) {
   25478             UInt rE = eregOfRexRM(pfx, modrm);
   25479             assign( src, getXMMReg( rE ) );
   25480             imm = getUChar(delta+1);
   25481             if (imm & ~15) break;
   25482             delta += 1+1;
   25483             DIP( "vroundps $%d,%s,%s\n", imm, nameXMMReg(rE), nameXMMReg(rG) );
   25484          } else {
   25485             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   25486             assign( src, loadLE(Ity_V128, mkexpr(addr) ) );
   25487             imm = getUChar(delta+alen);
   25488             if (imm & ~15) break;
   25489             delta += alen+1;
   25490             DIP( "vroundps $%d,%s,%s\n", imm, dis_buf, nameXMMReg(rG) );
   25491          }
   25492 
   25493          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   25494             that encoding is the same as the encoding for IRRoundingMode,
   25495             we can use that value directly in the IR as a rounding
   25496             mode. */
   25497          assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
   25498 
   25499          breakupV128to32s( src, &s3, &s2, &s1, &s0 );
   25500          putYMMRegLane128( rG, 1, mkV128(0) );
   25501 #        define CVT(s) binop(Iop_RoundF32toInt, mkexpr(rm), \
   25502                              unop(Iop_ReinterpI32asF32, mkexpr(s)))
   25503          putYMMRegLane32F( rG, 3, CVT(s3) );
   25504          putYMMRegLane32F( rG, 2, CVT(s2) );
   25505          putYMMRegLane32F( rG, 1, CVT(s1) );
   25506          putYMMRegLane32F( rG, 0, CVT(s0) );
   25507 #        undef CVT
   25508          goto decode_success;
   25509       }
   25510       /* VROUNDPS imm8, ymm2/m256, ymm1 */
   25511       /* VROUNDPS = VEX.NDS.256.66.0F3A.WIG 08 ib */
   25512       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25513          UChar  modrm = getUChar(delta);
   25514          UInt   rG    = gregOfRexRM(pfx, modrm);
   25515          IRTemp src   = newTemp(Ity_V256);
   25516          IRTemp s0    = IRTemp_INVALID;
   25517          IRTemp s1    = IRTemp_INVALID;
   25518          IRTemp s2    = IRTemp_INVALID;
   25519          IRTemp s3    = IRTemp_INVALID;
   25520          IRTemp s4    = IRTemp_INVALID;
   25521          IRTemp s5    = IRTemp_INVALID;
   25522          IRTemp s6    = IRTemp_INVALID;
   25523          IRTemp s7    = IRTemp_INVALID;
   25524          IRTemp rm    = newTemp(Ity_I32);
   25525          Int    imm   = 0;
   25526 
   25527          modrm = getUChar(delta);
   25528 
   25529          if (epartIsReg(modrm)) {
   25530             UInt rE = eregOfRexRM(pfx, modrm);
   25531             assign( src, getYMMReg( rE ) );
   25532             imm = getUChar(delta+1);
   25533             if (imm & ~15) break;
   25534             delta += 1+1;
   25535             DIP( "vroundps $%d,%s,%s\n", imm, nameYMMReg(rE), nameYMMReg(rG) );
   25536          } else {
   25537             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   25538             assign( src, loadLE(Ity_V256, mkexpr(addr) ) );
   25539             imm = getUChar(delta+alen);
   25540             if (imm & ~15) break;
   25541             delta += alen+1;
   25542             DIP( "vroundps $%d,%s,%s\n", imm, dis_buf, nameYMMReg(rG) );
   25543          }
   25544 
   25545          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   25546             that encoding is the same as the encoding for IRRoundingMode,
   25547             we can use that value directly in the IR as a rounding
   25548             mode. */
   25549          assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
   25550 
   25551          breakupV256to32s( src, &s7, &s6, &s5, &s4, &s3, &s2, &s1, &s0 );
   25552 #        define CVT(s) binop(Iop_RoundF32toInt, mkexpr(rm), \
   25553                              unop(Iop_ReinterpI32asF32, mkexpr(s)))
   25554          putYMMRegLane32F( rG, 7, CVT(s7) );
   25555          putYMMRegLane32F( rG, 6, CVT(s6) );
   25556          putYMMRegLane32F( rG, 5, CVT(s5) );
   25557          putYMMRegLane32F( rG, 4, CVT(s4) );
   25558          putYMMRegLane32F( rG, 3, CVT(s3) );
   25559          putYMMRegLane32F( rG, 2, CVT(s2) );
   25560          putYMMRegLane32F( rG, 1, CVT(s1) );
   25561          putYMMRegLane32F( rG, 0, CVT(s0) );
   25562 #        undef CVT
   25563          goto decode_success;
   25564       }
   25565 
   25566    case 0x09:
   25567       /* VROUNDPD imm8, xmm2/m128, xmm1 */
   25568       /* VROUNDPD = VEX.NDS.128.66.0F3A.WIG 09 ib */
   25569       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25570          UChar  modrm = getUChar(delta);
   25571          UInt   rG    = gregOfRexRM(pfx, modrm);
   25572          IRTemp src   = newTemp(Ity_V128);
   25573          IRTemp s0    = IRTemp_INVALID;
   25574          IRTemp s1    = IRTemp_INVALID;
   25575          IRTemp rm    = newTemp(Ity_I32);
   25576          Int    imm   = 0;
   25577 
   25578          modrm = getUChar(delta);
   25579 
   25580          if (epartIsReg(modrm)) {
   25581             UInt rE = eregOfRexRM(pfx, modrm);
   25582             assign( src, getXMMReg( rE ) );
   25583             imm = getUChar(delta+1);
   25584             if (imm & ~15) break;
   25585             delta += 1+1;
   25586             DIP( "vroundpd $%d,%s,%s\n", imm, nameXMMReg(rE), nameXMMReg(rG) );
   25587          } else {
   25588             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   25589             assign( src, loadLE(Ity_V128, mkexpr(addr) ) );
   25590             imm = getUChar(delta+alen);
   25591             if (imm & ~15) break;
   25592             delta += alen+1;
   25593             DIP( "vroundpd $%d,%s,%s\n", imm, dis_buf, nameXMMReg(rG) );
   25594          }
   25595 
   25596          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   25597             that encoding is the same as the encoding for IRRoundingMode,
   25598             we can use that value directly in the IR as a rounding
   25599             mode. */
   25600          assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
   25601 
   25602          breakupV128to64s( src, &s1, &s0 );
   25603          putYMMRegLane128( rG, 1, mkV128(0) );
   25604 #        define CVT(s) binop(Iop_RoundF64toInt, mkexpr(rm), \
   25605                              unop(Iop_ReinterpI64asF64, mkexpr(s)))
   25606          putYMMRegLane64F( rG, 1, CVT(s1) );
   25607          putYMMRegLane64F( rG, 0, CVT(s0) );
   25608 #        undef CVT
   25609          goto decode_success;
   25610       }
   25611       /* VROUNDPD imm8, ymm2/m256, ymm1 */
   25612       /* VROUNDPD = VEX.NDS.256.66.0F3A.WIG 09 ib */
   25613       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25614          UChar  modrm = getUChar(delta);
   25615          UInt   rG    = gregOfRexRM(pfx, modrm);
   25616          IRTemp src   = newTemp(Ity_V256);
   25617          IRTemp s0    = IRTemp_INVALID;
   25618          IRTemp s1    = IRTemp_INVALID;
   25619          IRTemp s2    = IRTemp_INVALID;
   25620          IRTemp s3    = IRTemp_INVALID;
   25621          IRTemp rm    = newTemp(Ity_I32);
   25622          Int    imm   = 0;
   25623 
   25624          modrm = getUChar(delta);
   25625 
   25626          if (epartIsReg(modrm)) {
   25627             UInt rE = eregOfRexRM(pfx, modrm);
   25628             assign( src, getYMMReg( rE ) );
   25629             imm = getUChar(delta+1);
   25630             if (imm & ~15) break;
   25631             delta += 1+1;
   25632             DIP( "vroundpd $%d,%s,%s\n", imm, nameYMMReg(rE), nameYMMReg(rG) );
   25633          } else {
   25634             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   25635             assign( src, loadLE(Ity_V256, mkexpr(addr) ) );
   25636             imm = getUChar(delta+alen);
   25637             if (imm & ~15) break;
   25638             delta += alen+1;
   25639             DIP( "vroundps $%d,%s,%s\n", imm, dis_buf, nameYMMReg(rG) );
   25640          }
   25641 
   25642          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   25643             that encoding is the same as the encoding for IRRoundingMode,
   25644             we can use that value directly in the IR as a rounding
   25645             mode. */
   25646          assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
   25647 
   25648          breakupV256to64s( src, &s3, &s2, &s1, &s0 );
   25649 #        define CVT(s) binop(Iop_RoundF64toInt, mkexpr(rm), \
   25650                              unop(Iop_ReinterpI64asF64, mkexpr(s)))
   25651          putYMMRegLane64F( rG, 3, CVT(s3) );
   25652          putYMMRegLane64F( rG, 2, CVT(s2) );
   25653          putYMMRegLane64F( rG, 1, CVT(s1) );
   25654          putYMMRegLane64F( rG, 0, CVT(s0) );
   25655 #        undef CVT
   25656          goto decode_success;
   25657       }
   25658 
   case 0x0A:
   case 0x0B:
      /* VROUNDSS imm8, xmm3/m32, xmm2, xmm1 */
      /* VROUNDSS = VEX.NDS.128.66.0F3A.WIG 0A ib */
      /* VROUNDSD imm8, xmm3/m64, xmm2, xmm1 */
      /* VROUNDSD = VEX.NDS.128.66.0F3A.WIG 0B ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         /* Opcode 0x0B is the double-precision (SD) variant. */
         Bool   isD   = opc == 0x0B;
         IRTemp src   = newTemp(isD ? Ity_F64 : Ity_F32);
         IRTemp res   = newTemp(isD ? Ity_F64 : Ity_F32);
         Int    imm   = 0;

         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            /* Only the lowest scalar lane of the E operand is used. */
            assign( src,
                    isD ? getXMMRegLane64F(rE, 0) : getXMMRegLane32F(rE, 0) );
            imm = getUChar(delta+1);
            /* Reject encodings with any of imm8[7:4] set. */
            if (imm & ~15) break;
            delta += 1+1;
            DIP( "vrounds%c $%d,%s,%s,%s\n",
                 isD ? 'd' : 's',
                 imm, nameXMMReg( rE ), nameXMMReg( rV ), nameXMMReg( rG ) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
            imm = getUChar(delta+alen);
            if (imm & ~15) break;
            delta += alen+1;
            DIP( "vrounds%c $%d,%s,%s,%s\n",
                 isD ? 'd' : 's',
                 imm, dis_buf, nameXMMReg( rV ), nameXMMReg( rG ) );
         }

         /* (imm & 3) contains an Intel-encoded rounding mode.  Because
            that encoding is the same as the encoding for IRRoundingMode,
            we can use that value directly in the IR as a rounding
            mode.  imm bit 2 set means "use the current MXCSR rounding
            mode" instead. */
         assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
                           (imm & 4) ? get_sse_roundingmode()
                                     : mkU32(imm & 3),
                           mkexpr(src)) );

         /* Merge: lane 0 is the rounded result; all remaining lanes
            of the low 128 bits come from the first source rV; the
            upper 128 bits of the YMM destination are zeroed. */
         if (isD)
            putXMMRegLane64F( rG, 0, mkexpr(res) );
         else {
            putXMMRegLane32F( rG, 0, mkexpr(res) );
            /* For SS, bits 63:32 also come from rV. */
            putXMMRegLane32F( rG, 1, getXMMRegLane32F( rV, 1 ) );
         }
         /* Bits 127:64 come from rV in both variants. */
         putXMMRegLane64F( rG, 1, getXMMRegLane64F( rV, 1 ) );
         putYMMRegLane128( rG, 1, mkV128(0) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   25716 
   case 0x0C:
      /* VBLENDPS imm8, ymm3/m256, ymm2, ymm1 */
      /* VBLENDPS = VEX.NDS.256.66.0F3A.WIG 0C /r ib */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar  modrm = getUChar(delta);
         UInt   imm8;
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp sV    = newTemp(Ity_V256);
         IRTemp sE    = newTemp(Ity_V256);
         /* First source comes from the VEX.vvvv field. */
         assign ( sV, getYMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            imm8 = getUChar(delta);
            DIP("vblendps $%u,%s,%s,%s\n",
                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
            assign(sE, getYMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += alen;
            imm8 = getUChar(delta);
            DIP("vblendps $%u,%s,%s,%s\n",
                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
            assign(sE, loadLE(Ity_V256, mkexpr(addr)));
         }
         delta++;   /* skip the imm8 byte */
         putYMMReg( rG,
                    mkexpr( math_BLENDPS_256( sE, sV, imm8) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VBLENDPS imm8, xmm3/m128, xmm2, xmm1 */
      /* VBLENDPS = VEX.NDS.128.66.0F3A.WIG 0C /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   imm8;
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp sV    = newTemp(Ity_V128);
         IRTemp sE    = newTemp(Ity_V128);
         assign ( sV, getXMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            imm8 = getUChar(delta);
            DIP("vblendps $%u,%s,%s,%s\n",
                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
            assign(sE, getXMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += alen;
            imm8 = getUChar(delta);
            DIP("vblendps $%u,%s,%s,%s\n",
                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
            assign(sE, loadLE(Ity_V128, mkexpr(addr)));
         }
         delta++;   /* skip the imm8 byte */
         /* 128-bit result: write low lane and zero the upper lane. */
         putYMMRegLoAndZU( rG,
                           mkexpr( math_BLENDPS_128( sE, sV, imm8) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   25781 
   case 0x0D:
      /* VBLENDPD imm8, ymm3/m256, ymm2, ymm1 */
      /* VBLENDPD = VEX.NDS.256.66.0F3A.WIG 0D /r ib */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar  modrm = getUChar(delta);
         UInt   imm8;
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp sV    = newTemp(Ity_V256);
         IRTemp sE    = newTemp(Ity_V256);
         /* First source comes from the VEX.vvvv field. */
         assign ( sV, getYMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            imm8 = getUChar(delta);
            DIP("vblendpd $%u,%s,%s,%s\n",
                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
            assign(sE, getYMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += alen;
            imm8 = getUChar(delta);
            DIP("vblendpd $%u,%s,%s,%s\n",
                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
            assign(sE, loadLE(Ity_V256, mkexpr(addr)));
         }
         delta++;   /* skip the imm8 byte */
         putYMMReg( rG,
                    mkexpr( math_BLENDPD_256( sE, sV, imm8) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VBLENDPD imm8, xmm3/m128, xmm2, xmm1 */
      /* VBLENDPD = VEX.NDS.128.66.0F3A.WIG 0D /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   imm8;
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp sV    = newTemp(Ity_V128);
         IRTemp sE    = newTemp(Ity_V128);
         assign ( sV, getXMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            imm8 = getUChar(delta);
            DIP("vblendpd $%u,%s,%s,%s\n",
                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
            assign(sE, getXMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += alen;
            imm8 = getUChar(delta);
            DIP("vblendpd $%u,%s,%s,%s\n",
                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
            assign(sE, loadLE(Ity_V128, mkexpr(addr)));
         }
         delta++;   /* skip the imm8 byte */
         /* 128-bit result: write low lane and zero the upper lane. */
         putYMMRegLoAndZU( rG,
                           mkexpr( math_BLENDPD_128( sE, sV, imm8) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   25846 
   25847    case 0x0E:
   25848       /* VPBLENDW imm8, xmm3/m128, xmm2, xmm1 */
   25849       /* VPBLENDW = VEX.NDS.128.66.0F3A.WIG 0E /r ib */
   25850       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25851          UChar  modrm = getUChar(delta);
   25852          UInt   imm8;
   25853          UInt   rG    = gregOfRexRM(pfx, modrm);
   25854          UInt   rV    = getVexNvvvv(pfx);
   25855          IRTemp sV    = newTemp(Ity_V128);
   25856          IRTemp sE    = newTemp(Ity_V128);
   25857          assign ( sV, getXMMReg(rV) );
   25858          if (epartIsReg(modrm)) {
   25859             UInt rE = eregOfRexRM(pfx, modrm);
   25860             delta += 1;
   25861             imm8 = getUChar(delta);
   25862             DIP("vpblendw $%u,%s,%s,%s\n",
   25863                 imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   25864             assign(sE, getXMMReg(rE));
   25865          } else {
   25866             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   25867             delta += alen;
   25868             imm8 = getUChar(delta);
   25869             DIP("vpblendw $%u,%s,%s,%s\n",
   25870                 imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
   25871             assign(sE, loadLE(Ity_V128, mkexpr(addr)));
   25872          }
   25873          delta++;
   25874          putYMMRegLoAndZU( rG,
   25875                            mkexpr( math_PBLENDW_128( sE, sV, imm8) ) );
   25876          *uses_vvvv = True;
   25877          goto decode_success;
   25878       }
   25879       break;
   25880 
   25881    case 0x0F:
   25882       /* VPALIGNR imm8, xmm3/m128, xmm2, xmm1 */
   25883       /* VPALIGNR = VEX.NDS.128.66.0F3A.WIG 0F /r ib */
   25884       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25885          UChar  modrm = getUChar(delta);
   25886          UInt   rG    = gregOfRexRM(pfx, modrm);
   25887          UInt   rV    = getVexNvvvv(pfx);
   25888          IRTemp sV    = newTemp(Ity_V128);
   25889          IRTemp dV    = newTemp(Ity_V128);
   25890          UInt   imm8;
   25891 
   25892          assign( dV, getXMMReg(rV) );
   25893 
   25894          if ( epartIsReg( modrm ) ) {
   25895             UInt   rE = eregOfRexRM(pfx, modrm);
   25896             assign( sV, getXMMReg(rE) );
   25897             imm8 = getUChar(delta+1);
   25898             delta += 1+1;
   25899             DIP("vpalignr $%d,%s,%s,%s\n", imm8, nameXMMReg(rE),
   25900                                            nameXMMReg(rV), nameXMMReg(rG));
   25901          } else {
   25902             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   25903             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   25904             imm8 = getUChar(delta+alen);
   25905             delta += alen+1;
   25906             DIP("vpalignr $%d,%s,%s,%s\n", imm8, dis_buf,
   25907                                            nameXMMReg(rV), nameXMMReg(rG));
   25908          }
   25909 
   25910          IRTemp res = math_PALIGNR_XMM( sV, dV, imm8 );
   25911          putYMMRegLoAndZU( rG, mkexpr(res) );
   25912          *uses_vvvv = True;
   25913          goto decode_success;
   25914       }
   25915       break;
   25916 
   25917    case 0x14:
   25918       /* VPEXTRB imm8, xmm2, reg/m8 = VEX.128.66.0F3A.W0 14 /r ib */
   25919       if (have66noF2noF3(pfx)
   25920           && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
   25921          delta = dis_PEXTRB_128_GtoE( vbi, pfx, delta, False/*!isAvx*/ );
   25922          goto decode_success;
   25923       }
   25924       break;
   25925 
   case 0x15:
      /* VPEXTRW imm8, reg/m16, xmm2 */
      /* VPEXTRW = VEX.128.66.0F3A.W0 15 /r ib */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         /* Shared SSE4 helper; isAvx selects the VEX flavor. */
         delta = dis_PEXTRW( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      break;
   25935 
   case 0x16:
      /* VPEXTRD imm8, r32/m32, xmm2 */
      /* VPEXTRD = VEX.128.66.0F3A.W0 16 /r ib */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         /* REX.W=0 selects the 32-bit (D) variant. */
         delta = dis_PEXTRD( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VPEXTRQ = VEX.128.66.0F3A.W1 16 /r ib */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
         /* REX.W=1 selects the 64-bit (Q) variant. */
         delta = dis_PEXTRQ( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      break;
   25951 
   case 0x17:
      /* VEXTRACTPS imm8, xmm1, r32/m32 = VEX.128.66.0F3A.WIG 17 /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         /* Shared SSE4 helper; isAvx selects the VEX flavor. */
         delta = dis_EXTRACTPS( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      break;
   25959 
   case 0x18:
      /* VINSERTF128 r/m, rV, rD
         ::: rD = insertinto(a lane in rV, 128 bits from r/m) */
      /* VINSERTF128 = VEX.NDS.256.66.0F3A.W0 18 /r ib */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   ib    = 0;
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp t128  = newTemp(Ity_V128);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            assign(t128, getXMMReg(rE));
            ib = getUChar(delta);
            DIP("vinsertf128 $%u,%s,%s,%s\n",
                ib, nameXMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign(t128, loadLE(Ity_V128, mkexpr(addr)));
            delta += alen;
            ib = getUChar(delta);
            DIP("vinsertf128 $%u,%s,%s,%s\n",
                ib, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
         }
         delta++;   /* skip the imm8 byte */
         /* Copy both lanes of rV into rG, then overwrite the lane
            selected by imm8 bit 0 with the new 128-bit value. */
         putYMMRegLane128(rG, 0,   getYMMRegLane128(rV, 0));
         putYMMRegLane128(rG, 1,   getYMMRegLane128(rV, 1));
         putYMMRegLane128(rG, ib & 1, mkexpr(t128));
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   25994 
   25995    case 0x19:
   25996      /* VEXTRACTF128 $lane_no, rS, r/m
   25997         ::: r/m:V128 = a lane of rS:V256 (RM format) */
   25998      /* VEXTRACTF128 = VEX.256.66.0F3A.W0 19 /r ib */
   25999       if (have66noF2noF3(pfx)
   26000           && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
   26001          UChar  modrm = getUChar(delta);
   26002          UInt   ib    = 0;
   26003          UInt   rS    = gregOfRexRM(pfx, modrm);
   26004          IRTemp t128  = newTemp(Ity_V128);
   26005          if (epartIsReg(modrm)) {
   26006             UInt rD = eregOfRexRM(pfx, modrm);
   26007             delta += 1;
   26008             ib = getUChar(delta);
   26009             assign(t128, getYMMRegLane128(rS, ib & 1));
   26010             putYMMRegLoAndZU(rD, mkexpr(t128));
   26011             DIP("vextractf128 $%u,%s,%s\n",
   26012                 ib, nameXMMReg(rS), nameYMMReg(rD));
   26013          } else {
   26014             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   26015             delta += alen;
   26016             ib = getUChar(delta);
   26017             assign(t128, getYMMRegLane128(rS, ib & 1));
   26018             storeLE(mkexpr(addr), mkexpr(t128));
   26019             DIP("vextractf128 $%u,%s,%s\n",
   26020                 ib, nameYMMReg(rS), dis_buf);
   26021          }
   26022          delta++;
   26023          /* doesn't use vvvv */
   26024          goto decode_success;
   26025       }
   26026       break;
   26027 
   case 0x20:
      /* VPINSRB r32/m8, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W0 20 /r ib */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm  = getUChar(delta);
         UInt   rG     = gregOfRexRM(pfx, modrm);
         UInt   rV     = getVexNvvvv(pfx);
         Int    imm8;
         IRTemp src_u8 = newTemp(Ity_I8);

         if ( epartIsReg( modrm ) ) {
            UInt rE = eregOfRexRM(pfx,modrm);
            /* Only imm8[3:0] selects the byte lane (0..15). */
            imm8 = (Int)(getUChar(delta+1) & 15);
            /* Only the low byte of the 32-bit register is inserted. */
            assign( src_u8, unop(Iop_32to8, getIReg32( rE )) );
            delta += 1+1;
            DIP( "vpinsrb $%d,%s,%s,%s\n",
                 imm8, nameIReg32(rE), nameXMMReg(rV), nameXMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            imm8 = (Int)(getUChar(delta+alen) & 15);
            assign( src_u8, loadLE( Ity_I8, mkexpr(addr) ) );
            delta += alen+1;
            DIP( "vpinsrb $%d,%s,%s,%s\n",
                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
         }

         IRTemp src_vec = newTemp(Ity_V128);
         assign(src_vec, getXMMReg( rV ));
         IRTemp res_vec = math_PINSRB_128( src_vec, src_u8, imm8 );
         /* Write the low 128 bits and zero the upper lane. */
         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   26062 
   26063    case 0x21:
   26064       /* VINSERTPS imm8, xmm3/m32, xmm2, xmm1
   26065          = VEX.NDS.128.66.0F3A.WIG 21 /r ib */
   26066       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26067          UChar  modrm = getUChar(delta);
   26068          UInt   rG    = gregOfRexRM(pfx, modrm);
   26069          UInt   rV    = getVexNvvvv(pfx);
   26070          UInt   imm8;
   26071          IRTemp d2ins = newTemp(Ity_I32); /* comes from the E part */
   26072          const IRTemp inval = IRTemp_INVALID;
   26073 
   26074          if ( epartIsReg( modrm ) ) {
   26075             UInt   rE = eregOfRexRM(pfx, modrm);
   26076             IRTemp vE = newTemp(Ity_V128);
   26077             assign( vE, getXMMReg(rE) );
   26078             IRTemp dsE[4] = { inval, inval, inval, inval };
   26079             breakupV128to32s( vE, &dsE[3], &dsE[2], &dsE[1], &dsE[0] );
   26080             imm8 = getUChar(delta+1);
   26081             d2ins = dsE[(imm8 >> 6) & 3]; /* "imm8_count_s" */
   26082             delta += 1+1;
   26083             DIP( "insertps $%u, %s,%s\n",
   26084                  imm8, nameXMMReg(rE), nameXMMReg(rG) );
   26085          } else {
   26086             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   26087             assign( d2ins, loadLE( Ity_I32, mkexpr(addr) ) );
   26088             imm8 = getUChar(delta+alen);
   26089             delta += alen+1;
   26090             DIP( "insertps $%u, %s,%s\n",
   26091                  imm8, dis_buf, nameXMMReg(rG) );
   26092          }
   26093 
   26094          IRTemp vV = newTemp(Ity_V128);
   26095          assign( vV, getXMMReg(rV) );
   26096 
   26097          putYMMRegLoAndZU( rG, mkexpr(math_INSERTPS( vV, d2ins, imm8 )) );
   26098          *uses_vvvv = True;
   26099          goto decode_success;
   26100       }
   26101       break;
   26102 
   26103    case 0x22:
   26104       /* VPINSRD r32/m32, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W0 22 /r ib */
   26105       if (have66noF2noF3(pfx)
   26106           && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
   26107          UChar  modrm = getUChar(delta);
   26108          UInt   rG    = gregOfRexRM(pfx, modrm);
   26109          UInt   rV    = getVexNvvvv(pfx);
   26110          Int    imm8_10;
   26111          IRTemp src_u32 = newTemp(Ity_I32);
   26112 
   26113          if ( epartIsReg( modrm ) ) {
   26114             UInt rE = eregOfRexRM(pfx,modrm);
   26115             imm8_10 = (Int)(getUChar(delta+1) & 3);
   26116             assign( src_u32, getIReg32( rE ) );
   26117             delta += 1+1;
   26118             DIP( "vpinsrd $%d,%s,%s,%s\n",
   26119                  imm8_10, nameIReg32(rE), nameXMMReg(rV), nameXMMReg(rG) );
   26120          } else {
   26121             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   26122             imm8_10 = (Int)(getUChar(delta+alen) & 3);
   26123             assign( src_u32, loadLE( Ity_I32, mkexpr(addr) ) );
   26124             delta += alen+1;
   26125             DIP( "vpinsrd $%d,%s,%s,%s\n",
   26126                  imm8_10, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
   26127          }
   26128 
   26129          IRTemp src_vec = newTemp(Ity_V128);
   26130          assign(src_vec, getXMMReg( rV ));
   26131          IRTemp res_vec = math_PINSRD_128( src_vec, src_u32, imm8_10 );
   26132          putYMMRegLoAndZU( rG, mkexpr(res_vec) );
   26133          *uses_vvvv = True;
   26134          goto decode_success;
   26135       }
   26136       /* VPINSRQ r64/m64, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W1 22 /r ib */
   26137       if (have66noF2noF3(pfx)
   26138           && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
   26139          UChar  modrm = getUChar(delta);
   26140          UInt   rG    = gregOfRexRM(pfx, modrm);
   26141          UInt   rV    = getVexNvvvv(pfx);
   26142          Int    imm8_0;
   26143          IRTemp src_u64 = newTemp(Ity_I64);
   26144 
   26145          if ( epartIsReg( modrm ) ) {
   26146             UInt rE = eregOfRexRM(pfx,modrm);
   26147             imm8_0 = (Int)(getUChar(delta+1) & 1);
   26148             assign( src_u64, getIReg64( rE ) );
   26149             delta += 1+1;
   26150             DIP( "vpinsrq $%d,%s,%s,%s\n",
   26151                  imm8_0, nameIReg64(rE), nameXMMReg(rV), nameXMMReg(rG) );
   26152          } else {
   26153             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   26154             imm8_0 = (Int)(getUChar(delta+alen) & 1);
   26155             assign( src_u64, loadLE( Ity_I64, mkexpr(addr) ) );
   26156             delta += alen+1;
   26157             DIP( "vpinsrd $%d,%s,%s,%s\n",
   26158                  imm8_0, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
   26159          }
   26160 
   26161          IRTemp src_vec = newTemp(Ity_V128);
   26162          assign(src_vec, getXMMReg( rV ));
   26163          IRTemp res_vec = math_PINSRQ_128( src_vec, src_u64, imm8_0 );
   26164          putYMMRegLoAndZU( rG, mkexpr(res_vec) );
   26165          *uses_vvvv = True;
   26166          goto decode_success;
   26167       }
   26168       break;
   26169 
   case 0x40:
      /* VDPPS imm8, xmm3/m128,xmm2,xmm1 = VEX.NDS.128.66.0F3A.WIG 40 /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm   = getUChar(delta);
         UInt   rG      = gregOfRexRM(pfx, modrm);
         UInt   rV      = getVexNvvvv(pfx);
         IRTemp dst_vec = newTemp(Ity_V128);
         Int    imm8;
         if (epartIsReg( modrm )) {
            /* Register source: imm8 immediately follows the modrm byte. */
            UInt rE = eregOfRexRM(pfx,modrm);
            imm8 = (Int)getUChar(delta+1);
            assign( dst_vec, getXMMReg( rE ) );
            delta += 1+1;
            DIP( "vdpps $%d,%s,%s,%s\n",
                 imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
         } else {
            /* Memory source: imm8 is 1 byte after the amode. */
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            imm8 = (Int)getUChar(delta+alen);
            assign( dst_vec, loadLE( Ity_V128, mkexpr(addr) ) );
            delta += alen+1;
            DIP( "vdpps $%d,%s,%s,%s\n",
                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
         }

         /* 128-bit form: compute the dot product and zero the upper
            half of the YMM destination. */
         IRTemp src_vec = newTemp(Ity_V128);
         assign(src_vec, getXMMReg( rV ));
         IRTemp res_vec = math_DPPS_128( src_vec, dst_vec, imm8 );
         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VDPPS imm8, ymm3/m256,ymm2,ymm1 = VEX.NDS.256.66.0F3A.WIG 40 /r ib */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar  modrm   = getUChar(delta);
         UInt   rG      = gregOfRexRM(pfx, modrm);
         UInt   rV      = getVexNvvvv(pfx);
         IRTemp dst_vec = newTemp(Ity_V256);
         Int    imm8;
         if (epartIsReg( modrm )) {
            /* Register source: imm8 immediately follows the modrm byte. */
            UInt rE = eregOfRexRM(pfx,modrm);
            imm8 = (Int)getUChar(delta+1);
            assign( dst_vec, getYMMReg( rE ) );
            delta += 1+1;
            DIP( "vdpps $%d,%s,%s,%s\n",
                 imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG) );
         } else {
            /* Memory source: imm8 is 1 byte after the amode. */
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            imm8 = (Int)getUChar(delta+alen);
            assign( dst_vec, loadLE( Ity_V256, mkexpr(addr) ) );
            delta += alen+1;
            DIP( "vdpps $%d,%s,%s,%s\n",
                 imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
         }

         /* 256-bit form: no 256-wide helper exists, so split both
            operands into 128-bit lanes, do two independent 128-bit
            DPPS operations, and rejoin the halves. */
         IRTemp src_vec = newTemp(Ity_V256);
         assign(src_vec, getYMMReg( rV ));
         IRTemp s0, s1, d0, d1;
         s0 = s1 = d0 = d1 = IRTemp_INVALID;
         breakupV256toV128s( dst_vec, &d1, &d0 );
         breakupV256toV128s( src_vec, &s1, &s0 );
         putYMMReg( rG, binop( Iop_V128HLtoV256,
                               mkexpr( math_DPPS_128(s1, d1, imm8) ),
                               mkexpr( math_DPPS_128(s0, d0, imm8) ) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   26237 
   case 0x41:
      /* VDPPD imm8, xmm3/m128,xmm2,xmm1 = VEX.NDS.128.66.0F3A.WIG 41 /r ib */
      /* Note: DPPD has no 256-bit form; only VEX.128 is decoded here. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm   = getUChar(delta);
         UInt   rG      = gregOfRexRM(pfx, modrm);
         UInt   rV      = getVexNvvvv(pfx);
         IRTemp dst_vec = newTemp(Ity_V128);
         Int    imm8;
         if (epartIsReg( modrm )) {
            /* Register source: imm8 immediately follows the modrm byte. */
            UInt rE = eregOfRexRM(pfx,modrm);
            imm8 = (Int)getUChar(delta+1);
            assign( dst_vec, getXMMReg( rE ) );
            delta += 1+1;
            DIP( "vdppd $%d,%s,%s,%s\n",
                 imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
         } else {
            /* Memory source: imm8 is 1 byte after the amode. */
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            imm8 = (Int)getUChar(delta+alen);
            assign( dst_vec, loadLE( Ity_V128, mkexpr(addr) ) );
            delta += alen+1;
            DIP( "vdppd $%d,%s,%s,%s\n",
                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
         }

         /* Compute and write back, zeroing the upper YMM half. */
         IRTemp src_vec = newTemp(Ity_V128);
         assign(src_vec, getXMMReg( rV ));
         IRTemp res_vec = math_DPPD_128( src_vec, dst_vec, imm8 );
         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   26270 
   case 0x42:
      /* VMPSADBW imm8, xmm3/m128,xmm2,xmm1 */
      /* VMPSADBW = VEX.NDS.128.66.0F3A.WIG 42 /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm   = getUChar(delta);
         Int    imm8;
         IRTemp src_vec = newTemp(Ity_V128);
         IRTemp dst_vec = newTemp(Ity_V128);
         UInt   rG      = gregOfRexRM(pfx, modrm);
         UInt   rV      = getVexNvvvv(pfx);

         /* First (block) operand comes from the vvvv register. */
         assign( dst_vec, getXMMReg(rV) );

         if ( epartIsReg( modrm ) ) {
            UInt rE = eregOfRexRM(pfx, modrm);

            /* Register source: imm8 immediately follows the modrm byte. */
            imm8 = (Int)getUChar(delta+1);
            assign( src_vec, getXMMReg(rE) );
            delta += 1+1;
            DIP( "vmpsadbw $%d, %s,%s,%s\n", imm8,
                 nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
                             1/* imm8 is 1 byte after the amode */ );
            assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
            imm8 = (Int)getUChar(delta+alen);
            delta += alen+1;
            DIP( "vmpsadbw $%d, %s,%s,%s\n", imm8,
                 dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
         }

         /* Compute the sums of absolute differences and write back,
            zeroing the upper YMM half. */
         putYMMRegLoAndZU( rG, mkexpr( math_MPSADBW_128(dst_vec,
                                                        src_vec, imm8) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   26308 
   case 0x44:
      /* VPCLMULQDQ imm8, xmm3/m128,xmm2,xmm1 */
      /* VPCLMULQDQ = VEX.NDS.128.66.0F3A.WIG 44 /r ib */
      /* 66 0F 3A 44 /r ib = PCLMULQDQ xmm1, xmm2/m128, imm8
       * Carry-less multiplication of selected XMM quadwords into XMM
       * registers (a.k.a multiplication of polynomials over GF(2))
       */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         Int imm8;
         IRTemp sV    = newTemp(Ity_V128);
         IRTemp dV    = newTemp(Ity_V128);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);

         /* First operand comes from the vvvv register. */
         assign( dV, getXMMReg(rV) );

         if ( epartIsReg( modrm ) ) {
            /* Register source: imm8 immediately follows the modrm byte. */
            UInt rE = eregOfRexRM(pfx, modrm);
            imm8 = (Int)getUChar(delta+1);
            assign( sV, getXMMReg(rE) );
            delta += 1+1;
            DIP( "vpclmulqdq $%d, %s,%s,%s\n", imm8,
                 nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
                             1/* imm8 is 1 byte after the amode */ );
            assign( sV, loadLE( Ity_V128, mkexpr(addr) ) );
            imm8 = (Int)getUChar(delta+alen);
            delta += alen+1;
            DIP( "vpclmulqdq $%d, %s,%s,%s\n",
                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
         }

         /* Compute the carry-less product and write back, zeroing the
            upper YMM half. */
         putYMMRegLoAndZU( rG, mkexpr( math_PCLMULQDQ(dV, sV, imm8) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   26348 
   case 0x4A:
      /* VBLENDVPS xmmG, xmmE/memE, xmmV, xmmIS4
         ::: xmmG:V128 = PBLEND(xmmE, xmmV, xmmIS4) (RMVR) */
      /* VBLENDVPS = VEX.NDS.128.66.0F3A.WIG 4A /r /is4 */
      /* Delegates to the common VBLENDV helper; element size 4 and a
         32-bit arithmetic right shift extract the per-lane mask sign
         bit. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VBLENDV_128 ( vbi, pfx, delta,
                                   "vblendvps", 4, Iop_SarN32x4 );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VBLENDVPS ymmG, ymmE/memE, ymmV, ymmIS4
         ::: ymmG:V256 = PBLEND(ymmE, ymmV, ymmIS4) (RMVR) */
      /* VBLENDVPS = VEX.NDS.256.66.0F3A.WIG 4A /r /is4 */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VBLENDV_256 ( vbi, pfx, delta,
                                   "vblendvps", 4, Iop_SarN32x4 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   26369 
   case 0x4B:
      /* VBLENDVPD xmmG, xmmE/memE, xmmV, xmmIS4
         ::: xmmG:V128 = PBLEND(xmmE, xmmV, xmmIS4) (RMVR) */
      /* VBLENDVPD = VEX.NDS.128.66.0F3A.WIG 4B /r /is4 */
      /* As for VBLENDVPS above, but with 8-byte elements and a 64-bit
         arithmetic right shift to extract the per-lane mask sign bit. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VBLENDV_128 ( vbi, pfx, delta,
                                   "vblendvpd", 8, Iop_SarN64x2 );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VBLENDVPD ymmG, ymmE/memE, ymmV, ymmIS4
         ::: ymmG:V256 = PBLEND(ymmE, ymmV, ymmIS4) (RMVR) */
      /* VBLENDVPD = VEX.NDS.256.66.0F3A.WIG 4B /r /is4 */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VBLENDV_256 ( vbi, pfx, delta,
                                   "vblendvpd", 8, Iop_SarN64x2 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   26390 
   case 0x4C:
      /* VPBLENDVB xmmG, xmmE/memE, xmmV, xmmIS4
         ::: xmmG:V128 = PBLEND(xmmE, xmmV, xmmIS4) (RMVR) */
      /* VPBLENDVB = VEX.NDS.128.66.0F3A.WIG 4C /r /is4 */
      /* Byte-granularity blend: element size 1, 8-bit arithmetic right
         shift for the per-byte mask sign bit.  Only VEX.128 here. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VBLENDV_128 ( vbi, pfx, delta,
                                   "vpblendvb", 1, Iop_SarN8x16 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   26402 
   case 0x60:
   case 0x61:
   case 0x62:
   case 0x63:
      /* VEX.128.66.0F3A.WIG 63 /r ib = VPCMPISTRI imm8, xmm2/m128, xmm1
         VEX.128.66.0F3A.WIG 62 /r ib = VPCMPISTRM imm8, xmm2/m128, xmm1
         VEX.128.66.0F3A.WIG 61 /r ib = VPCMPESTRI imm8, xmm2/m128, xmm1
         VEX.128.66.0F3A.WIG 60 /r ib = VPCMPESTRM imm8, xmm2/m128, xmm1
         (selected special cases that actually occur in glibc,
          not by any means a complete implementation.)
      */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Long delta0 = delta;
         delta = dis_PCMPxSTRx( vbi, pfx, delta, True/*isAvx*/, opc );
         /* dis_PCMPxSTRx signals failure by leaving delta unchanged;
            only an advanced delta counts as a successful decode. */
         if (delta > delta0) goto decode_success;
         /* else fall though; dis_PCMPxSTRx failed to decode it */
      }
      break;
   26421 
   case 0xDF:
      /* VAESKEYGENASSIST imm8, xmm2/m128, xmm1 = VEX.128.66.0F3A.WIG DF /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         /* This is the VEX-encoded (AVX) form, so isAvx is True.  (The
            old comment here said "!isAvx", which contradicted the
            argument actually passed.) */
         delta = dis_AESKEYGENASSIST( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      break;
   26429 
   default:
      break;

   }

   /* Decode failure: per the dis_ESC_* convention, signal it by
      returning the caller's original delta unchanged. */
  //decode_failure:
   return deltaIN;

  decode_success:
   return delta;
}
   26441 
   26442 
   26443 /*------------------------------------------------------------*/
   26444 /*---                                                      ---*/
   26445 /*--- Disassemble a single instruction                     ---*/
   26446 /*---                                                      ---*/
   26447 /*------------------------------------------------------------*/
   26448 
   26449 /* Disassemble a single instruction into IR.  The instruction is
   26450    located in host memory at &guest_code[delta]. */
   26451 
   26452 static
   26453 DisResult disInstr_AMD64_WRK (
   26454              /*OUT*/Bool* expect_CAS,
   26455              Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
   26456              Bool         resteerCisOk,
   26457              void*        callback_opaque,
   26458              Long         delta64,
   26459              VexArchInfo* archinfo,
   26460              VexAbiInfo*  vbi
   26461           )
   26462 {
   26463    IRTemp    t1, t2, t3, t4, t5, t6;
   26464    UChar     pre;
   26465    Int       n, n_prefixes;
   26466    DisResult dres;
   26467 
   26468    /* The running delta */
   26469    Long delta = delta64;
   26470 
   26471    /* Holds eip at the start of the insn, so that we can print
   26472       consistent error messages for unimplemented insns. */
   26473    Long delta_start = delta;
   26474 
   26475    /* sz denotes the nominal data-op size of the insn; we change it to
   26476       2 if an 0x66 prefix is seen and 8 if REX.W is 1.  In case of
   26477       conflict REX.W takes precedence. */
   26478    Int sz = 4;
   26479 
   26480    /* pfx holds the summary of prefixes. */
   26481    Prefix pfx = PFX_EMPTY;
   26482 
   26483    /* Holds the computed opcode-escape indication. */
   26484    Escape esc = ESC_NONE;
   26485 
   26486    /* Set result defaults. */
   26487    dres.whatNext    = Dis_Continue;
   26488    dres.len         = 0;
   26489    dres.continueAt  = 0;
   26490    dres.jk_StopHere = Ijk_INVALID;
   26491    *expect_CAS = False;
   26492 
   26493    vassert(guest_RIP_next_assumed == 0);
   26494    vassert(guest_RIP_next_mustcheck == False);
   26495 
   26496    t1 = t2 = t3 = t4 = t5 = t6 = IRTemp_INVALID;
   26497 
   26498    DIP("\t0x%llx:  ", guest_RIP_bbstart+delta);
   26499 
   26500    /* Spot "Special" instructions (see comment at top of file). */
   26501    {
   26502       UChar* code = (UChar*)(guest_code + delta);
   26503       /* Spot the 16-byte preamble:
   26504          48C1C703   rolq $3,  %rdi
   26505          48C1C70D   rolq $13, %rdi
   26506          48C1C73D   rolq $61, %rdi
   26507          48C1C733   rolq $51, %rdi
   26508       */
   26509       if (code[ 0] == 0x48 && code[ 1] == 0xC1 && code[ 2] == 0xC7
   26510                                                && code[ 3] == 0x03 &&
   26511           code[ 4] == 0x48 && code[ 5] == 0xC1 && code[ 6] == 0xC7
   26512                                                && code[ 7] == 0x0D &&
   26513           code[ 8] == 0x48 && code[ 9] == 0xC1 && code[10] == 0xC7
   26514                                                && code[11] == 0x3D &&
   26515           code[12] == 0x48 && code[13] == 0xC1 && code[14] == 0xC7
   26516                                                && code[15] == 0x33) {
   26517          /* Got a "Special" instruction preamble.  Which one is it? */
   26518          if (code[16] == 0x48 && code[17] == 0x87
   26519                               && code[18] == 0xDB /* xchgq %rbx,%rbx */) {
   26520             /* %RDX = client_request ( %RAX ) */
   26521             DIP("%%rdx = client_request ( %%rax )\n");
   26522             delta += 19;
   26523             jmp_lit(&dres, Ijk_ClientReq, guest_RIP_bbstart+delta);
   26524             vassert(dres.whatNext == Dis_StopHere);
   26525             goto decode_success;
   26526          }
   26527          else
   26528          if (code[16] == 0x48 && code[17] == 0x87
   26529                               && code[18] == 0xC9 /* xchgq %rcx,%rcx */) {
   26530             /* %RAX = guest_NRADDR */
   26531             DIP("%%rax = guest_NRADDR\n");
   26532             delta += 19;
   26533             putIRegRAX(8, IRExpr_Get( OFFB_NRADDR, Ity_I64 ));
   26534             goto decode_success;
   26535          }
   26536          else
   26537          if (code[16] == 0x48 && code[17] == 0x87
   26538                               && code[18] == 0xD2 /* xchgq %rdx,%rdx */) {
   26539             /* call-noredir *%RAX */
   26540             DIP("call-noredir *%%rax\n");
   26541             delta += 19;
   26542             t1 = newTemp(Ity_I64);
   26543             assign(t1, getIRegRAX(8));
   26544             t2 = newTemp(Ity_I64);
   26545             assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
   26546             putIReg64(R_RSP, mkexpr(t2));
   26547             storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta));
   26548             jmp_treg(&dres, Ijk_NoRedir, t1);
   26549             vassert(dres.whatNext == Dis_StopHere);
   26550             goto decode_success;
   26551          }
   26552          /* We don't know what it is. */
   26553          goto decode_failure;
   26554          /*NOTREACHED*/
   26555       }
   26556    }
   26557 
   26558    /* Eat prefixes, summarising the result in pfx and sz, and rejecting
   26559       as many invalid combinations as possible. */
   26560    n_prefixes = 0;
   26561    while (True) {
   26562       if (n_prefixes > 7) goto decode_failure;
   26563       pre = getUChar(delta);
   26564       switch (pre) {
   26565          case 0x66: pfx |= PFX_66; break;
   26566          case 0x67: pfx |= PFX_ASO; break;
   26567          case 0xF2: pfx |= PFX_F2; break;
   26568          case 0xF3: pfx |= PFX_F3; break;
   26569          case 0xF0: pfx |= PFX_LOCK; *expect_CAS = True; break;
   26570          case 0x2E: pfx |= PFX_CS; break;
   26571          case 0x3E: pfx |= PFX_DS; break;
   26572          case 0x26: pfx |= PFX_ES; break;
   26573          case 0x64: pfx |= PFX_FS; break;
   26574          case 0x65: pfx |= PFX_GS; break;
   26575          case 0x36: pfx |= PFX_SS; break;
   26576          case 0x40 ... 0x4F:
   26577             pfx |= PFX_REX;
   26578             if (pre & (1<<3)) pfx |= PFX_REXW;
   26579             if (pre & (1<<2)) pfx |= PFX_REXR;
   26580             if (pre & (1<<1)) pfx |= PFX_REXX;
   26581             if (pre & (1<<0)) pfx |= PFX_REXB;
   26582             break;
   26583          default:
   26584             goto not_a_legacy_prefix;
   26585       }
   26586       n_prefixes++;
   26587       delta++;
   26588    }
   26589 
   26590    not_a_legacy_prefix:
   26591    /* We've used up all the non-VEX prefixes.  Parse and validate a
   26592       VEX prefix if that's appropriate. */
   26593    if (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX) {
   26594       /* Used temporarily for holding VEX prefixes. */
   26595       UChar vex0 = getUChar(delta);
   26596       if (vex0 == 0xC4) {
   26597          /* 3-byte VEX */
   26598          UChar vex1 = getUChar(delta+1);
   26599          UChar vex2 = getUChar(delta+2);
   26600          delta += 3;
   26601          pfx |= PFX_VEX;
   26602          /* Snarf contents of byte 1 */
   26603          /* R */ pfx |= (vex1 & (1<<7)) ? 0 : PFX_REXR;
   26604          /* X */ pfx |= (vex1 & (1<<6)) ? 0 : PFX_REXX;
   26605          /* B */ pfx |= (vex1 & (1<<5)) ? 0 : PFX_REXB;
   26606          /* m-mmmm */
   26607          switch (vex1 & 0x1F) {
   26608             case 1: esc = ESC_0F;   break;
   26609             case 2: esc = ESC_0F38; break;
   26610             case 3: esc = ESC_0F3A; break;
   26611             /* Any other m-mmmm field will #UD */
   26612             default: goto decode_failure;
   26613          }
   26614          /* Snarf contents of byte 2 */
   26615          /* W */    pfx |= (vex2 & (1<<7)) ? PFX_REXW : 0;
   26616          /* ~v3 */  pfx |= (vex2 & (1<<6)) ? 0 : PFX_VEXnV3;
   26617          /* ~v2 */  pfx |= (vex2 & (1<<5)) ? 0 : PFX_VEXnV2;
   26618          /* ~v1 */  pfx |= (vex2 & (1<<4)) ? 0 : PFX_VEXnV1;
   26619          /* ~v0 */  pfx |= (vex2 & (1<<3)) ? 0 : PFX_VEXnV0;
   26620          /* L */    pfx |= (vex2 & (1<<2)) ? PFX_VEXL : 0;
   26621          /* pp */
   26622          switch (vex2 & 3) {
   26623             case 0: break;
   26624             case 1: pfx |= PFX_66; break;
   26625             case 2: pfx |= PFX_F3; break;
   26626             case 3: pfx |= PFX_F2; break;
   26627             default: vassert(0);
   26628          }
   26629       }
   26630       else if (vex0 == 0xC5) {
   26631          /* 2-byte VEX */
   26632          UChar vex1 = getUChar(delta+1);
   26633          delta += 2;
   26634          pfx |= PFX_VEX;
   26635          /* Snarf contents of byte 1 */
   26636          /* R */    pfx |= (vex1 & (1<<7)) ? 0 : PFX_REXR;
   26637          /* ~v3 */  pfx |= (vex1 & (1<<6)) ? 0 : PFX_VEXnV3;
   26638          /* ~v2 */  pfx |= (vex1 & (1<<5)) ? 0 : PFX_VEXnV2;
   26639          /* ~v1 */  pfx |= (vex1 & (1<<4)) ? 0 : PFX_VEXnV1;
   26640          /* ~v0 */  pfx |= (vex1 & (1<<3)) ? 0 : PFX_VEXnV0;
   26641          /* L */    pfx |= (vex1 & (1<<2)) ? PFX_VEXL : 0;
   26642          /* pp */
   26643          switch (vex1 & 3) {
   26644             case 0: break;
   26645             case 1: pfx |= PFX_66; break;
   26646             case 2: pfx |= PFX_F3; break;
   26647             case 3: pfx |= PFX_F2; break;
   26648             default: vassert(0);
   26649          }
   26650          /* implied: */
   26651          esc = ESC_0F;
   26652       }
   26653       /* Can't have both VEX and REX */
   26654       if ((pfx & PFX_VEX) && (pfx & PFX_REX))
   26655          goto decode_failure; /* can't have both */
   26656    }
   26657 
   26658    /* Dump invalid combinations */
   26659    n = 0;
   26660    if (pfx & PFX_F2) n++;
   26661    if (pfx & PFX_F3) n++;
   26662    if (n > 1)
   26663       goto decode_failure; /* can't have both */
   26664 
   26665    n = 0;
   26666    if (pfx & PFX_CS) n++;
   26667    if (pfx & PFX_DS) n++;
   26668    if (pfx & PFX_ES) n++;
   26669    if (pfx & PFX_FS) n++;
   26670    if (pfx & PFX_GS) n++;
   26671    if (pfx & PFX_SS) n++;
   26672    if (n > 1)
   26673       goto decode_failure; /* multiple seg overrides == illegal */
   26674 
   26675    /* We have a %fs prefix.  Reject it if there's no evidence in 'vbi'
   26676       that we should accept it. */
   26677    if ((pfx & PFX_FS) && !vbi->guest_amd64_assume_fs_is_zero)
   26678       goto decode_failure;
   26679 
   26680    /* Ditto for %gs prefixes. */
   26681    if ((pfx & PFX_GS) && !vbi->guest_amd64_assume_gs_is_0x60)
   26682       goto decode_failure;
   26683 
   26684    /* Set up sz. */
   26685    sz = 4;
   26686    if (pfx & PFX_66) sz = 2;
   26687    if ((pfx & PFX_REX) && (pfx & PFX_REXW)) sz = 8;
   26688 
   26689    /* Now we should be looking at the primary opcode byte or the
   26690       leading escapes.  Check that any LOCK prefix is actually
   26691       allowed. */
   26692    if (pfx & PFX_LOCK) {
   26693       if (can_be_used_with_LOCK_prefix( (UChar*)&guest_code[delta] )) {
   26694          DIP("lock ");
   26695       } else {
   26696          *expect_CAS = False;
   26697          goto decode_failure;
   26698       }
   26699    }
   26700 
   26701    /* Eat up opcode escape bytes, until we're really looking at the
   26702       primary opcode byte.  But only if there's no VEX present. */
   26703    if (!(pfx & PFX_VEX)) {
   26704       vassert(esc == ESC_NONE);
   26705       pre = getUChar(delta);
   26706       if (pre == 0x0F) {
   26707          delta++;
   26708          pre = getUChar(delta);
   26709          switch (pre) {
   26710             case 0x38: esc = ESC_0F38; delta++; break;
   26711             case 0x3A: esc = ESC_0F3A; delta++; break;
   26712             default:   esc = ESC_0F; break;
   26713          }
   26714       }
   26715    }
   26716 
   26717    /* So now we're really really looking at the primary opcode
   26718       byte. */
   26719    Long delta_at_primary_opcode = delta;
   26720 
   26721    if (!(pfx & PFX_VEX)) {
   26722       /* Handle non-VEX prefixed instructions.  "Legacy" (non-VEX) SSE
   26723          instructions preserve the upper 128 bits of YMM registers;
   26724          iow we can simply ignore the presence of the upper halves of
   26725          these registers. */
   26726       switch (esc) {
   26727          case ESC_NONE:
   26728             delta = dis_ESC_NONE( &dres, expect_CAS,
   26729                                   resteerOkFn, resteerCisOk, callback_opaque,
   26730                                   archinfo, vbi, pfx, sz, delta );
   26731             break;
   26732          case ESC_0F:
   26733             delta = dis_ESC_0F  ( &dres, expect_CAS,
   26734                                   resteerOkFn, resteerCisOk, callback_opaque,
   26735                                   archinfo, vbi, pfx, sz, delta );
   26736             break;
   26737          case ESC_0F38:
   26738             delta = dis_ESC_0F38( &dres,
   26739                                   resteerOkFn, resteerCisOk, callback_opaque,
   26740                                   archinfo, vbi, pfx, sz, delta );
   26741             break;
   26742          case ESC_0F3A:
   26743             delta = dis_ESC_0F3A( &dres,
   26744                                   resteerOkFn, resteerCisOk, callback_opaque,
   26745                                   archinfo, vbi, pfx, sz, delta );
   26746             break;
   26747          default:
   26748             vassert(0);
   26749       }
   26750    } else {
   26751       /* VEX prefixed instruction */
   26752       /* Sloppy Intel wording: "An instruction encoded with a VEX.128
   26753          prefix that loads a YMM register operand ..." zeroes out bits
   26754          128 and above of the register. */
   26755       Bool uses_vvvv = False;
   26756       switch (esc) {
   26757          case ESC_0F:
   26758             delta = dis_ESC_0F__VEX ( &dres, &uses_vvvv,
   26759                                       resteerOkFn, resteerCisOk,
   26760                                       callback_opaque,
   26761                                       archinfo, vbi, pfx, sz, delta );
   26762             break;
   26763          case ESC_0F38:
   26764             delta = dis_ESC_0F38__VEX ( &dres, &uses_vvvv,
   26765                                         resteerOkFn, resteerCisOk,
   26766                                         callback_opaque,
   26767                                         archinfo, vbi, pfx, sz, delta );
   26768             break;
   26769          case ESC_0F3A:
   26770             delta = dis_ESC_0F3A__VEX ( &dres, &uses_vvvv,
   26771                                         resteerOkFn, resteerCisOk,
   26772                                         callback_opaque,
   26773                                         archinfo, vbi, pfx, sz, delta );
   26774             break;
   26775          case ESC_NONE:
   26776             /* The presence of a VEX prefix, by Intel definition,
   26777                always implies at least an 0F escape. */
   26778             goto decode_failure;
   26779          default:
   26780             vassert(0);
   26781       }
   26782       /* If the insn doesn't use VEX.vvvv then it must be all ones.
   26783          Check this. */
   26784       if (!uses_vvvv) {
   26785          if (getVexNvvvv(pfx) != 0)
   26786             goto decode_failure;
   26787       }
   26788    }
   26789 
   26790    vassert(delta - delta_at_primary_opcode >= 0);
   26791    vassert(delta - delta_at_primary_opcode < 16/*let's say*/);
   26792 
   26793    /* Use delta == delta_at_primary_opcode to denote decode failure.
   26794       This implies that any successful decode must use at least one
   26795       byte up. */
   26796    if (delta == delta_at_primary_opcode)
   26797       goto decode_failure;
   26798    else
   26799       goto decode_success; /* \o/ */
   26800 
   26801 #if 0 /* XYZZY */
   26802 
   26803    /* ---------------------------------------------------- */
   26804    /* --- The SSE/SSE2 decoder.                        --- */
   26805    /* ---------------------------------------------------- */
   26806 
   26807    /* What did I do to deserve SSE ?  Perhaps I was really bad in a
   26808       previous life? */
   26809 
   26810    /* Note, this doesn't handle SSE3 right now.  All amd64s support
   26811       SSE2 as a minimum so there is no point distinguishing SSE1 vs
   26812       SSE2. */
   26813 
   26814    insn = (UChar*)&guest_code[delta];
   26815 
   26816    /* FXSAVE is spuriously at the start here only because it is
   26817       thusly placed in guest-x86/toIR.c. */
   26818 
   26819    /* ------ SSE decoder main ------ */
   26820 
   26821    /* ---------------------------------------------------- */
   26822    /* --- end of the SSE decoder.                      --- */
   26823    /* ---------------------------------------------------- */
   26824 
   26825    /* ---------------------------------------------------- */
   26826    /* --- start of the SSE2 decoder.                   --- */
   26827    /* ---------------------------------------------------- */
   26828 
   26829    /* ---------------------------------------------------- */
   26830    /* --- end of the SSE/SSE2 decoder.                 --- */
   26831    /* ---------------------------------------------------- */
   26832 
   26833    /* ---------------------------------------------------- */
   26834    /* --- start of the SSE3 decoder.                   --- */
   26835    /* ---------------------------------------------------- */
   26836 
   26837    /* ---------------------------------------------------- */
   26838    /* --- end of the SSE3 decoder.                     --- */
   26839    /* ---------------------------------------------------- */
   26840 
   26841    /* ---------------------------------------------------- */
   26842    /* --- start of the SSSE3 decoder.                  --- */
   26843    /* ---------------------------------------------------- */
   26844 
   26845    /* ---------------------------------------------------- */
   26846    /* --- end of the SSSE3 decoder.                    --- */
   26847    /* ---------------------------------------------------- */
   26848 
   26849    /* ---------------------------------------------------- */
   26850    /* --- start of the SSE4 decoder                    --- */
   26851    /* ---------------------------------------------------- */
   26852 
   26853    /* ---------------------------------------------------- */
   26854    /* --- end of the SSE4 decoder                      --- */
   26855    /* ---------------------------------------------------- */
   26856 
   26857    /*after_sse_decoders:*/
   26858 
   26859    /* Get the primary opcode. */
   26860    opc = getUChar(delta); delta++;
   26861 
   26862    /* We get here if the current insn isn't SSE, or this CPU doesn't
   26863       support SSE. */
   26864 
   26865    switch (opc) {
   26866 
   26867    /* ------------------------ Control flow --------------- */
   26868 
   26869    /* ------------------------ CWD/CDQ -------------------- */
   26870 
   26871    /* ------------------------ FPU ops -------------------- */
   26872 
   26873    /* ------------------------ INT ------------------------ */
   26874 
   26875    case 0xCD: { /* INT imm8 */
   26876       IRJumpKind jk = Ijk_Boring;
   26877       if (have66orF2orF3(pfx)) goto decode_failure;
   26878       d64 = getUChar(delta); delta++;
   26879       switch (d64) {
   26880          case 32: jk = Ijk_Sys_int32; break;
   26881          default: goto decode_failure;
   26882       }
   26883       guest_RIP_next_mustcheck = True;
   26884       guest_RIP_next_assumed = guest_RIP_bbstart + delta;
   26885       jmp_lit(jk, guest_RIP_next_assumed);
   26886       /* It's important that all ArchRegs carry their up-to-date value
   26887          at this point.  So we declare an end-of-block here, which
   26888          forces any TempRegs caching ArchRegs to be flushed. */
   26889       vassert(dres.whatNext == Dis_StopHere);
   26890       DIP("int $0x%02x\n", (UInt)d64);
   26891       break;
   26892    }
   26893 
   26894    /* ------------------------ Jcond, byte offset --------- */
   26895 
   26896    /* ------------------------ IMUL ----------------------- */
   26897 
   26898    /* ------------------------ MOV ------------------------ */
   26899 
   26900    /* ------------------------ MOVx ------------------------ */
   26901 
   26902    /* ------------------------ opl imm, A ----------------- */
   26903 
   26904    /* ------------------------ opl Ev, Gv ----------------- */
   26905 
   26906    /* ------------------------ opl Gv, Ev ----------------- */
   26907 
   26908    /* ------------------------ POP ------------------------ */
   26909 
   26910    /* ------------------------ PUSH ----------------------- */
   26911 
   26912    /* ------ AE: SCAS variants ------ */
   26913 
   26914    /* ------ A6, A7: CMPS variants ------ */
   26915 
   26916    /* ------ AA, AB: STOS variants ------ */
   26917 
   26918    /* ------ A4, A5: MOVS variants ------ */
   26919 
   26920    /* ------------------------ XCHG ----------------------- */
   26921 
   26922    /* ------------------------ IN / OUT ----------------------- */
   26923 
   26924    /* ------------------------ (Grp1 extensions) ---------- */
   26925 
   26926    /* ------------------------ (Grp2 extensions) ---------- */
   26927 
   26928    /* ------------------------ (Grp3 extensions) ---------- */
   26929 
   26930    /* ------------------------ (Grp4 extensions) ---------- */
   26931 
   26932    /* ------------------------ (Grp5 extensions) ---------- */
   26933 
   26934    /* ------------------------ Escapes to 2-byte opcodes -- */
   26935 
   26936    case 0x0F: {
   26937       opc = getUChar(delta); delta++;
   26938       switch (opc) {
   26939 
   26940       /* =-=-=-=-=-=-=-=-=- Grp8 =-=-=-=-=-=-=-=-=-=-=-= */
   26941 
   26942       /* =-=-=-=-=-=-=-=-=- BSF/BSR -=-=-=-=-=-=-=-=-=-= */
   26943 
   26944       /* =-=-=-=-=-=-=-=-=- BSWAP -=-=-=-=-=-=-=-=-=-=-= */
   26945 
   26946       /* =-=-=-=-=-=-=-=-=- BT/BTS/BTR/BTC =-=-=-=-=-=-= */
   26947 
   26948       /* =-=-=-=-=-=-=-=-=- CMOV =-=-=-=-=-=-=-=-=-=-=-= */
   26949 
   26950       /* =-=-=-=-=-=-=-=-=- CMPXCHG -=-=-=-=-=-=-=-=-=-= */
   26951 
   26952       /* =-=-=-=-=-=-=-=-=- CPUID -=-=-=-=-=-=-=-=-=-=-= */
   26953 
   26954       /* =-=-=-=-=-=-=-=-=- MOVZX, MOVSX =-=-=-=-=-=-=-= */
   26955 
   26956       /* =-=-=-=-=-=-=-=-=- MUL/IMUL =-=-=-=-=-=-=-=-=-= */
   26957 
   26958       /* =-=-=-=-=-=-=-=-=- NOPs =-=-=-=-=-=-=-=-=-=-=-= */
   26959 
   26960       /* =-=-=-=-=-=-=-=-=- Jcond d32 -=-=-=-=-=-=-=-=-= */
   26961 
   26962       /* =-=-=-=-=-=-=-=-=- PREFETCH =-=-=-=-=-=-=-=-=-= */
   26963 
   26964       /* =-=-=-=-=-=-=-=-=- RDTSC -=-=-=-=-=-=-=-=-=-=-= */
   26965 
   26966       /* =-=-=-=-=-=-=-=-=- SETcc Eb =-=-=-=-=-=-=-=-=-= */
   26967 
   26968       /* =-=-=-=-=-=-=-=-=- SHLD/SHRD -=-=-=-=-=-=-=-=-= */
   26969 
   26970       /* =-=-=-=-=-=-=-=-=- SYSCALL -=-=-=-=-=-=-=-=-=-= */
   26971 
   26972       /* =-=-=-=-=-=-=-=-=- XADD -=-=-=-=-=-=-=-=-=-= */
   26973 
   26974       case 0xC0: { /* XADD Gb,Eb */
   26975          Bool decode_OK = False;
   26976          delta = dis_xadd_G_E ( &decode_OK, vbi, pfx, 1, delta );
   26977          if (!decode_OK)
   26978             goto decode_failure;
   26979          break;
   26980       }
   26981 
   26982       /* =-=-=-=-=-=-=-=-=- SGDT and SIDT =-=-=-=-=-=-=-=-=-=-= */
   26983 
   26984       /* =-=-=-=-=-=-=-=-=- unimp2 =-=-=-=-=-=-=-=-=-=-= */
   26985 
   26986       default:
   26987          goto decode_failure;
   26988    } /* switch (opc) for the 2-byte opcodes */
   26989    goto decode_success;
   26990    } /* case 0x0F: of primary opcode */
   26991 
   26992    /* ------------------------ ??? ------------------------ */
   26993 #endif /* XYZZY */
   26994 
   26995      //default:
   26996   decode_failure:
   26997    /* All decode failures end up here. */
   26998    vex_printf("vex amd64->IR: unhandled instruction bytes: "
   26999               "0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
   27000               (Int)getUChar(delta_start+0),
   27001               (Int)getUChar(delta_start+1),
   27002               (Int)getUChar(delta_start+2),
   27003               (Int)getUChar(delta_start+3),
   27004               (Int)getUChar(delta_start+4),
   27005               (Int)getUChar(delta_start+5),
   27006               (Int)getUChar(delta_start+6),
   27007               (Int)getUChar(delta_start+7) );
   27008    vex_printf("vex amd64->IR:   REX=%d REX.W=%d REX.R=%d REX.X=%d REX.B=%d\n",
   27009               haveREX(pfx) ? 1 : 0, getRexW(pfx), getRexR(pfx),
   27010               getRexX(pfx), getRexB(pfx));
   27011    vex_printf("vex amd64->IR:   VEX=%d VEX.L=%d VEX.nVVVV=0x%x ESC=%s\n",
   27012               haveVEX(pfx) ? 1 : 0, getVexL(pfx),
   27013               getVexNvvvv(pfx),
   27014               esc==ESC_NONE ? "NONE" :
   27015                 esc==ESC_0F ? "0F" :
   27016                 esc==ESC_0F38 ? "0F38" :
   27017                 esc==ESC_0F3A ? "0F3A" : "???");
   27018    vex_printf("vex amd64->IR:   PFX.66=%d PFX.F2=%d PFX.F3=%d\n",
   27019               have66(pfx) ? 1 : 0, haveF2(pfx) ? 1 : 0,
   27020               haveF3(pfx) ? 1 : 0);
   27021 
   27022    /* Tell the dispatcher that this insn cannot be decoded, and so has
   27023       not been executed, and (is currently) the next to be executed.
   27024       RIP should be up-to-date since it made so at the start of each
   27025       insn, but nevertheless be paranoid and update it again right
   27026       now. */
   27027    stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_curr_instr) ) );
   27028    jmp_lit(&dres, Ijk_NoDecode, guest_RIP_curr_instr);
   27029    vassert(dres.whatNext == Dis_StopHere);
   27030    dres.len = 0;
   27031    /* We also need to say that a CAS is not expected now, regardless
   27032       of what it might have been set to at the start of the function,
   27033       since the IR that we've emitted just above (to synthesis a
   27034       SIGILL) does not involve any CAS, and presumably no other IR has
   27035       been emitted for this (non-decoded) insn. */
   27036    *expect_CAS = False;
   27037    return dres;
   27038 
   27039    //   } /* switch (opc) for the main (primary) opcode switch. */
   27040 
   27041   decode_success:
   27042    /* All decode successes end up here. */
   27043    switch (dres.whatNext) {
   27044       case Dis_Continue:
   27045          stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_bbstart + delta) ) );
   27046          break;
   27047       case Dis_ResteerU:
   27048       case Dis_ResteerC:
   27049          stmt( IRStmt_Put( OFFB_RIP, mkU64(dres.continueAt) ) );
   27050          break;
   27051       case Dis_StopHere:
   27052          break;
   27053       default:
   27054          vassert(0);
   27055    }
   27056 
   27057    DIP("\n");
   27058    dres.len = (Int)toUInt(delta - delta_start);
   27059    return dres;
   27060 }
   27061 
   27062 #undef DIP
   27063 #undef DIS
   27064 
   27065 
   27066 /*------------------------------------------------------------*/
   27067 /*--- Top-level fn                                         ---*/
   27068 /*------------------------------------------------------------*/
   27069 
   27070 /* Disassemble a single instruction into IR.  The instruction
   27071    is located in host memory at &guest_code[delta]. */
   27072 
   27073 DisResult disInstr_AMD64 ( IRSB*        irsb_IN,
   27074                            Bool         (*resteerOkFn) ( void*, Addr64 ),
   27075                            Bool         resteerCisOk,
   27076                            void*        callback_opaque,
   27077                            UChar*       guest_code_IN,
   27078                            Long         delta,
   27079                            Addr64       guest_IP,
   27080                            VexArch      guest_arch,
   27081                            VexArchInfo* archinfo,
   27082                            VexAbiInfo*  abiinfo,
   27083                            Bool         host_bigendian_IN )
   27084 {
   27085    Int       i, x1, x2;
   27086    Bool      expect_CAS, has_CAS;
   27087    DisResult dres;
   27088 
   27089    /* Set globals (see top of this file) */
   27090    vassert(guest_arch == VexArchAMD64);
   27091    guest_code           = guest_code_IN;
   27092    irsb                 = irsb_IN;
   27093    host_is_bigendian    = host_bigendian_IN;
   27094    guest_RIP_curr_instr = guest_IP;
   27095    guest_RIP_bbstart    = guest_IP - delta;
   27096 
   27097    /* We'll consult these after doing disInstr_AMD64_WRK. */
   27098    guest_RIP_next_assumed   = 0;
   27099    guest_RIP_next_mustcheck = False;
   27100 
   27101    x1 = irsb_IN->stmts_used;
   27102    expect_CAS = False;
   27103    dres = disInstr_AMD64_WRK ( &expect_CAS, resteerOkFn,
   27104                                resteerCisOk,
   27105                                callback_opaque,
   27106                                delta, archinfo, abiinfo );
   27107    x2 = irsb_IN->stmts_used;
   27108    vassert(x2 >= x1);
   27109 
   27110    /* If disInstr_AMD64_WRK tried to figure out the next rip, check it
   27111       got it right.  Failure of this assertion is serious and denotes
   27112       a bug in disInstr. */
   27113    if (guest_RIP_next_mustcheck
   27114        && guest_RIP_next_assumed != guest_RIP_curr_instr + dres.len) {
   27115       vex_printf("\n");
   27116       vex_printf("assumed next %%rip = 0x%llx\n",
   27117                  guest_RIP_next_assumed );
   27118       vex_printf(" actual next %%rip = 0x%llx\n",
   27119                  guest_RIP_curr_instr + dres.len );
   27120       vpanic("disInstr_AMD64: disInstr miscalculated next %rip");
   27121    }
   27122 
   27123    /* See comment at the top of disInstr_AMD64_WRK for meaning of
   27124       expect_CAS.  Here, we (sanity-)check for the presence/absence of
   27125       IRCAS as directed by the returned expect_CAS value. */
   27126    has_CAS = False;
   27127    for (i = x1; i < x2; i++) {
   27128       if (irsb_IN->stmts[i]->tag == Ist_CAS)
   27129          has_CAS = True;
   27130    }
   27131 
   27132    if (expect_CAS != has_CAS) {
   27133       /* inconsistency detected.  re-disassemble the instruction so as
   27134          to generate a useful error message; then assert. */
   27135       vex_traceflags |= VEX_TRACE_FE;
   27136       dres = disInstr_AMD64_WRK ( &expect_CAS, resteerOkFn,
   27137                                   resteerCisOk,
   27138                                   callback_opaque,
   27139                                   delta, archinfo, abiinfo );
   27140       for (i = x1; i < x2; i++) {
   27141          vex_printf("\t\t");
   27142          ppIRStmt(irsb_IN->stmts[i]);
   27143          vex_printf("\n");
   27144       }
   27145       /* Failure of this assertion is serious and denotes a bug in
   27146          disInstr. */
   27147       vpanic("disInstr_AMD64: inconsistency in LOCK prefix handling");
   27148    }
   27149 
   27150    return dres;
   27151 }
   27152 
   27153 
   27154 /*------------------------------------------------------------*/
   27155 /*--- Unused stuff                                         ---*/
   27156 /*------------------------------------------------------------*/
   27157 
   27158 // A potentially more Memcheck-friendly version of gen_LZCNT, if
   27159 // this should ever be needed.
   27160 //
   27161 //static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
   27162 //{
   27163 //   /* Scheme is simple: propagate the most significant 1-bit into all
   27164 //      lower positions in the word.  This gives a word of the form
   27165 //      0---01---1.  Now invert it, giving a word of the form
//      1---10---0, then do a population-count idiom (to count the 1s,
//      which is the number of leading zeroes, or the word size if the
//      original word was 0).
   27169 //   */
   27170 //   Int i;
   27171 //   IRTemp t[7];
   27172 //   for (i = 0; i < 7; i++) {
   27173 //      t[i] = newTemp(ty);
   27174 //   }
   27175 //   if (ty == Ity_I64) {
   27176 //      assign(t[0], binop(Iop_Or64, mkexpr(src),
   27177 //                                   binop(Iop_Shr64, mkexpr(src),  mkU8(1))));
   27178 //      assign(t[1], binop(Iop_Or64, mkexpr(t[0]),
   27179 //                                   binop(Iop_Shr64, mkexpr(t[0]), mkU8(2))));
   27180 //      assign(t[2], binop(Iop_Or64, mkexpr(t[1]),
   27181 //                                   binop(Iop_Shr64, mkexpr(t[1]), mkU8(4))));
   27182 //      assign(t[3], binop(Iop_Or64, mkexpr(t[2]),
   27183 //                                   binop(Iop_Shr64, mkexpr(t[2]), mkU8(8))));
   27184 //      assign(t[4], binop(Iop_Or64, mkexpr(t[3]),
   27185 //                                   binop(Iop_Shr64, mkexpr(t[3]), mkU8(16))));
   27186 //      assign(t[5], binop(Iop_Or64, mkexpr(t[4]),
   27187 //                                   binop(Iop_Shr64, mkexpr(t[4]), mkU8(32))));
   27188 //      assign(t[6], unop(Iop_Not64, mkexpr(t[5])));
   27189 //      return gen_POPCOUNT(ty, t[6]);
   27190 //   }
   27191 //   if (ty == Ity_I32) {
   27192 //      assign(t[0], binop(Iop_Or32, mkexpr(src),
   27193 //                                   binop(Iop_Shr32, mkexpr(src),  mkU8(1))));
   27194 //      assign(t[1], binop(Iop_Or32, mkexpr(t[0]),
   27195 //                                   binop(Iop_Shr32, mkexpr(t[0]), mkU8(2))));
   27196 //      assign(t[2], binop(Iop_Or32, mkexpr(t[1]),
   27197 //                                   binop(Iop_Shr32, mkexpr(t[1]), mkU8(4))));
   27198 //      assign(t[3], binop(Iop_Or32, mkexpr(t[2]),
   27199 //                                   binop(Iop_Shr32, mkexpr(t[2]), mkU8(8))));
   27200 //      assign(t[4], binop(Iop_Or32, mkexpr(t[3]),
   27201 //                                   binop(Iop_Shr32, mkexpr(t[3]), mkU8(16))));
   27202 //      assign(t[5], unop(Iop_Not32, mkexpr(t[4])));
   27203 //      return gen_POPCOUNT(ty, t[5]);
   27204 //   }
   27205 //   if (ty == Ity_I16) {
   27206 //      assign(t[0], binop(Iop_Or16, mkexpr(src),
   27207 //                                   binop(Iop_Shr16, mkexpr(src),  mkU8(1))));
   27208 //      assign(t[1], binop(Iop_Or16, mkexpr(t[0]),
   27209 //                                   binop(Iop_Shr16, mkexpr(t[0]), mkU8(2))));
   27210 //      assign(t[2], binop(Iop_Or16, mkexpr(t[1]),
   27211 //                                   binop(Iop_Shr16, mkexpr(t[1]), mkU8(4))));
   27212 //      assign(t[3], binop(Iop_Or16, mkexpr(t[2]),
   27213 //                                   binop(Iop_Shr16, mkexpr(t[2]), mkU8(8))));
   27214 //      assign(t[4], unop(Iop_Not16, mkexpr(t[3])));
   27215 //      return gen_POPCOUNT(ty, t[4]);
   27216 //   }
   27217 //   vassert(0);
   27218 //}
   27219 
   27220 
   27221 /*--------------------------------------------------------------------*/
   27222 /*--- end                                       guest_amd64_toIR.c ---*/
   27223 /*--------------------------------------------------------------------*/
   27224