Home | History | Annotate | Download | only in priv
      1 
      2 /*--------------------------------------------------------------------*/
      3 /*--- begin                                     guest_amd64_toIR.c ---*/
      4 /*--------------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Valgrind, a dynamic binary instrumentation
      8    framework.
      9 
     10    Copyright (C) 2004-2017 OpenWorks LLP
     11       info (at) open-works.net
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     26    02110-1301, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 
     30    Neither the names of the U.S. Department of Energy nor the
     31    University of California nor the names of its contributors may be
     32    used to endorse or promote products derived from this software
     33    without prior written permission.
     34 */
     35 
     36 /* Translates AMD64 code to IR. */
     37 
     38 /* TODO:
     39 
     40    All Puts to CC_OP/CC_DEP1/CC_DEP2/CC_NDEP should really be checked
     41    to ensure a 64-bit value is being written.
     42 
     43    x87 FP Limitations:
     44 
     45    * all arithmetic done at 64 bits
     46 
     47    * no FP exceptions, except for handling stack over/underflow
     48 
     49    * FP rounding mode observed only for float->int conversions and
     50      int->float conversions which could lose accuracy, and for
     51      float-to-float rounding.  For all other operations,
     52      round-to-nearest is used, regardless.
     53 
     54    * some of the FCOM cases could do with testing -- not convinced
     55      that the args are the right way round.
     56 
     57    * FSAVE does not re-initialise the FPU; it should do
     58 
     59    * FINIT not only initialises the FPU environment, it also zeroes
     60      all the FP registers.  It should leave the registers unchanged.
     61 
     62     SAHF should cause eflags[1] == 1, and in fact it produces 0.  As
     63     per Intel docs this bit has no meaning anyway.  Since PUSHF is the
     64     only way to observe eflags[1], a proper fix would be to make that
     65     bit be set by PUSHF.
     66 
     67     This module uses global variables and so is not MT-safe (if that
     68     should ever become relevant).
     69 */
     70 
     71 /* Notes re address size overrides (0x67).
     72 
     73    According to the AMD documentation (24594 Rev 3.09, Sept 2003,
     74    "AMD64 Architecture Programmer's Manual Volume 3: General-Purpose
     75    and System Instructions"), Section 1.2.3 ("Address-Size Override
     76    Prefix"):
     77 
     78    0x67 applies to all explicit memory references, causing the top
     79    32 bits of the effective address to become zero.
     80 
     81    0x67 has no effect on stack references (push/pop); these always
     82    use a 64-bit address.
     83 
     84    0x67 changes the interpretation of instructions which implicitly
     85    reference RCX/RSI/RDI, so that in fact ECX/ESI/EDI are used
     86    instead.  These are:
     87 
     88       cmp{s,sb,sw,sd,sq}
     89       in{s,sb,sw,sd}
     90       jcxz, jecxz, jrcxz
     91       lod{s,sb,sw,sd,sq}
     92       loop{,e,bz,be,z}
     93       mov{s,sb,sw,sd,sq}
     94       out{s,sb,sw,sd}
     95       rep{,e,ne,nz}
     96       sca{s,sb,sw,sd,sq}
     97       sto{s,sb,sw,sd,sq}
     98       xlat{,b} */
     99 
    100 /* "Special" instructions.
    101 
    102    This instruction decoder can decode three special instructions
    103    which mean nothing natively (are no-ops as far as regs/mem are
    104    concerned) but have meaning for supporting Valgrind.  A special
    105    instruction is flagged by the 16-byte preamble 48C1C703 48C1C70D
    106    48C1C73D 48C1C733 (in the standard interpretation, that means: rolq
    107    $3, %rdi; rolq $13, %rdi; rolq $61, %rdi; rolq $51, %rdi).
     108    Following that, one of the following 4 are allowed (standard
    109    interpretation in parentheses):
    110 
    111       4887DB (xchgq %rbx,%rbx)   %RDX = client_request ( %RAX )
    112       4887C9 (xchgq %rcx,%rcx)   %RAX = guest_NRADDR
    113       4887D2 (xchgq %rdx,%rdx)   call-noredir *%RAX
     114       4887F6 (xchgq %rsi,%rsi)   IR injection
    115 
    116    Any other bytes following the 16-byte preamble are illegal and
    117    constitute a failure in instruction decoding.  This all assumes
    118    that the preamble will never occur except in specific code
    119    fragments designed for Valgrind to catch.
    120 
    121    No prefixes may precede a "Special" instruction.
    122 */
    123 
    124 /* casLE (implementation of lock-prefixed insns) and rep-prefixed
    125    insns: the side-exit back to the start of the insn is done with
    126    Ijk_Boring.  This is quite wrong, it should be done with
    127    Ijk_NoRedir, since otherwise the side exit, which is intended to
    128    restart the instruction for whatever reason, could go somewhere
    129    entirely else.  Doing it right (with Ijk_NoRedir jumps) would make
    130    no-redir jumps performance critical, at least for rep-prefixed
    131    instructions, since all iterations thereof would involve such a
    132    jump.  It's not such a big deal with casLE since the side exit is
    133    only taken if the CAS fails, that is, the location is contended,
    134    which is relatively unlikely.
    135 
    136    Note also, the test for CAS success vs failure is done using
    137    Iop_CasCmp{EQ,NE}{8,16,32,64} rather than the ordinary
    138    Iop_Cmp{EQ,NE} equivalents.  This is so as to tell Memcheck that it
    139    shouldn't definedness-check these comparisons.  See
    140    COMMENT_ON_CasCmpEQ in memcheck/mc_translate.c for
    141    background/rationale.
    142 */
    143 
    144 /* LOCK prefixed instructions.  These are translated using IR-level
    145    CAS statements (IRCAS) and are believed to preserve atomicity, even
    146    from the point of view of some other process racing against a
    147    simulated one (presumably they communicate via a shared memory
    148    segment).
    149 
    150    Handlers which are aware of LOCK prefixes are:
    151       dis_op2_G_E      (add, or, adc, sbb, and, sub, xor)
    152       dis_cmpxchg_G_E  (cmpxchg)
    153       dis_Grp1         (add, or, adc, sbb, and, sub, xor)
    154       dis_Grp3         (not, neg)
    155       dis_Grp4         (inc, dec)
    156       dis_Grp5         (inc, dec)
    157       dis_Grp8_Imm     (bts, btc, btr)
    158       dis_bt_G_E       (bts, btc, btr)
    159       dis_xadd_G_E     (xadd)
    160 */
    161 
    162 
    163 #include "libvex_basictypes.h"
    164 #include "libvex_ir.h"
    165 #include "libvex.h"
    166 #include "libvex_guest_amd64.h"
    167 
    168 #include "main_util.h"
    169 #include "main_globals.h"
    170 #include "guest_generic_bb_to_IR.h"
    171 #include "guest_generic_x87.h"
    172 #include "guest_amd64_defs.h"
    173 
    174 
    175 /*------------------------------------------------------------*/
    176 /*--- Globals                                              ---*/
    177 /*------------------------------------------------------------*/
    178 
    179 /* These are set at the start of the translation of an insn, right
    180    down in disInstr_AMD64, so that we don't have to pass them around
    181    endlessly.  They are all constant during the translation of any
    182    given insn. */
    183 
    184 /* These are set at the start of the translation of a BB, so
    185    that we don't have to pass them around endlessly. */
    186 
    187 /* We need to know this to do sub-register accesses correctly. */
    188 static VexEndness host_endness;
    189 
    190 /* Pointer to the guest code area (points to start of BB, not to the
    191    insn being processed). */
    192 static const UChar* guest_code;
    193 
    194 /* The guest address corresponding to guest_code[0]. */
    195 static Addr64 guest_RIP_bbstart;
    196 
    197 /* The guest address for the instruction currently being
    198    translated. */
    199 static Addr64 guest_RIP_curr_instr;
    200 
    201 /* The IRSB* into which we're generating code. */
    202 static IRSB* irsb;
    203 
    204 /* For ensuring that %rip-relative addressing is done right.  A read
    205    of %rip generates the address of the next instruction.  It may be
    206    that we don't conveniently know that inside disAMode().  For sanity
    207    checking, if the next insn %rip is needed, we make a guess at what
    208    it is, record that guess here, and set the accompanying Bool to
    209    indicate that -- after this insn's decode is finished -- that guess
    210    needs to be checked.  */
    211 
    212 /* At the start of each insn decode, is set to (0, False).
    213    After the decode, if _mustcheck is now True, _assumed is
    214    checked. */
    215 
    216 static Addr64 guest_RIP_next_assumed;
    217 static Bool   guest_RIP_next_mustcheck;
    218 
    219 
    220 /*------------------------------------------------------------*/
    221 /*--- Helpers for constructing IR.                         ---*/
    222 /*------------------------------------------------------------*/
    223 
/* Generate a new temporary of the given type, registered in the
   type environment of the IRSB currently under construction. */
static IRTemp newTemp ( IRType ty )
{
   vassert(isPlausibleIRType(ty));
   return newIRTemp( irsb->tyenv, ty );
}
    230 
/* Append a statement to the statement list of the IRSB currently
   under construction ("irsb"). */
static void stmt ( IRStmt* st )
{
   addStmtToIRSB( irsb, st );
}
    236 
/* Generate a statement "dst := e", appending it to "irsb". */
static void assign ( IRTemp dst, IRExpr* e )
{
   stmt( IRStmt_WrTmp(dst, e) );
}
    242 
/* Build a unary IR expression applying 'op' to 'a'. */
static IRExpr* unop ( IROp op, IRExpr* a )
{
   return IRExpr_Unop(op, a);
}

/* Build a binary IR expression applying 'op' to 'a1' and 'a2'. */
static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
{
   return IRExpr_Binop(op, a1, a2);
}

/* Build a ternary IR expression applying 'op' to 'a1', 'a2', 'a3'. */
static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
{
   return IRExpr_Triop(op, a1, a2, a3);
}

/* Build an expression which reads the temporary 'tmp'. */
static IRExpr* mkexpr ( IRTemp tmp )
{
   return IRExpr_RdTmp(tmp);
}
    262 
/* Make an 8-bit constant expression; 'i' must fit in 8 bits. */
static IRExpr* mkU8 ( ULong i )
{
   vassert(i < 256);
   return IRExpr_Const(IRConst_U8( (UChar)i ));
}

/* Make a 16-bit constant expression; 'i' must fit in 16 bits. */
static IRExpr* mkU16 ( ULong i )
{
   vassert(i < 0x10000ULL);
   return IRExpr_Const(IRConst_U16( (UShort)i ));
}

/* Make a 32-bit constant expression; 'i' must fit in 32 bits. */
static IRExpr* mkU32 ( ULong i )
{
   vassert(i < 0x100000000ULL);
   return IRExpr_Const(IRConst_U32( (UInt)i ));
}

/* Make a 64-bit constant expression. */
static IRExpr* mkU64 ( ULong i )
{
   return IRExpr_Const(IRConst_U64(i));
}

/* Make a constant of the given integer type (I8/I16/I32/I64);
   panics on any other type. */
static IRExpr* mkU ( IRType ty, ULong i )
{
   switch (ty) {
      case Ity_I8:  return mkU8(i);
      case Ity_I16: return mkU16(i);
      case Ity_I32: return mkU32(i);
      case Ity_I64: return mkU64(i);
      default: vpanic("mkU(amd64)");
   }
}
    296 
/* Generate a little-endian store of 'data' at address 'addr'. */
static void storeLE ( IRExpr* addr, IRExpr* data )
{
   stmt( IRStmt_Store(Iend_LE, addr, data) );
}

/* Generate a little-endian load of type 'ty' from address 'addr'. */
static IRExpr* loadLE ( IRType ty, IRExpr* addr )
{
   return IRExpr_Load(Iend_LE, ty, addr);
}
    306 
    307 static IROp mkSizedOp ( IRType ty, IROp op8 )
    308 {
    309    vassert(op8 == Iop_Add8 || op8 == Iop_Sub8
    310            || op8 == Iop_Mul8
    311            || op8 == Iop_Or8 || op8 == Iop_And8 || op8 == Iop_Xor8
    312            || op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8
    313            || op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8
    314            || op8 == Iop_CasCmpNE8
    315            || op8 == Iop_Not8 );
    316    switch (ty) {
    317       case Ity_I8:  return 0 +op8;
    318       case Ity_I16: return 1 +op8;
    319       case Ity_I32: return 2 +op8;
    320       case Ity_I64: return 3 +op8;
    321       default: vpanic("mkSizedOp(amd64)");
    322    }
    323 }
    324 
    325 static
    326 IRExpr* doScalarWidening ( Int szSmall, Int szBig, Bool signd, IRExpr* src )
    327 {
    328    if (szSmall == 1 && szBig == 4) {
    329       return unop(signd ? Iop_8Sto32 : Iop_8Uto32, src);
    330    }
    331    if (szSmall == 1 && szBig == 2) {
    332       return unop(signd ? Iop_8Sto16 : Iop_8Uto16, src);
    333    }
    334    if (szSmall == 2 && szBig == 4) {
    335       return unop(signd ? Iop_16Sto32 : Iop_16Uto32, src);
    336    }
    337    if (szSmall == 1 && szBig == 8 && !signd) {
    338       return unop(Iop_8Uto64, src);
    339    }
    340    if (szSmall == 1 && szBig == 8 && signd) {
    341       return unop(Iop_8Sto64, src);
    342    }
    343    if (szSmall == 2 && szBig == 8 && !signd) {
    344       return unop(Iop_16Uto64, src);
    345    }
    346    if (szSmall == 2 && szBig == 8 && signd) {
    347       return unop(Iop_16Sto64, src);
    348    }
    349    vpanic("doScalarWidening(amd64)");
    350 }
    351 
/* Write 'value' to guest-state offset 'gstOffB', but only when
   'guard' is true at run time; otherwise the existing value is
   re-written unchanged (expressed via an ITE over a Get). */
static
void putGuarded ( Int gstOffB, IRExpr* guard, IRExpr* value )
{
   IRType ty = typeOfIRExpr(irsb->tyenv, value);
   stmt( IRStmt_Put(gstOffB,
                    IRExpr_ITE(guard, value, IRExpr_Get(gstOffB, ty))) );
}
    359 
    360 
    361 /*------------------------------------------------------------*/
    362 /*--- Debugging output                                     ---*/
    363 /*------------------------------------------------------------*/
    364 
/* Bomb out if we can't handle something: print a note then panic
   with 'str'.  Never returns. */
__attribute__ ((noreturn))
static void unimplemented ( const HChar* str )
{
   vex_printf("amd64toIR: unimplemented feature\n");
   vpanic(str);
}
    372 
/* Print instruction-disassembly trace output, but only when
   front-end tracing (VEX_TRACE_FE) is enabled. */
#define DIP(format, args...)           \
   if (vex_traceflags & VEX_TRACE_FE)  \
      vex_printf(format, ## args)

/* Format into 'buf', but only when front-end tracing is enabled. */
#define DIS(buf, format, args...)      \
   if (vex_traceflags & VEX_TRACE_FE)  \
      vex_sprintf(buf, format, ## args)
    380 
    381 
    382 /*------------------------------------------------------------*/
    383 /*--- Offsets of various parts of the amd64 guest state.   ---*/
    384 /*------------------------------------------------------------*/
    385 
    386 #define OFFB_RAX       offsetof(VexGuestAMD64State,guest_RAX)
    387 #define OFFB_RBX       offsetof(VexGuestAMD64State,guest_RBX)
    388 #define OFFB_RCX       offsetof(VexGuestAMD64State,guest_RCX)
    389 #define OFFB_RDX       offsetof(VexGuestAMD64State,guest_RDX)
    390 #define OFFB_RSP       offsetof(VexGuestAMD64State,guest_RSP)
    391 #define OFFB_RBP       offsetof(VexGuestAMD64State,guest_RBP)
    392 #define OFFB_RSI       offsetof(VexGuestAMD64State,guest_RSI)
    393 #define OFFB_RDI       offsetof(VexGuestAMD64State,guest_RDI)
    394 #define OFFB_R8        offsetof(VexGuestAMD64State,guest_R8)
    395 #define OFFB_R9        offsetof(VexGuestAMD64State,guest_R9)
    396 #define OFFB_R10       offsetof(VexGuestAMD64State,guest_R10)
    397 #define OFFB_R11       offsetof(VexGuestAMD64State,guest_R11)
    398 #define OFFB_R12       offsetof(VexGuestAMD64State,guest_R12)
    399 #define OFFB_R13       offsetof(VexGuestAMD64State,guest_R13)
    400 #define OFFB_R14       offsetof(VexGuestAMD64State,guest_R14)
    401 #define OFFB_R15       offsetof(VexGuestAMD64State,guest_R15)
    402 
    403 #define OFFB_RIP       offsetof(VexGuestAMD64State,guest_RIP)
    404 
    405 #define OFFB_FS_CONST  offsetof(VexGuestAMD64State,guest_FS_CONST)
    406 #define OFFB_GS_CONST  offsetof(VexGuestAMD64State,guest_GS_CONST)
    407 
    408 #define OFFB_CC_OP     offsetof(VexGuestAMD64State,guest_CC_OP)
    409 #define OFFB_CC_DEP1   offsetof(VexGuestAMD64State,guest_CC_DEP1)
    410 #define OFFB_CC_DEP2   offsetof(VexGuestAMD64State,guest_CC_DEP2)
    411 #define OFFB_CC_NDEP   offsetof(VexGuestAMD64State,guest_CC_NDEP)
    412 
    413 #define OFFB_FPREGS    offsetof(VexGuestAMD64State,guest_FPREG[0])
    414 #define OFFB_FPTAGS    offsetof(VexGuestAMD64State,guest_FPTAG[0])
    415 #define OFFB_DFLAG     offsetof(VexGuestAMD64State,guest_DFLAG)
    416 #define OFFB_ACFLAG    offsetof(VexGuestAMD64State,guest_ACFLAG)
    417 #define OFFB_IDFLAG    offsetof(VexGuestAMD64State,guest_IDFLAG)
    418 #define OFFB_FTOP      offsetof(VexGuestAMD64State,guest_FTOP)
    419 #define OFFB_FC3210    offsetof(VexGuestAMD64State,guest_FC3210)
    420 #define OFFB_FPROUND   offsetof(VexGuestAMD64State,guest_FPROUND)
    421 
    422 #define OFFB_SSEROUND  offsetof(VexGuestAMD64State,guest_SSEROUND)
    423 #define OFFB_YMM0      offsetof(VexGuestAMD64State,guest_YMM0)
    424 #define OFFB_YMM1      offsetof(VexGuestAMD64State,guest_YMM1)
    425 #define OFFB_YMM2      offsetof(VexGuestAMD64State,guest_YMM2)
    426 #define OFFB_YMM3      offsetof(VexGuestAMD64State,guest_YMM3)
    427 #define OFFB_YMM4      offsetof(VexGuestAMD64State,guest_YMM4)
    428 #define OFFB_YMM5      offsetof(VexGuestAMD64State,guest_YMM5)
    429 #define OFFB_YMM6      offsetof(VexGuestAMD64State,guest_YMM6)
    430 #define OFFB_YMM7      offsetof(VexGuestAMD64State,guest_YMM7)
    431 #define OFFB_YMM8      offsetof(VexGuestAMD64State,guest_YMM8)
    432 #define OFFB_YMM9      offsetof(VexGuestAMD64State,guest_YMM9)
    433 #define OFFB_YMM10     offsetof(VexGuestAMD64State,guest_YMM10)
    434 #define OFFB_YMM11     offsetof(VexGuestAMD64State,guest_YMM11)
    435 #define OFFB_YMM12     offsetof(VexGuestAMD64State,guest_YMM12)
    436 #define OFFB_YMM13     offsetof(VexGuestAMD64State,guest_YMM13)
    437 #define OFFB_YMM14     offsetof(VexGuestAMD64State,guest_YMM14)
    438 #define OFFB_YMM15     offsetof(VexGuestAMD64State,guest_YMM15)
    439 #define OFFB_YMM16     offsetof(VexGuestAMD64State,guest_YMM16)
    440 
    441 #define OFFB_EMNOTE    offsetof(VexGuestAMD64State,guest_EMNOTE)
    442 #define OFFB_CMSTART   offsetof(VexGuestAMD64State,guest_CMSTART)
    443 #define OFFB_CMLEN     offsetof(VexGuestAMD64State,guest_CMLEN)
    444 
    445 #define OFFB_NRADDR    offsetof(VexGuestAMD64State,guest_NRADDR)
    446 
    447 
    448 /*------------------------------------------------------------*/
    449 /*--- Helper bits and pieces for deconstructing the        ---*/
    450 /*--- amd64 insn stream.                                   ---*/
    451 /*------------------------------------------------------------*/
    452 
    453 /* This is the AMD64 register encoding -- integer regs. */
    454 #define R_RAX 0
    455 #define R_RCX 1
    456 #define R_RDX 2
    457 #define R_RBX 3
    458 #define R_RSP 4
    459 #define R_RBP 5
    460 #define R_RSI 6
    461 #define R_RDI 7
    462 #define R_R8  8
    463 #define R_R9  9
    464 #define R_R10 10
    465 #define R_R11 11
    466 #define R_R12 12
    467 #define R_R13 13
    468 #define R_R14 14
    469 #define R_R15 15
    470 
    471 /* This is the Intel register encoding -- segment regs. */
    472 #define R_ES 0
    473 #define R_CS 1
    474 #define R_SS 2
    475 #define R_DS 3
    476 #define R_FS 4
    477 #define R_GS 5
    478 
    479 
    480 /* Various simple conversions */
    481 
/* Sign-extend an 8-bit value to 64 bits, by shifting it up into the
   top byte and arithmetically shifting it back down. */
static ULong extend_s_8to64 ( UChar x )
{
   return (ULong)((Long)(((ULong)x) << 56) >> 56);
}

/* Sign-extend a 16-bit value to 64 bits, same technique. */
static ULong extend_s_16to64 ( UShort x )
{
   return (ULong)((Long)(((ULong)x) << 48) >> 48);
}

/* Sign-extend a 32-bit value to 64 bits, same technique. */
static ULong extend_s_32to64 ( UInt x )
{
   return (ULong)((Long)(((ULong)x) << 32) >> 32);
}
    496 
    497 /* Figure out whether the mod and rm parts of a modRM byte refer to a
    498    register or memory.  If so, the byte will have the form 11XXXYYY,
    499    where YYY is the register number. */
    500 inline
    501 static Bool epartIsReg ( UChar mod_reg_rm )
    502 {
    503    return toBool(0xC0 == (mod_reg_rm & 0xC0));
    504 }
    505 
/* Extract the 'g' field from a modRM byte.  This only produces 3
   bits, which is not a complete register number (the fourth bit comes
   from REX.R).  You should avoid this function if at all possible. */
inline
static Int gregLO3ofRM ( UChar mod_reg_rm )
{
   return (Int)( (mod_reg_rm >> 3) & 7 );
}

/* Ditto the 'e' field of a modRM byte: low 3 bits only (the fourth
   bit comes from REX.B, when the e-part names a register). */
inline
static Int eregLO3ofRM ( UChar mod_reg_rm )
{
   return (Int)(mod_reg_rm & 0x7);
}
    521 
    522 /* Get a 8/16/32-bit unsigned value out of the insn stream. */
    523 
/* Fetch the single byte at guest_code[delta]. */
static inline UChar getUChar ( Long delta )
{
   UChar v = guest_code[delta+0];
   return v;
}
    529 
    530 static UInt getUDisp16 ( Long delta )
    531 {
    532    UInt v = guest_code[delta+1]; v <<= 8;
    533    v |= guest_code[delta+0];
    534    return v & 0xFFFF;
    535 }
    536 
    537 //.. static UInt getUDisp ( Int size, Long delta )
    538 //.. {
    539 //..    switch (size) {
    540 //..       case 4: return getUDisp32(delta);
    541 //..       case 2: return getUDisp16(delta);
    542 //..       case 1: return getUChar(delta);
    543 //..       default: vpanic("getUDisp(x86)");
    544 //..    }
    545 //..    return 0; /*notreached*/
    546 //.. }
    547 
    548 
/* Get a byte value out of the insn stream and sign-extend to 64
   bits. */
static Long getSDisp8 ( Long delta )
{
   return extend_s_8to64( guest_code[delta] );
}
    555 
    556 /* Get a 16-bit value out of the insn stream and sign-extend to 64
    557    bits. */
    558 static Long getSDisp16 ( Long delta )
    559 {
    560    UInt v = guest_code[delta+1]; v <<= 8;
    561    v |= guest_code[delta+0];
    562    return extend_s_16to64( (UShort)v );
    563 }
    564 
    565 /* Get a 32-bit value out of the insn stream and sign-extend to 64
    566    bits. */
    567 static Long getSDisp32 ( Long delta )
    568 {
    569    UInt v = guest_code[delta+3]; v <<= 8;
    570    v |= guest_code[delta+2]; v <<= 8;
    571    v |= guest_code[delta+1]; v <<= 8;
    572    v |= guest_code[delta+0];
    573    return extend_s_32to64( v );
    574 }
    575 
    576 /* Get a 64-bit value out of the insn stream. */
    577 static Long getDisp64 ( Long delta )
    578 {
    579    ULong v = 0;
    580    v |= guest_code[delta+7]; v <<= 8;
    581    v |= guest_code[delta+6]; v <<= 8;
    582    v |= guest_code[delta+5]; v <<= 8;
    583    v |= guest_code[delta+4]; v <<= 8;
    584    v |= guest_code[delta+3]; v <<= 8;
    585    v |= guest_code[delta+2]; v <<= 8;
    586    v |= guest_code[delta+1]; v <<= 8;
    587    v |= guest_code[delta+0];
    588    return v;
    589 }
    590 
    591 /* Note: because AMD64 doesn't allow 64-bit literals, it is an error
    592    if this is called with size==8.  Should not happen. */
    593 static Long getSDisp ( Int size, Long delta )
    594 {
    595    switch (size) {
    596       case 4: return getSDisp32(delta);
    597       case 2: return getSDisp16(delta);
    598       case 1: return getSDisp8(delta);
    599       default: vpanic("getSDisp(amd64)");
    600   }
    601 }
    602 
    603 static ULong mkSizeMask ( Int sz )
    604 {
    605    switch (sz) {
    606       case 1: return 0x00000000000000FFULL;
    607       case 2: return 0x000000000000FFFFULL;
    608       case 4: return 0x00000000FFFFFFFFULL;
    609       case 8: return 0xFFFFFFFFFFFFFFFFULL;
    610       default: vpanic("mkSzMask(amd64)");
    611    }
    612 }
    613 
    614 static Int imin ( Int a, Int b )
    615 {
    616    return (a < b) ? a : b;
    617 }
    618 
/* Convert an operand size in bytes (1/2/4/8) to the corresponding
   integer IR type; panics (after printing the bad size) otherwise. */
static IRType szToITy ( Int n )
{
   switch (n) {
      case 1: return Ity_I8;
      case 2: return Ity_I16;
      case 4: return Ity_I32;
      case 8: return Ity_I64;
      default: vex_printf("\nszToITy(%d)\n", n);
               vpanic("szToITy(amd64)");
   }
}
    630 
    631 
    632 /*------------------------------------------------------------*/
    633 /*--- For dealing with prefixes.                           ---*/
    634 /*------------------------------------------------------------*/
    635 
    636 /* The idea is to pass around an int holding a bitmask summarising
    637    info from the prefixes seen on the current instruction, including
    638    info from the REX byte.  This info is used in various places, but
    639    most especially when making sense of register fields in
    640    instructions.
    641 
    642    The top 8 bits of the prefix are 0x55, just as a hacky way to
    643    ensure it really is a valid prefix.
    644 
    645    Things you can safely assume about a well-formed prefix:
    646    * at most one segment-override bit (CS,DS,ES,FS,GS,SS) is set.
    647    * if REX is not present then REXW,REXR,REXX,REXB will read
    648      as zero.
    649    * F2 and F3 will not both be 1.
    650 */
    651 
    652 typedef UInt  Prefix;
    653 
    654 #define PFX_ASO    (1<<0)    /* address-size override present (0x67) */
    655 #define PFX_66     (1<<1)    /* operand-size override-to-16 present (0x66) */
    656 #define PFX_REX    (1<<2)    /* REX byte present (0x40 to 0x4F) */
    657 #define PFX_REXW   (1<<3)    /* REX W bit, if REX present, else 0 */
    658 #define PFX_REXR   (1<<4)    /* REX R bit, if REX present, else 0 */
    659 #define PFX_REXX   (1<<5)    /* REX X bit, if REX present, else 0 */
    660 #define PFX_REXB   (1<<6)    /* REX B bit, if REX present, else 0 */
    661 #define PFX_LOCK   (1<<7)    /* bus LOCK prefix present (0xF0) */
     662 #define PFX_F2     (1<<8)    /* REPNE/REPNZ prefix present (0xF2) */
     663 #define PFX_F3     (1<<9)    /* REP/REPE/REPZ prefix present (0xF3) */
    664 #define PFX_CS     (1<<10)   /* CS segment prefix present (0x2E) */
    665 #define PFX_DS     (1<<11)   /* DS segment prefix present (0x3E) */
    666 #define PFX_ES     (1<<12)   /* ES segment prefix present (0x26) */
    667 #define PFX_FS     (1<<13)   /* FS segment prefix present (0x64) */
    668 #define PFX_GS     (1<<14)   /* GS segment prefix present (0x65) */
    669 #define PFX_SS     (1<<15)   /* SS segment prefix present (0x36) */
    670 #define PFX_VEX    (1<<16)   /* VEX prefix present (0xC4 or 0xC5) */
    671 #define PFX_VEXL   (1<<17)   /* VEX L bit, if VEX present, else 0 */
    672 /* The extra register field VEX.vvvv is encoded (after not-ing it) as
    673    PFX_VEXnV3 .. PFX_VEXnV0, so these must occupy adjacent bit
    674    positions. */
    675 #define PFX_VEXnV0 (1<<18)   /* ~VEX vvvv[0], if VEX present, else 0 */
    676 #define PFX_VEXnV1 (1<<19)   /* ~VEX vvvv[1], if VEX present, else 0 */
    677 #define PFX_VEXnV2 (1<<20)   /* ~VEX vvvv[2], if VEX present, else 0 */
    678 #define PFX_VEXnV3 (1<<21)   /* ~VEX vvvv[3], if VEX present, else 0 */
    679 
    680 
    681 #define PFX_EMPTY 0x55000000
    682 
/* True iff 'pfx' carries the 0x55 marker in its top byte, and hence
   is plausibly a well-formed prefix-summary word. */
static Bool IS_VALID_PFX ( Prefix pfx ) {
   return toBool((pfx & 0xFF000000) == PFX_EMPTY);
}
    686 
/* True iff a REX prefix byte (0x40..0x4F) was present. */
static Bool haveREX ( Prefix pfx ) {
   return toBool(pfx & PFX_REX);
}

/* Extract the individual REX bits, each as 0 or 1.  Per the Prefix
   encoding above, all read as 0 when no REX prefix is present. */
static Int getRexW ( Prefix pfx ) {
   return (pfx & PFX_REXW) ? 1 : 0;
}
static Int getRexR ( Prefix pfx ) {
   return (pfx & PFX_REXR) ? 1 : 0;
}
static Int getRexX ( Prefix pfx ) {
   return (pfx & PFX_REXX) ? 1 : 0;
}
static Int getRexB ( Prefix pfx ) {
   return (pfx & PFX_REXB) ? 1 : 0;
}
    703 
/* Check a prefix doesn't have F2 or F3 set in it, since usually that
   completely changes what instruction it really is. */
static Bool haveF2orF3 ( Prefix pfx ) {
   return toBool((pfx & (PFX_F2|PFX_F3)) > 0);
}
/* True iff both an F2 and an F3 prefix were seen. */
static Bool haveF2andF3 ( Prefix pfx ) {
   return toBool((pfx & (PFX_F2|PFX_F3)) == (PFX_F2|PFX_F3));
}
/* True iff an F2 prefix byte was seen. */
static Bool haveF2 ( Prefix pfx ) {
   return toBool((pfx & PFX_F2) > 0);
}
/* True iff an F3 prefix byte was seen. */
static Bool haveF3 ( Prefix pfx ) {
   return toBool((pfx & PFX_F3) > 0);
}

/* True iff an operand-size override (0x66) prefix was seen. */
static Bool have66 ( Prefix pfx ) {
   return toBool((pfx & PFX_66) > 0);
}
/* True iff an address-size override (0x67) prefix was seen. */
static Bool haveASO ( Prefix pfx ) {
   return toBool((pfx & PFX_ASO) > 0);
}
/* True iff a LOCK (0xF0) prefix was seen. */
static Bool haveLOCK ( Prefix pfx ) {
   return toBool((pfx & PFX_LOCK) > 0);
}
    728 
    729 /* Return True iff pfx has 66 set and F2 and F3 clear */
    730 static Bool have66noF2noF3 ( Prefix pfx )
    731 {
    732   return
    733      toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_66);
    734 }
    735 
    736 /* Return True iff pfx has F2 set and 66 and F3 clear */
    737 static Bool haveF2no66noF3 ( Prefix pfx )
    738 {
    739   return
    740      toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_F2);
    741 }
    742 
    743 /* Return True iff pfx has F3 set and 66 and F2 clear */
    744 static Bool haveF3no66noF2 ( Prefix pfx )
    745 {
    746   return
    747      toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_F3);
    748 }
    749 
    750 /* Return True iff pfx has F3 set and F2 clear */
    751 static Bool haveF3noF2 ( Prefix pfx )
    752 {
    753   return
    754      toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F3);
    755 }
    756 
    757 /* Return True iff pfx has F2 set and F3 clear */
    758 static Bool haveF2noF3 ( Prefix pfx )
    759 {
    760   return
    761      toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F2);
    762 }
    763 
    764 /* Return True iff pfx has 66, F2 and F3 clear */
    765 static Bool haveNo66noF2noF3 ( Prefix pfx )
    766 {
    767   return
    768      toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == 0);
    769 }
    770 
    771 /* Return True iff pfx has any of 66, F2 and F3 set */
    772 static Bool have66orF2orF3 ( Prefix pfx )
    773 {
    774   return toBool( ! haveNo66noF2noF3(pfx) );
    775 }
    776 
    777 /* Return True iff pfx has 66 or F3 set */
    778 static Bool have66orF3 ( Prefix pfx )
    779 {
    780    return toBool((pfx & (PFX_66|PFX_F3)) > 0);
    781 }
    782 
/* Clear all the segment-override bits (CS/DS/ES/FS/GS/SS) in a
   prefix, leaving all other bits unchanged. */
static Prefix clearSegBits ( Prefix p )
{
   return
      p & ~(PFX_CS | PFX_DS | PFX_ES | PFX_FS | PFX_GS | PFX_SS);
}
    789 
/* Get the (inverted, hence back to "normal") VEX.vvvv field, as a
   4-bit value.  Relies on PFX_VEXnV0..PFX_VEXnV3 occupying adjacent
   bit positions, as noted where they are defined. */
static UInt getVexNvvvv ( Prefix pfx ) {
   UInt r = (UInt)pfx;
   r /= (UInt)PFX_VEXnV0; /* pray this turns into a shift */
   return r & 0xF;
}

/* True iff a VEX prefix (0xC4 or 0xC5) was present. */
static Bool haveVEX ( Prefix pfx ) {
   return toBool(pfx & PFX_VEX);
}

/* Extract the VEX.L bit, as 0 or 1 (0 when no VEX prefix). */
static Int getVexL ( Prefix pfx ) {
   return (pfx & PFX_VEXL) ? 1 : 0;
}
    804 
    805 
    806 /*------------------------------------------------------------*/
    807 /*--- For dealing with escapes                             ---*/
    808 /*------------------------------------------------------------*/
    809 
    810 
    811 /* Escapes come after the prefixes, but before the primary opcode
    812    byte.  They escape the primary opcode byte into a bigger space.
    813    The 0xF0000000 isn't significant, except so as to make it not
    814    overlap valid Prefix values, for sanity checking.
    815 */
    816 
typedef
   enum {
      ESC_NONE=0xF0000000, // none (0xF0000000 keeps values disjoint from Prefix)
      ESC_0F,              // 0F
      ESC_0F38,            // 0F 38
      ESC_0F3A             // 0F 3A
   }
   Escape;
    825 
    826 
    827 /*------------------------------------------------------------*/
    828 /*--- For dealing with integer registers                   ---*/
    829 /*------------------------------------------------------------*/
    830 
    831 /* This is somewhat complex.  The rules are:
    832 
    833    For 64, 32 and 16 bit register references, the e or g fields in the
    834    modrm bytes supply the low 3 bits of the register number.  The
    835    fourth (most-significant) bit of the register number is supplied by
    836    the REX byte, if it is present; else that bit is taken to be zero.
    837 
    838    The REX.R bit supplies the high bit corresponding to the g register
    839    field, and the REX.B bit supplies the high bit corresponding to the
    840    e register field (when the mod part of modrm indicates that modrm's
    841    e component refers to a register and not to memory).
    842 
    843    The REX.X bit supplies a high register bit for certain registers
    844    in SIB address modes, and is generally rarely used.
    845 
    846    For 8 bit register references, the presence of the REX byte itself
    847    has significance.  If there is no REX present, then the 3-bit
    848    number extracted from the modrm e or g field is treated as an index
    849    into the sequence %al %cl %dl %bl %ah %ch %dh %bh -- that is, the
    850    old x86 encoding scheme.
    851 
    852    But if there is a REX present, the register reference is
    853    interpreted in the same way as for 64/32/16-bit references: a high
    854    bit is extracted from REX, giving a 4-bit number, and the denoted
    855    register is the lowest 8 bits of the 16 integer registers denoted
    856    by the number.  In particular, values 3 through 7 of this sequence
    857    do not refer to %ah %ch %dh %bh but instead to the lowest 8 bits of
    858    %rsp %rbp %rsi %rdi.
    859 
    860    The REX.W bit has no bearing at all on register numbers.  Instead
    861    its presence indicates that the operand size is to be overridden
    862    from its default value (32 bits) to 64 bits instead.  This is in
    863    the same fashion that an 0x66 prefix indicates the operand size is
    864    to be overridden from 32 bits down to 16 bits.  When both REX.W and
    865    0x66 are present there is a conflict, and REX.W takes precedence.
    866 
    867    Rather than try to handle this complexity using a single huge
    868    function, several smaller ones are provided.  The aim is to make it
    869    as difficult as possible to screw up register decoding in a subtle
    870    and hard-to-track-down way.
    871 
    872    Because these routines fish around in the host's memory (that is,
    873    in the guest state area) for sub-parts of guest registers, their
    874    correctness depends on the host's endianness.  So far these
    875    routines only work for little-endian hosts.  Those for which
    876    endianness is important have assertions to ensure sanity.
    877 */
    878 
    879 
    880 /* About the simplest question you can ask: where do the 64-bit
    881    integer registers live (in the guest state) ? */
    882 
static Int integerGuestReg64Offset ( UInt reg )
{
   /* Map a 4-bit register number (R_RAX .. R_R15) to the guest-state
      byte offset of the corresponding 64-bit register. */
   switch (reg) {
      case R_RAX: return OFFB_RAX;
      case R_RCX: return OFFB_RCX;
      case R_RDX: return OFFB_RDX;
      case R_RBX: return OFFB_RBX;
      case R_RSP: return OFFB_RSP;
      case R_RBP: return OFFB_RBP;
      case R_RSI: return OFFB_RSI;
      case R_RDI: return OFFB_RDI;
      case R_R8:  return OFFB_R8;
      case R_R9:  return OFFB_R9;
      case R_R10: return OFFB_R10;
      case R_R11: return OFFB_R11;
      case R_R12: return OFFB_R12;
      case R_R13: return OFFB_R13;
      case R_R14: return OFFB_R14;
      case R_R15: return OFFB_R15;
      /* Any other value indicates decoder breakage, not bad guest code. */
      default: vpanic("integerGuestReg64Offset(amd64)");
   }
}
    905 
    906 
    907 /* Produce the name of an integer register, for printing purposes.
    908    reg is a number in the range 0 .. 15 that has been generated from a
    909    3-bit reg-field number and a REX extension bit.  irregular denotes
    910    the case where sz==1 and no REX byte is present. */
    911 
    912 static
    913 const HChar* nameIReg ( Int sz, UInt reg, Bool irregular )
    914 {
    915    static const HChar* ireg64_names[16]
    916      = { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
    917          "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" };
    918    static const HChar* ireg32_names[16]
    919      = { "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
    920          "%r8d", "%r9d", "%r10d","%r11d","%r12d","%r13d","%r14d","%r15d" };
    921    static const HChar* ireg16_names[16]
    922      = { "%ax",  "%cx",  "%dx",  "%bx",  "%sp",  "%bp",  "%si",  "%di",
    923          "%r8w", "%r9w", "%r10w","%r11w","%r12w","%r13w","%r14w","%r15w" };
    924    static const HChar* ireg8_names[16]
    925      = { "%al",  "%cl",  "%dl",  "%bl",  "%spl", "%bpl", "%sil", "%dil",
    926          "%r8b", "%r9b", "%r10b","%r11b","%r12b","%r13b","%r14b","%r15b" };
    927    static const HChar* ireg8_irregular[8]
    928      = { "%al", "%cl", "%dl", "%bl", "%ah", "%ch", "%dh", "%bh" };
    929 
    930    vassert(reg < 16);
    931    if (sz == 1) {
    932       if (irregular)
    933          vassert(reg < 8);
    934    } else {
    935       vassert(irregular == False);
    936    }
    937 
    938    switch (sz) {
    939       case 8: return ireg64_names[reg];
    940       case 4: return ireg32_names[reg];
    941       case 2: return ireg16_names[reg];
    942       case 1: if (irregular) {
    943                  return ireg8_irregular[reg];
    944               } else {
    945                  return ireg8_names[reg];
    946               }
    947       default: vpanic("nameIReg(amd64)");
    948    }
    949 }
    950 
    951 /* Using the same argument conventions as nameIReg, produce the
    952    guest state offset of an integer register. */
    953 
    954 static
    955 Int offsetIReg ( Int sz, UInt reg, Bool irregular )
    956 {
    957    vassert(reg < 16);
    958    if (sz == 1) {
    959       if (irregular)
    960          vassert(reg < 8);
    961    } else {
    962       vassert(irregular == False);
    963    }
    964 
    965    /* Deal with irregular case -- sz==1 and no REX present */
    966    if (sz == 1 && irregular) {
    967       switch (reg) {
    968          case R_RSP: return 1+ OFFB_RAX;
    969          case R_RBP: return 1+ OFFB_RCX;
    970          case R_RSI: return 1+ OFFB_RDX;
    971          case R_RDI: return 1+ OFFB_RBX;
    972          default:    break; /* use the normal case */
    973       }
    974    }
    975 
    976    /* Normal case */
    977    return integerGuestReg64Offset(reg);
    978 }
    979 
    980 
    981 /* Read the %CL register :: Ity_I8, for shift/rotate operations. */
    982 
static IRExpr* getIRegCL ( void )
{
   /* An 8-bit Get at the base of %rcx yields %cl only on a
      little-endian host, hence the assertion. */
   vassert(host_endness == VexEndnessLE);
   return IRExpr_Get( OFFB_RCX, Ity_I8 );
}
    988 
    989 
    990 /* Write to the %AH register. */
    991 
static void putIRegAH ( IRExpr* e )
{
   /* %ah is the second-lowest byte of %rax on a little-endian host,
      hence the +1 offset. */
   vassert(host_endness == VexEndnessLE);
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I8);
   stmt( IRStmt_Put( OFFB_RAX+1, e ) );
}
    998 
    999 
   1000 /* Read/write various widths of %RAX, as it has various
   1001    special-purpose uses. */
   1002 
   1003 static const HChar* nameIRegRAX ( Int sz )
   1004 {
   1005    switch (sz) {
   1006       case 1: return "%al";
   1007       case 2: return "%ax";
   1008       case 4: return "%eax";
   1009       case 8: return "%rax";
   1010       default: vpanic("nameIRegRAX(amd64)");
   1011    }
   1012 }
   1013 
   1014 static IRExpr* getIRegRAX ( Int sz )
   1015 {
   1016    vassert(host_endness == VexEndnessLE);
   1017    switch (sz) {
   1018       case 1: return IRExpr_Get( OFFB_RAX, Ity_I8 );
   1019       case 2: return IRExpr_Get( OFFB_RAX, Ity_I16 );
   1020       case 4: return unop(Iop_64to32, IRExpr_Get( OFFB_RAX, Ity_I64 ));
   1021       case 8: return IRExpr_Get( OFFB_RAX, Ity_I64 );
   1022       default: vpanic("getIRegRAX(amd64)");
   1023    }
   1024 }
   1025 
   1026 static void putIRegRAX ( Int sz, IRExpr* e )
   1027 {
   1028    IRType ty = typeOfIRExpr(irsb->tyenv, e);
   1029    vassert(host_endness == VexEndnessLE);
   1030    switch (sz) {
   1031       case 8: vassert(ty == Ity_I64);
   1032               stmt( IRStmt_Put( OFFB_RAX, e ));
   1033               break;
   1034       case 4: vassert(ty == Ity_I32);
   1035               stmt( IRStmt_Put( OFFB_RAX, unop(Iop_32Uto64,e) ));
   1036               break;
   1037       case 2: vassert(ty == Ity_I16);
   1038               stmt( IRStmt_Put( OFFB_RAX, e ));
   1039               break;
   1040       case 1: vassert(ty == Ity_I8);
   1041               stmt( IRStmt_Put( OFFB_RAX, e ));
   1042               break;
   1043       default: vpanic("putIRegRAX(amd64)");
   1044    }
   1045 }
   1046 
   1047 
   1048 /* Read/write various widths of %RDX, as it has various
   1049    special-purpose uses. */
   1050 
   1051 static const HChar* nameIRegRDX ( Int sz )
   1052 {
   1053    switch (sz) {
   1054       case 1: return "%dl";
   1055       case 2: return "%dx";
   1056       case 4: return "%edx";
   1057       case 8: return "%rdx";
   1058       default: vpanic("nameIRegRDX(amd64)");
   1059    }
   1060 }
   1061 
   1062 static IRExpr* getIRegRDX ( Int sz )
   1063 {
   1064    vassert(host_endness == VexEndnessLE);
   1065    switch (sz) {
   1066       case 1: return IRExpr_Get( OFFB_RDX, Ity_I8 );
   1067       case 2: return IRExpr_Get( OFFB_RDX, Ity_I16 );
   1068       case 4: return unop(Iop_64to32, IRExpr_Get( OFFB_RDX, Ity_I64 ));
   1069       case 8: return IRExpr_Get( OFFB_RDX, Ity_I64 );
   1070       default: vpanic("getIRegRDX(amd64)");
   1071    }
   1072 }
   1073 
   1074 static void putIRegRDX ( Int sz, IRExpr* e )
   1075 {
   1076    vassert(host_endness == VexEndnessLE);
   1077    vassert(typeOfIRExpr(irsb->tyenv, e) == szToITy(sz));
   1078    switch (sz) {
   1079       case 8: stmt( IRStmt_Put( OFFB_RDX, e ));
   1080               break;
   1081       case 4: stmt( IRStmt_Put( OFFB_RDX, unop(Iop_32Uto64,e) ));
   1082               break;
   1083       case 2: stmt( IRStmt_Put( OFFB_RDX, e ));
   1084               break;
   1085       case 1: stmt( IRStmt_Put( OFFB_RDX, e ));
   1086               break;
   1087       default: vpanic("putIRegRDX(amd64)");
   1088    }
   1089 }
   1090 
   1091 
   1092 /* Simplistic functions to deal with the integer registers as a
   1093    straightforward bank of 16 64-bit regs. */
   1094 
/* Read 64-bit integer register 0..15 in full. */
static IRExpr* getIReg64 ( UInt regno )
{
   return IRExpr_Get( integerGuestReg64Offset(regno),
                      Ity_I64 );
}
   1100 
/* Write 64-bit integer register 0..15 in full. */
static void putIReg64 ( UInt regno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   stmt( IRStmt_Put( integerGuestReg64Offset(regno), e ) );
}
   1106 
/* Name of 64-bit integer register 0..15, for printing. */
static const HChar* nameIReg64 ( UInt regno )
{
   return nameIReg( 8, regno, False );
}
   1111 
   1112 
   1113 /* Simplistic functions to deal with the lower halves of integer
   1114    registers as a straightforward bank of 16 32-bit regs. */
   1115 
/* Read the low 32 bits of integer register 0..15, as a 64-bit Get
   followed by a narrow. */
static IRExpr* getIReg32 ( UInt regno )
{
   vassert(host_endness == VexEndnessLE);
   return unop(Iop_64to32,
               IRExpr_Get( integerGuestReg64Offset(regno),
                           Ity_I64 ));
}
   1123 
/* Write the low 32 bits of integer register 0..15, zero-extending into
   the full 64-bit register per the amd64 rule for 32-bit writes. */
static void putIReg32 ( UInt regno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   stmt( IRStmt_Put( integerGuestReg64Offset(regno),
                     unop(Iop_32Uto64,e) ) );
}
   1130 
/* Name of the 32-bit slice of integer register 0..15, for printing. */
static const HChar* nameIReg32 ( UInt regno )
{
   return nameIReg( 4, regno, False );
}
   1135 
   1136 
   1137 /* Simplistic functions to deal with the lower quarters of integer
   1138    registers as a straightforward bank of 16 16-bit regs. */
   1139 
/* Read the low 16 bits of integer register 0..15 (valid only on a
   little-endian host, where the low half is at the base offset). */
static IRExpr* getIReg16 ( UInt regno )
{
   vassert(host_endness == VexEndnessLE);
   return IRExpr_Get( integerGuestReg64Offset(regno),
                      Ity_I16 );
}
   1146 
/* Write the low 16 bits of integer register 0..15.  NOTE(review): this
   widens with Iop_16Uto64 and writes the full 64 bits, zeroing bits
   16..63 -- unlike putIRegRAX/putIRegRDX, which do a partial 16-bit
   Put.  Presumably all callers only use this where the upper bits are
   dead; confirm against call sites. */
static void putIReg16 ( UInt regno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
   stmt( IRStmt_Put( integerGuestReg64Offset(regno),
                     unop(Iop_16Uto64,e) ) );
}
   1153 
/* Name of the 16-bit slice of integer register 0..15, for printing. */
static const HChar* nameIReg16 ( UInt regno )
{
   return nameIReg( 2, regno, False );
}
   1158 
   1159 
   1160 /* Sometimes what we know is a 3-bit register number, a REX byte, and
   1161    which field of the REX byte is to be used to extend to a 4-bit
   1162    number.  These functions cater for that situation.
   1163 */
static IRExpr* getIReg64rexX ( Prefix pfx, UInt lo3bits )
{
   /* REX.X supplies bit 3 of the register number. */
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   return getIReg64( lo3bits | (getRexX(pfx) << 3) );
}
   1170 
static const HChar* nameIReg64rexX ( Prefix pfx, UInt lo3bits )
{
   /* REX.X supplies bit 3 of the register number. */
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   return nameIReg( 8, lo3bits | (getRexX(pfx) << 3), False );
}
   1177 
static const HChar* nameIRegRexB ( Int sz, Prefix pfx, UInt lo3bits )
{
   /* REX.B supplies bit 3 of the register number.  The sz==1, no-REX
      case selects the legacy %ah..%bh naming ('irregular'). */
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   return nameIReg( sz, lo3bits | (getRexB(pfx) << 3),
                        toBool(sz==1 && !haveREX(pfx)) );
}
   1186 
   1187 static IRExpr* getIRegRexB ( Int sz, Prefix pfx, UInt lo3bits )
   1188 {
   1189    vassert(lo3bits < 8);
   1190    vassert(IS_VALID_PFX(pfx));
   1191    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   1192    if (sz == 4) {
   1193       sz = 8;
   1194       return unop(Iop_64to32,
   1195                   IRExpr_Get(
   1196                      offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
   1197                                      False/*!irregular*/ ),
   1198                      szToITy(sz)
   1199                  )
   1200              );
   1201    } else {
   1202       return IRExpr_Get(
   1203                 offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
   1204                                 toBool(sz==1 && !haveREX(pfx)) ),
   1205                 szToITy(sz)
   1206              );
   1207    }
   1208 }
   1209 
   1210 static void putIRegRexB ( Int sz, Prefix pfx, UInt lo3bits, IRExpr* e )
   1211 {
   1212    vassert(lo3bits < 8);
   1213    vassert(IS_VALID_PFX(pfx));
   1214    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   1215    vassert(typeOfIRExpr(irsb->tyenv, e) == szToITy(sz));
   1216    stmt( IRStmt_Put(
   1217             offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
   1218                             toBool(sz==1 && !haveREX(pfx)) ),
   1219             sz==4 ? unop(Iop_32Uto64,e) : e
   1220    ));
   1221 }
   1222 
   1223 
   1224 /* Functions for getting register numbers from modrm bytes and REX
   1225    when we don't have to consider the complexities of integer subreg
   1226    accesses.
   1227 */
   1228 /* Extract the g reg field from a modRM byte, and augment it using the
   1229    REX.R bit from the supplied REX byte.  The R bit usually is
   1230    associated with the g register field.
   1231 */
   1232 static UInt gregOfRexRM ( Prefix pfx, UChar mod_reg_rm )
   1233 {
   1234    Int reg = (Int)( (mod_reg_rm >> 3) & 7 );
   1235    reg += (pfx & PFX_REXR) ? 8 : 0;
   1236    return reg;
   1237 }
   1238 
   1239 /* Extract the e reg field from a modRM byte, and augment it using the
   1240    REX.B bit from the supplied REX byte.  The B bit usually is
   1241    associated with the e register field (when modrm indicates e is a
   1242    register, that is).
   1243 */
   1244 static UInt eregOfRexRM ( Prefix pfx, UChar mod_reg_rm )
   1245 {
   1246    Int rm;
   1247    vassert(epartIsReg(mod_reg_rm));
   1248    rm = (Int)(mod_reg_rm & 0x7);
   1249    rm += (pfx & PFX_REXB) ? 8 : 0;
   1250    return rm;
   1251 }
   1252 
   1253 
   1254 /* General functions for dealing with integer register access. */
   1255 
   1256 /* Produce the guest state offset for a reference to the 'g' register
   1257    field in a modrm byte, taking into account REX (or its absence),
   1258    and the size of the access.
   1259 */
static UInt offsetIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   UInt reg;
   vassert(host_endness == VexEndnessLE);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   reg = gregOfRexRM( pfx, mod_reg_rm );
   /* sz==1 with no REX selects the legacy %ah..%bh layout. */
   return offsetIReg( sz, reg, toBool(sz == 1 && !haveREX(pfx)) );
}
   1269 
   1270 static
   1271 IRExpr* getIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
   1272 {
   1273    if (sz == 4) {
   1274       sz = 8;
   1275       return unop(Iop_64to32,
   1276                   IRExpr_Get( offsetIRegG( sz, pfx, mod_reg_rm ),
   1277                               szToITy(sz) ));
   1278    } else {
   1279       return IRExpr_Get( offsetIRegG( sz, pfx, mod_reg_rm ),
   1280                          szToITy(sz) );
   1281    }
   1282 }
   1283 
   1284 static
   1285 void putIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm, IRExpr* e )
   1286 {
   1287    vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   1288    if (sz == 4) {
   1289       e = unop(Iop_32Uto64,e);
   1290    }
   1291    stmt( IRStmt_Put( offsetIRegG( sz, pfx, mod_reg_rm ), e ) );
   1292 }
   1293 
/* Name of the 'g' register of a modrm byte, for printing. */
static
const HChar* nameIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   return nameIReg( sz, gregOfRexRM(pfx,mod_reg_rm),
                        toBool(sz==1 && !haveREX(pfx)) );
}
   1300 
   1301 
   1302 static
   1303 IRExpr* getIRegV ( Int sz, Prefix pfx )
   1304 {
   1305    if (sz == 4) {
   1306       sz = 8;
   1307       return unop(Iop_64to32,
   1308                   IRExpr_Get( offsetIReg( sz, getVexNvvvv(pfx), False ),
   1309                               szToITy(sz) ));
   1310    } else {
   1311       return IRExpr_Get( offsetIReg( sz, getVexNvvvv(pfx), False ),
   1312                          szToITy(sz) );
   1313    }
   1314 }
   1315 
   1316 static
   1317 void putIRegV ( Int sz, Prefix pfx, IRExpr* e )
   1318 {
   1319    vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   1320    if (sz == 4) {
   1321       e = unop(Iop_32Uto64,e);
   1322    }
   1323    stmt( IRStmt_Put( offsetIReg( sz, getVexNvvvv(pfx), False ), e ) );
   1324 }
   1325 
/* Name of the register in the VEX.vvvv field, for printing. */
static
const HChar* nameIRegV ( Int sz, Prefix pfx )
{
   return nameIReg( sz, getVexNvvvv(pfx), False );
}
   1331 
   1332 
   1333 
   1334 /* Produce the guest state offset for a reference to the 'e' register
   1335    field in a modrm byte, taking into account REX (or its absence),
   1336    and the size of the access.  eregOfRexRM will assert if mod_reg_rm
   1337    denotes a memory access rather than a register access.
   1338 */
static UInt offsetIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   UInt reg;
   vassert(host_endness == VexEndnessLE);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   reg = eregOfRexRM( pfx, mod_reg_rm );
   /* sz==1 with no REX selects the legacy %ah..%bh layout. */
   return offsetIReg( sz, reg, toBool(sz == 1 && !haveREX(pfx)) );
}
   1348 
   1349 static
   1350 IRExpr* getIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
   1351 {
   1352    if (sz == 4) {
   1353       sz = 8;
   1354       return unop(Iop_64to32,
   1355                   IRExpr_Get( offsetIRegE( sz, pfx, mod_reg_rm ),
   1356                               szToITy(sz) ));
   1357    } else {
   1358       return IRExpr_Get( offsetIRegE( sz, pfx, mod_reg_rm ),
   1359                          szToITy(sz) );
   1360    }
   1361 }
   1362 
   1363 static
   1364 void putIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm, IRExpr* e )
   1365 {
   1366    vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   1367    if (sz == 4) {
   1368       e = unop(Iop_32Uto64,e);
   1369    }
   1370    stmt( IRStmt_Put( offsetIRegE( sz, pfx, mod_reg_rm ), e ) );
   1371 }
   1372 
/* Name of the 'e' register of a modrm byte, for printing. */
static
const HChar* nameIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   return nameIReg( sz, eregOfRexRM(pfx,mod_reg_rm),
                        toBool(sz==1 && !haveREX(pfx)) );
}
   1379 
   1380 
   1381 /*------------------------------------------------------------*/
   1382 /*--- For dealing with XMM registers                       ---*/
   1383 /*------------------------------------------------------------*/
   1384 
/* Guest-state offset of ymm register 0..15. */
static Int ymmGuestRegOffset ( UInt ymmreg )
{
   switch (ymmreg) {
      case 0:  return OFFB_YMM0;
      case 1:  return OFFB_YMM1;
      case 2:  return OFFB_YMM2;
      case 3:  return OFFB_YMM3;
      case 4:  return OFFB_YMM4;
      case 5:  return OFFB_YMM5;
      case 6:  return OFFB_YMM6;
      case 7:  return OFFB_YMM7;
      case 8:  return OFFB_YMM8;
      case 9:  return OFFB_YMM9;
      case 10: return OFFB_YMM10;
      case 11: return OFFB_YMM11;
      case 12: return OFFB_YMM12;
      case 13: return OFFB_YMM13;
      case 14: return OFFB_YMM14;
      case 15: return OFFB_YMM15;
      default: vpanic("ymmGuestRegOffset(amd64)");
   }
}
   1407 
/* Guest-state offset of xmm register 0..15, which is the low half of
   the corresponding ymm register (hence LE-host-only). */
static Int xmmGuestRegOffset ( UInt xmmreg )
{
   /* Correct for little-endian host only. */
   vassert(host_endness == VexEndnessLE);
   return ymmGuestRegOffset( xmmreg );
}
   1414 
   1415 /* Lanes of vector registers are always numbered from zero being the
   1416    least significant lane (rightmost in the register).  */
   1417 
/* Offset of 16-bit lane 'laneno' (0 = least significant) of an xmm reg. */
static Int xmmGuestRegLane16offset ( UInt xmmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(host_endness == VexEndnessLE);
   vassert(laneno >= 0 && laneno < 8);
   return xmmGuestRegOffset( xmmreg ) + 2 * laneno;
}
   1425 
/* Offset of 32-bit lane 'laneno' (0 = least significant) of an xmm reg. */
static Int xmmGuestRegLane32offset ( UInt xmmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(host_endness == VexEndnessLE);
   vassert(laneno >= 0 && laneno < 4);
   return xmmGuestRegOffset( xmmreg ) + 4 * laneno;
}
   1433 
/* Offset of 64-bit lane 'laneno' (0 = least significant) of an xmm reg. */
static Int xmmGuestRegLane64offset ( UInt xmmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(host_endness == VexEndnessLE);
   vassert(laneno >= 0 && laneno < 2);
   return xmmGuestRegOffset( xmmreg ) + 8 * laneno;
}
   1441 
/* Offset of 128-bit lane 'laneno' (0 = least significant) of a ymm reg. */
static Int ymmGuestRegLane128offset ( UInt ymmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(host_endness == VexEndnessLE);
   vassert(laneno >= 0 && laneno < 2);
   return ymmGuestRegOffset( ymmreg ) + 16 * laneno;
}
   1449 
/* Offset of 64-bit lane 'laneno' (0 = least significant) of a ymm reg. */
static Int ymmGuestRegLane64offset ( UInt ymmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(host_endness == VexEndnessLE);
   vassert(laneno >= 0 && laneno < 4);
   return ymmGuestRegOffset( ymmreg ) + 8 * laneno;
}
   1457 
/* Offset of 32-bit lane 'laneno' (0 = least significant) of a ymm reg. */
static Int ymmGuestRegLane32offset ( UInt ymmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(host_endness == VexEndnessLE);
   vassert(laneno >= 0 && laneno < 8);
   return ymmGuestRegOffset( ymmreg ) + 4 * laneno;
}
   1465 
/* Read an entire xmm register as a V128. */
static IRExpr* getXMMReg ( UInt xmmreg )
{
   return IRExpr_Get( xmmGuestRegOffset(xmmreg), Ity_V128 );
}
   1470 
/* Read xmm 64-bit lane 'laneno' as an integer (Ity_I64). */
static IRExpr* getXMMRegLane64 ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_I64 );
}
   1475 
/* Read xmm 64-bit lane 'laneno' as a double (Ity_F64). */
static IRExpr* getXMMRegLane64F ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_F64 );
}
   1480 
/* Read xmm 32-bit lane 'laneno' as an integer (Ity_I32). */
static IRExpr* getXMMRegLane32 ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_I32 );
}
   1485 
/* Read xmm 32-bit lane 'laneno' as a float (Ity_F32). */
static IRExpr* getXMMRegLane32F ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_F32 );
}
   1490 
/* Read xmm 16-bit lane 'laneno' as an integer (Ity_I16). */
static IRExpr* getXMMRegLane16 ( UInt xmmreg, Int laneno )
{
  return IRExpr_Get( xmmGuestRegLane16offset(xmmreg,laneno), Ity_I16 );
}
   1495 
/* Write an entire xmm register from a V128.  Note: does not touch the
   upper (ymm) half. */
static void putXMMReg ( UInt xmmreg, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
   stmt( IRStmt_Put( xmmGuestRegOffset(xmmreg), e ) );
}
   1501 
/* Write xmm 64-bit lane 'laneno' from an integer (Ity_I64). */
static void putXMMRegLane64 ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
}
   1507 
/* Write xmm 64-bit lane 'laneno' from a double (Ity_F64). */
static void putXMMRegLane64F ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
   stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
}
   1513 
/* Write xmm 32-bit lane 'laneno' from a float (Ity_F32). */
static void putXMMRegLane32F ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
   stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
}
   1519 
/* Write xmm 32-bit lane 'laneno' from an integer (Ity_I32). */
static void putXMMRegLane32 ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
}
   1525 
/* Read an entire ymm register as a V256.  (Parameter name 'xmmreg' is
   historical; it is a ymm register number 0..15.) */
static IRExpr* getYMMReg ( UInt xmmreg )
{
   return IRExpr_Get( ymmGuestRegOffset(xmmreg), Ity_V256 );
}
   1530 
/* Read ymm 128-bit lane 'laneno' (0 = low half) as a V128. */
static IRExpr* getYMMRegLane128 ( UInt ymmreg, Int laneno )
{
   return IRExpr_Get( ymmGuestRegLane128offset(ymmreg,laneno), Ity_V128 );
}
   1535 
/* Read ymm 64-bit lane 'laneno' as a double (Ity_F64). */
static IRExpr* getYMMRegLane64F ( UInt ymmreg, Int laneno )
{
   return IRExpr_Get( ymmGuestRegLane64offset(ymmreg,laneno), Ity_F64 );
}
   1540 
/* Read ymm 64-bit lane 'laneno' as an integer (Ity_I64). */
static IRExpr* getYMMRegLane64 ( UInt ymmreg, Int laneno )
{
   return IRExpr_Get( ymmGuestRegLane64offset(ymmreg,laneno), Ity_I64 );
}
   1545 
/* Read ymm 32-bit lane 'laneno' as a float (Ity_F32). */
static IRExpr* getYMMRegLane32F ( UInt ymmreg, Int laneno )
{
   return IRExpr_Get( ymmGuestRegLane32offset(ymmreg,laneno), Ity_F32 );
}
   1550 
/* Read ymm 32-bit lane 'laneno' as an integer (Ity_I32). */
static IRExpr* getYMMRegLane32 ( UInt ymmreg, Int laneno )
{
   return IRExpr_Get( ymmGuestRegLane32offset(ymmreg,laneno), Ity_I32 );
}
   1555 
/* Write an entire ymm register from a V256. */
static void putYMMReg ( UInt ymmreg, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V256);
   stmt( IRStmt_Put( ymmGuestRegOffset(ymmreg), e ) );
}
   1561 
/* Write ymm 128-bit lane 'laneno' (0 = low half) from a V128. */
static void putYMMRegLane128 ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
   stmt( IRStmt_Put( ymmGuestRegLane128offset(ymmreg,laneno), e ) );
}
   1567 
/* Write ymm 64-bit lane 'laneno' from a double (Ity_F64). */
static void putYMMRegLane64F ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
   stmt( IRStmt_Put( ymmGuestRegLane64offset(ymmreg,laneno), e ) );
}
   1573 
/* Write ymm 64-bit lane 'laneno' from an integer (Ity_I64). */
static void putYMMRegLane64 ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   stmt( IRStmt_Put( ymmGuestRegLane64offset(ymmreg,laneno), e ) );
}
   1579 
/* Write ymm 32-bit lane 'laneno' from a float (Ity_F32). */
static void putYMMRegLane32F ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
   stmt( IRStmt_Put( ymmGuestRegLane32offset(ymmreg,laneno), e ) );
}
   1585 
/* Write ymm 32-bit lane 'laneno' from an integer (Ity_I32). */
static void putYMMRegLane32 ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   stmt( IRStmt_Put( ymmGuestRegLane32offset(ymmreg,laneno), e ) );
}
   1591 
/* Build a V128 constant.  Per the IRConst_V128 convention, each of the
   16 mask bits describes one byte of the value: bit i set means byte i
   is 0xFF, clear means 0x00.  So mkV128(0) is all-zeroes. */
static IRExpr* mkV128 ( UShort mask )
{
   return IRExpr_Const(IRConst_V128(mask));
}
   1596 
   1597 /* Write the low half of a YMM reg and zero out the upper half. */
static void putYMMRegLoAndZU ( UInt ymmreg, IRExpr* e )
{
   putYMMRegLane128( ymmreg, 0, e );
   /* Zero the upper 128 bits, as VEX-encoded 128-bit ops require. */
   putYMMRegLane128( ymmreg, 1, mkV128(0) );
}
   1603 
/* Logical AND of two Ity_I1 values, computed by widening both to 64
   bits, And64-ing, and narrowing back (IR has no 1-bit And). */
static IRExpr* mkAnd1 ( IRExpr* x, IRExpr* y )
{
   vassert(typeOfIRExpr(irsb->tyenv,x) == Ity_I1);
   vassert(typeOfIRExpr(irsb->tyenv,y) == Ity_I1);
   return unop(Iop_64to1,
               binop(Iop_And64,
                     unop(Iop_1Uto64,x),
                     unop(Iop_1Uto64,y)));
}
   1613 
   1614 /* Generate a compare-and-swap operation, operating on memory at
   1615    'addr'.  The expected value is 'expVal' and the new value is
   1616    'newVal'.  If the operation fails, then transfer control (with a
   1617    no-redir jump (XXX no -- see comment at top of this file)) to
   1618    'restart_point', which is presumably the address of the guest
   1619    instruction again -- retrying, essentially. */
static void casLE ( IRExpr* addr, IRExpr* expVal, IRExpr* newVal,
                    Addr64 restart_point )
{
   IRCAS* cas;
   IRType tyE    = typeOfIRExpr(irsb->tyenv, expVal);
   IRType tyN    = typeOfIRExpr(irsb->tyenv, newVal);
   IRTemp oldTmp = newTemp(tyE);
   IRTemp expTmp = newTemp(tyE);
   /* Expected and new values must have the same (integer) type. */
   vassert(tyE == tyN);
   vassert(tyE == Ity_I64 || tyE == Ity_I32
           || tyE == Ity_I16 || tyE == Ity_I8);
   assign(expTmp, expVal);
   /* Single-element little-endian CAS; oldTmp receives the value that
      was actually found in memory. */
   cas = mkIRCAS( IRTemp_INVALID, oldTmp, Iend_LE, addr,
                  NULL, mkexpr(expTmp), NULL, newVal );
   stmt( IRStmt_CAS(cas) );
   /* If memory held something other than the expected value, the CAS
      failed: jump back to restart_point (the guest instruction's own
      address) so it is retried. */
   stmt( IRStmt_Exit(
            binop( mkSizedOp(tyE,Iop_CasCmpNE8),
                   mkexpr(oldTmp), mkexpr(expTmp) ),
            Ijk_Boring, /*Ijk_NoRedir*/
            IRConst_U64( restart_point ),
            OFFB_RIP
         ));
}
   1643 
   1644 
   1645 /*------------------------------------------------------------*/
   1646 /*--- Helpers for %rflags.                                 ---*/
   1647 /*------------------------------------------------------------*/
   1648 
   1649 /* -------------- Evaluating the flags-thunk. -------------- */
   1650 
   1651 /* Build IR to calculate all the eflags from stored
   1652    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
   1653    Ity_I64. */
   1654 static IRExpr* mk_amd64g_calculate_rflags_all ( void )
   1655 {
   1656    IRExpr** args
   1657       = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
   1658                        IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
   1659                        IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
   1660                        IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   1661    IRExpr* call
   1662       = mkIRExprCCall(
   1663            Ity_I64,
   1664            0/*regparm*/,
   1665            "amd64g_calculate_rflags_all", &amd64g_calculate_rflags_all,
   1666            args
   1667         );
   1668    /* Exclude OP and NDEP from definedness checking.  We're only
   1669       interested in DEP1 and DEP2. */
   1670    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   1671    return call;
   1672 }
   1673 
   1674 /* Build IR to calculate some particular condition from stored
   1675    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
   1676    Ity_Bit. */
   1677 static IRExpr* mk_amd64g_calculate_condition ( AMD64Condcode cond )
   1678 {
   1679    IRExpr** args
   1680       = mkIRExprVec_5( mkU64(cond),
   1681                        IRExpr_Get(OFFB_CC_OP,   Ity_I64),
   1682                        IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
   1683                        IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
   1684                        IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   1685    IRExpr* call
   1686       = mkIRExprCCall(
   1687            Ity_I64,
   1688            0/*regparm*/,
   1689            "amd64g_calculate_condition", &amd64g_calculate_condition,
   1690            args
   1691         );
   1692    /* Exclude the requested condition, OP and NDEP from definedness
   1693       checking.  We're only interested in DEP1 and DEP2. */
   1694    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<1) | (1<<4);
   1695    return unop(Iop_64to1, call);
   1696 }
   1697 
   1698 /* Build IR to calculate just the carry flag from stored
   1699    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression :: Ity_I64. */
   1700 static IRExpr* mk_amd64g_calculate_rflags_c ( void )
   1701 {
   1702    IRExpr** args
   1703       = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
   1704                        IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
   1705                        IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
   1706                        IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   1707    IRExpr* call
   1708       = mkIRExprCCall(
   1709            Ity_I64,
   1710            0/*regparm*/,
   1711            "amd64g_calculate_rflags_c", &amd64g_calculate_rflags_c,
   1712            args
   1713         );
   1714    /* Exclude OP and NDEP from definedness checking.  We're only
   1715       interested in DEP1 and DEP2. */
   1716    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   1717    return call;
   1718 }
   1719 
   1720 
   1721 /* -------------- Building the flags-thunk. -------------- */
   1722 
   1723 /* The machinery in this section builds the flag-thunk following a
   1724    flag-setting operation.  Hence the various setFlags_* functions.
   1725 */
   1726 
   1727 static Bool isAddSub ( IROp op8 )
   1728 {
   1729    return toBool(op8 == Iop_Add8 || op8 == Iop_Sub8);
   1730 }
   1731 
   1732 static Bool isLogic ( IROp op8 )
   1733 {
   1734    return toBool(op8 == Iop_And8 || op8 == Iop_Or8 || op8 == Iop_Xor8);
   1735 }
   1736 
   1737 /* U-widen 1/8/16/32/64 bit int expr to 64. */
   1738 static IRExpr* widenUto64 ( IRExpr* e )
   1739 {
   1740    switch (typeOfIRExpr(irsb->tyenv,e)) {
   1741       case Ity_I64: return e;
   1742       case Ity_I32: return unop(Iop_32Uto64, e);
   1743       case Ity_I16: return unop(Iop_16Uto64, e);
   1744       case Ity_I8:  return unop(Iop_8Uto64, e);
   1745       case Ity_I1:  return unop(Iop_1Uto64, e);
   1746       default: vpanic("widenUto64");
   1747    }
   1748 }
   1749 
/* S-widen 8/16/32/64 bit int expr to 64. */
static IRExpr* widenSto64 ( IRExpr* e )
{
   switch (typeOfIRExpr(irsb->tyenv,e)) {
      case Ity_I64: return e;   /* already full width */
      case Ity_I32: return unop(Iop_32Sto64, e);
      case Ity_I16: return unop(Iop_16Sto64, e);
      case Ity_I8:  return unop(Iop_8Sto64, e);
      default: vpanic("widenSto64");
   }
}
   1761 
   1762 /* Narrow 8/16/32/64 bit int expr to 8/16/32/64.  Clearly only some
   1763    of these combinations make sense. */
   1764 static IRExpr* narrowTo ( IRType dst_ty, IRExpr* e )
   1765 {
   1766    IRType src_ty = typeOfIRExpr(irsb->tyenv,e);
   1767    if (src_ty == dst_ty)
   1768       return e;
   1769    if (src_ty == Ity_I32 && dst_ty == Ity_I16)
   1770       return unop(Iop_32to16, e);
   1771    if (src_ty == Ity_I32 && dst_ty == Ity_I8)
   1772       return unop(Iop_32to8, e);
   1773    if (src_ty == Ity_I64 && dst_ty == Ity_I32)
   1774       return unop(Iop_64to32, e);
   1775    if (src_ty == Ity_I64 && dst_ty == Ity_I16)
   1776       return unop(Iop_64to16, e);
   1777    if (src_ty == Ity_I64 && dst_ty == Ity_I8)
   1778       return unop(Iop_64to8, e);
   1779 
   1780    vex_printf("\nsrc, dst tys are: ");
   1781    ppIRType(src_ty);
   1782    vex_printf(", ");
   1783    ppIRType(dst_ty);
   1784    vex_printf("\n");
   1785    vpanic("narrowTo(amd64)");
   1786 }
   1787 
   1788 
   1789 /* Set the flags thunk OP, DEP1 and DEP2 fields.  The supplied op is
   1790    auto-sized up to the real op. */
   1791 
   1792 static
   1793 void setFlags_DEP1_DEP2 ( IROp op8, IRTemp dep1, IRTemp dep2, IRType ty )
   1794 {
   1795    Int ccOp = 0;
   1796    switch (ty) {
   1797       case Ity_I8:  ccOp = 0; break;
   1798       case Ity_I16: ccOp = 1; break;
   1799       case Ity_I32: ccOp = 2; break;
   1800       case Ity_I64: ccOp = 3; break;
   1801       default: vassert(0);
   1802    }
   1803    switch (op8) {
   1804       case Iop_Add8: ccOp += AMD64G_CC_OP_ADDB;   break;
   1805       case Iop_Sub8: ccOp += AMD64G_CC_OP_SUBB;   break;
   1806       default:       ppIROp(op8);
   1807                      vpanic("setFlags_DEP1_DEP2(amd64)");
   1808    }
   1809    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   1810    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dep1))) );
   1811    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(dep2))) );
   1812 }
   1813 
   1814 
   1815 /* Set the OP and DEP1 fields only, and write zero to DEP2. */
   1816 
   1817 static
   1818 void setFlags_DEP1 ( IROp op8, IRTemp dep1, IRType ty )
   1819 {
   1820    Int ccOp = 0;
   1821    switch (ty) {
   1822       case Ity_I8:  ccOp = 0; break;
   1823       case Ity_I16: ccOp = 1; break;
   1824       case Ity_I32: ccOp = 2; break;
   1825       case Ity_I64: ccOp = 3; break;
   1826       default: vassert(0);
   1827    }
   1828    switch (op8) {
   1829       case Iop_Or8:
   1830       case Iop_And8:
   1831       case Iop_Xor8: ccOp += AMD64G_CC_OP_LOGICB; break;
   1832       default:       ppIROp(op8);
   1833                      vpanic("setFlags_DEP1(amd64)");
   1834    }
   1835    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   1836    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dep1))) );
   1837    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
   1838 }
   1839 
   1840 
   1841 /* For shift operations, we put in the result and the undershifted
   1842    result.  Except if the shift amount is zero, the thunk is left
   1843    unchanged. */
   1844 
static void setFlags_DEP1_DEP2_shift ( IROp    op64,
                                       IRTemp  res,
                                       IRTemp  resUS,
                                       IRType  ty,
                                       IRTemp  guard )
{
   /* Size adjustment: 0..3 for 8/16/32/64-bit ops respectively. */
   Int ccOp = 0;
   switch (ty) {
      case Ity_I8:  ccOp = 0; break;
      case Ity_I16: ccOp = 1; break;
      case Ity_I32: ccOp = 2; break;
      case Ity_I64: ccOp = 3; break;
      default: vassert(0);
   }

   vassert(guard);

   /* Both kinds of right shifts are handled by the same thunk
      operation. */
   switch (op64) {
      case Iop_Shr64:
      case Iop_Sar64: ccOp += AMD64G_CC_OP_SHRB; break;
      case Iop_Shl64: ccOp += AMD64G_CC_OP_SHLB; break;
      default:        ppIROp(op64);
                      vpanic("setFlags_DEP1_DEP2_shift(amd64)");
   }

   /* guard :: Ity_I8.  We need to convert it to I1. */
   IRTemp guardB = newTemp(Ity_I1);
   assign( guardB, binop(Iop_CmpNE8, mkexpr(guard), mkU8(0)) );

   /* DEP1 contains the result, DEP2 contains the undershifted value.
      Each thunk field is updated only when the guard (shift amount
      != 0) holds; otherwise the old field value is kept, as required
      by x86 shift-by-zero semantics. */
   stmt( IRStmt_Put( OFFB_CC_OP,
                     IRExpr_ITE( mkexpr(guardB),
                                 mkU64(ccOp),
                                 IRExpr_Get(OFFB_CC_OP,Ity_I64) ) ));
   stmt( IRStmt_Put( OFFB_CC_DEP1,
                     IRExpr_ITE( mkexpr(guardB),
                                 widenUto64(mkexpr(res)),
                                 IRExpr_Get(OFFB_CC_DEP1,Ity_I64) ) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2,
                     IRExpr_ITE( mkexpr(guardB),
                                 widenUto64(mkexpr(resUS)),
                                 IRExpr_Get(OFFB_CC_DEP2,Ity_I64) ) ));
}
   1890 
   1891 
   1892 /* For the inc/dec case, we store in DEP1 the result value and in NDEP
   1893    the former value of the carry flag, which unfortunately we have to
   1894    compute. */
   1895 
   1896 static void setFlags_INC_DEC ( Bool inc, IRTemp res, IRType ty )
   1897 {
   1898    Int ccOp = inc ? AMD64G_CC_OP_INCB : AMD64G_CC_OP_DECB;
   1899 
   1900    switch (ty) {
   1901       case Ity_I8:  ccOp += 0; break;
   1902       case Ity_I16: ccOp += 1; break;
   1903       case Ity_I32: ccOp += 2; break;
   1904       case Ity_I64: ccOp += 3; break;
   1905       default: vassert(0);
   1906    }
   1907 
   1908    /* This has to come first, because calculating the C flag
   1909       may require reading all four thunk fields. */
   1910    stmt( IRStmt_Put( OFFB_CC_NDEP, mk_amd64g_calculate_rflags_c()) );
   1911    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   1912    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(res))) );
   1913    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
   1914 }
   1915 
   1916 
   1917 /* Multiplies are pretty much like add and sub: DEP1 and DEP2 hold the
   1918    two arguments. */
   1919 
   1920 static
   1921 void setFlags_MUL ( IRType ty, IRTemp arg1, IRTemp arg2, ULong base_op )
   1922 {
   1923    switch (ty) {
   1924       case Ity_I8:
   1925          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+0) ) );
   1926          break;
   1927       case Ity_I16:
   1928          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+1) ) );
   1929          break;
   1930       case Ity_I32:
   1931          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+2) ) );
   1932          break;
   1933       case Ity_I64:
   1934          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+3) ) );
   1935          break;
   1936       default:
   1937          vpanic("setFlags_MUL(amd64)");
   1938    }
   1939    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(arg1)) ));
   1940    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(arg2)) ));
   1941 }
   1942 
   1943 
   1944 /* -------------- Condition codes. -------------- */
   1945 
   1946 /* Condition codes, using the AMD encoding.  */
   1947 
   1948 static const HChar* name_AMD64Condcode ( AMD64Condcode cond )
   1949 {
   1950    switch (cond) {
   1951       case AMD64CondO:      return "o";
   1952       case AMD64CondNO:     return "no";
   1953       case AMD64CondB:      return "b";
   1954       case AMD64CondNB:     return "ae"; /*"nb";*/
   1955       case AMD64CondZ:      return "e"; /*"z";*/
   1956       case AMD64CondNZ:     return "ne"; /*"nz";*/
   1957       case AMD64CondBE:     return "be";
   1958       case AMD64CondNBE:    return "a"; /*"nbe";*/
   1959       case AMD64CondS:      return "s";
   1960       case AMD64CondNS:     return "ns";
   1961       case AMD64CondP:      return "p";
   1962       case AMD64CondNP:     return "np";
   1963       case AMD64CondL:      return "l";
   1964       case AMD64CondNL:     return "ge"; /*"nl";*/
   1965       case AMD64CondLE:     return "le";
   1966       case AMD64CondNLE:    return "g"; /*"nle";*/
   1967       case AMD64CondAlways: return "ALWAYS";
   1968       default: vpanic("name_AMD64Condcode");
   1969    }
   1970 }
   1971 
   1972 static
   1973 AMD64Condcode positiveIse_AMD64Condcode ( AMD64Condcode  cond,
   1974                                           /*OUT*/Bool*   needInvert )
   1975 {
   1976    vassert(cond >= AMD64CondO && cond <= AMD64CondNLE);
   1977    if (cond & 1) {
   1978       *needInvert = True;
   1979       return cond-1;
   1980    } else {
   1981       *needInvert = False;
   1982       return cond;
   1983    }
   1984 }
   1985 
   1986 
   1987 /* -------------- Helpers for ADD/SUB with carry. -------------- */
   1988 
   1989 /* Given ta1, ta2 and tres, compute tres = ADC(ta1,ta2) and set flags
   1990    appropriately.
   1991 
   1992    Optionally, generate a store for the 'tres' value.  This can either
   1993    be a normal store, or it can be a cas-with-possible-failure style
   1994    store:
   1995 
   1996    if taddr is IRTemp_INVALID, then no store is generated.
   1997 
   1998    if taddr is not IRTemp_INVALID, then a store (using taddr as
   1999    the address) is generated:
   2000 
   2001      if texpVal is IRTemp_INVALID then a normal store is
   2002      generated, and restart_point must be zero (it is irrelevant).
   2003 
   2004      if texpVal is not IRTemp_INVALID then a cas-style store is
   2005      generated.  texpVal is the expected value, restart_point
   2006      is the restart point if the store fails, and texpVal must
   2007      have the same type as tres.
   2008 
   2009 */
static void helper_ADC ( Int sz,
                         IRTemp tres, IRTemp ta1, IRTemp ta2,
                         /* info about optional store: */
                         IRTemp taddr, IRTemp texpVal, Addr64 restart_point )
{
   UInt    thunkOp;
   IRType  ty    = szToITy(sz);
   IRTemp  oldc  = newTemp(Ity_I64);   /* old carry flag, as I64 */
   IRTemp  oldcn = newTemp(ty);        /* old carry flag, at operand width */
   IROp    plus  = mkSizedOp(ty, Iop_Add8);
   IROp    xor   = mkSizedOp(ty, Iop_Xor8);

   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);

   switch (sz) {
      case 8:  thunkOp = AMD64G_CC_OP_ADCQ; break;
      case 4:  thunkOp = AMD64G_CC_OP_ADCL; break;
      case 2:  thunkOp = AMD64G_CC_OP_ADCW; break;
      case 1:  thunkOp = AMD64G_CC_OP_ADCB; break;
      default: vassert(0);
   }

   /* oldc = old carry flag, 0 or 1 */
   assign( oldc,  binop(Iop_And64,
                        mk_amd64g_calculate_rflags_c(),
                        mkU64(1)) );

   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );

   /* tres = ta1 + ta2 + old carry */
   assign( tres, binop(plus,
                       binop(plus,mkexpr(ta1),mkexpr(ta2)),
                       mkexpr(oldcn)) );

   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
      start of this function. */
   if (taddr != IRTemp_INVALID) {
      if (texpVal == IRTemp_INVALID) {
         /* Plain store. */
         vassert(restart_point == 0);
         storeLE( mkexpr(taddr), mkexpr(tres) );
      } else {
         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
         /* .. and hence 'texpVal' has the same type as 'tres'. */
         casLE( mkexpr(taddr),
                mkexpr(texpVal), mkexpr(tres), restart_point );
      }
   }

   /* DEP2 is ta2 ^ oldcn, so the flag helper can recover both;
      NDEP holds the old carry itself. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(thunkOp) ) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1))  ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
                                                         mkexpr(oldcn)) )) );
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
}
   2063 
   2064 
   2065 /* Given ta1, ta2 and tres, compute tres = SBB(ta1,ta2) and set flags
   2066    appropriately.  As with helper_ADC, possibly generate a store of
   2067    the result -- see comments on helper_ADC for details.
   2068 */
static void helper_SBB ( Int sz,
                         IRTemp tres, IRTemp ta1, IRTemp ta2,
                         /* info about optional store: */
                         IRTemp taddr, IRTemp texpVal, Addr64 restart_point )
{
   UInt    thunkOp;
   IRType  ty    = szToITy(sz);
   IRTemp  oldc  = newTemp(Ity_I64);   /* old carry (borrow) flag, as I64 */
   IRTemp  oldcn = newTemp(ty);        /* old carry flag, at operand width */
   IROp    minus = mkSizedOp(ty, Iop_Sub8);
   IROp    xor   = mkSizedOp(ty, Iop_Xor8);

   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);

   switch (sz) {
      case 8:  thunkOp = AMD64G_CC_OP_SBBQ; break;
      case 4:  thunkOp = AMD64G_CC_OP_SBBL; break;
      case 2:  thunkOp = AMD64G_CC_OP_SBBW; break;
      case 1:  thunkOp = AMD64G_CC_OP_SBBB; break;
      default: vassert(0);
   }

   /* oldc = old carry flag, 0 or 1 */
   assign( oldc, binop(Iop_And64,
                       mk_amd64g_calculate_rflags_c(),
                       mkU64(1)) );

   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );

   /* tres = ta1 - ta2 - old carry */
   assign( tres, binop(minus,
                       binop(minus,mkexpr(ta1),mkexpr(ta2)),
                       mkexpr(oldcn)) );

   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
      start of this function. */
   if (taddr != IRTemp_INVALID) {
      if (texpVal == IRTemp_INVALID) {
         /* Plain store. */
         vassert(restart_point == 0);
         storeLE( mkexpr(taddr), mkexpr(tres) );
      } else {
         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
         /* .. and hence 'texpVal' has the same type as 'tres'. */
         casLE( mkexpr(taddr),
                mkexpr(texpVal), mkexpr(tres), restart_point );
      }
   }

   /* DEP2 is ta2 ^ oldcn (as in helper_ADC); NDEP holds the old
      carry itself. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(thunkOp) ) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1) )) );
   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
                                                         mkexpr(oldcn)) )) );
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
}
   2122 
   2123 
   2124 /* Given ta1, ta2 and tres, compute tres = ADCX(ta1,ta2) or tres = ADOX(ta1,ta2)
   2125    and set flags appropriately.
   2126 */
static void helper_ADCX_ADOX ( Bool isADCX, Int sz,
                               IRTemp tres, IRTemp ta1, IRTemp ta2 )
{
   UInt    thunkOp;
   IRType  ty        = szToITy(sz);
   IRTemp  oldflags  = newTemp(Ity_I64);
   IRTemp  oldOC     = newTemp(Ity_I64); // old O or C flag
   IRTemp  oldOCn    = newTemp(ty);      // old O or C flag, narrowed
   IROp    plus      = mkSizedOp(ty, Iop_Add8);
   IROp    xor       = mkSizedOp(ty, Iop_Xor8);

   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);

   /* ADCX/ADOX only exist in 32- and 64-bit forms. */
   switch (sz) {
      case 8:  thunkOp = isADCX ? AMD64G_CC_OP_ADCX64
                                : AMD64G_CC_OP_ADOX64; break;
      case 4:  thunkOp = isADCX ? AMD64G_CC_OP_ADCX32
                                : AMD64G_CC_OP_ADOX32; break;
      default: vassert(0);
   }

   /* The full flags word is needed, since all flags except the one
      being updated must be preserved. */
   assign( oldflags, mk_amd64g_calculate_rflags_all() );

   /* oldOC = old overflow/carry flag, 0 or 1 */
   assign( oldOC, binop(Iop_And64,
                        binop(Iop_Shr64,
                              mkexpr(oldflags),
                              mkU8(isADCX ? AMD64G_CC_SHIFT_C
                                          : AMD64G_CC_SHIFT_O)),
                        mkU64(1)) );

   assign( oldOCn, narrowTo(ty, mkexpr(oldOC)) );

   /* tres = ta1 + ta2 + old O/C flag */
   assign( tres, binop(plus,
                       binop(plus,mkexpr(ta1),mkexpr(ta2)),
                       mkexpr(oldOCn)) );

   /* DEP2 is ta2 ^ oldOCn (as in helper_ADC); NDEP holds the entire
      old flags word so unaffected flags can be reconstructed. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(thunkOp) ) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1))  ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
                                                         mkexpr(oldOCn)) )) );
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldflags) ) );
}
   2170 
   2171 
   2172 /* -------------- Helpers for disassembly printing. -------------- */
   2173 
   2174 static const HChar* nameGrp1 ( Int opc_aux )
   2175 {
   2176    static const HChar* grp1_names[8]
   2177      = { "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" };
   2178    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp1(amd64)");
   2179    return grp1_names[opc_aux];
   2180 }
   2181 
   2182 static const HChar* nameGrp2 ( Int opc_aux )
   2183 {
   2184    static const HChar* grp2_names[8]
   2185      = { "rol", "ror", "rcl", "rcr", "shl", "shr", "shl", "sar" };
   2186    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp2(amd64)");
   2187    return grp2_names[opc_aux];
   2188 }
   2189 
   2190 static const HChar* nameGrp4 ( Int opc_aux )
   2191 {
   2192    static const HChar* grp4_names[8]
   2193      = { "inc", "dec", "???", "???", "???", "???", "???", "???" };
   2194    if (opc_aux < 0 || opc_aux > 1) vpanic("nameGrp4(amd64)");
   2195    return grp4_names[opc_aux];
   2196 }
   2197 
   2198 static const HChar* nameGrp5 ( Int opc_aux )
   2199 {
   2200    static const HChar* grp5_names[8]
   2201      = { "inc", "dec", "call*", "call*", "jmp*", "jmp*", "push", "???" };
   2202    if (opc_aux < 0 || opc_aux > 6) vpanic("nameGrp5(amd64)");
   2203    return grp5_names[opc_aux];
   2204 }
   2205 
   2206 static const HChar* nameGrp8 ( Int opc_aux )
   2207 {
   2208    static const HChar* grp8_names[8]
   2209       = { "???", "???", "???", "???", "bt", "bts", "btr", "btc" };
   2210    if (opc_aux < 4 || opc_aux > 7) vpanic("nameGrp8(amd64)");
   2211    return grp8_names[opc_aux];
   2212 }
   2213 
   2214 static const HChar* nameSReg ( UInt sreg )
   2215 {
   2216    switch (sreg) {
   2217       case R_ES: return "%es";
   2218       case R_CS: return "%cs";
   2219       case R_SS: return "%ss";
   2220       case R_DS: return "%ds";
   2221       case R_FS: return "%fs";
   2222       case R_GS: return "%gs";
   2223       default: vpanic("nameSReg(amd64)");
   2224    }
   2225 }
   2226 
   2227 static const HChar* nameMMXReg ( Int mmxreg )
   2228 {
   2229    static const HChar* mmx_names[8]
   2230      = { "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" };
   2231    if (mmxreg < 0 || mmxreg > 7) vpanic("nameMMXReg(amd64,guest)");
   2232    return mmx_names[mmxreg];
   2233 }
   2234 
   2235 static const HChar* nameXMMReg ( Int xmmreg )
   2236 {
   2237    static const HChar* xmm_names[16]
   2238      = { "%xmm0",  "%xmm1",  "%xmm2",  "%xmm3",
   2239          "%xmm4",  "%xmm5",  "%xmm6",  "%xmm7",
   2240          "%xmm8",  "%xmm9",  "%xmm10", "%xmm11",
   2241          "%xmm12", "%xmm13", "%xmm14", "%xmm15" };
   2242    if (xmmreg < 0 || xmmreg > 15) vpanic("nameXMMReg(amd64)");
   2243    return xmm_names[xmmreg];
   2244 }
   2245 
   2246 static const HChar* nameMMXGran ( Int gran )
   2247 {
   2248    switch (gran) {
   2249       case 0: return "b";
   2250       case 1: return "w";
   2251       case 2: return "d";
   2252       case 3: return "q";
   2253       default: vpanic("nameMMXGran(amd64,guest)");
   2254    }
   2255 }
   2256 
   2257 static HChar nameISize ( Int size )
   2258 {
   2259    switch (size) {
   2260       case 8: return 'q';
   2261       case 4: return 'l';
   2262       case 2: return 'w';
   2263       case 1: return 'b';
   2264       default: vpanic("nameISize(amd64)");
   2265    }
   2266 }
   2267 
   2268 static const HChar* nameYMMReg ( Int ymmreg )
   2269 {
   2270    static const HChar* ymm_names[16]
   2271      = { "%ymm0",  "%ymm1",  "%ymm2",  "%ymm3",
   2272          "%ymm4",  "%ymm5",  "%ymm6",  "%ymm7",
   2273          "%ymm8",  "%ymm9",  "%ymm10", "%ymm11",
   2274          "%ymm12", "%ymm13", "%ymm14", "%ymm15" };
   2275    if (ymmreg < 0 || ymmreg > 15) vpanic("nameYMMReg(amd64)");
   2276    return ymm_names[ymmreg];
   2277 }
   2278 
   2279 
   2280 /*------------------------------------------------------------*/
   2281 /*--- JMP helpers                                          ---*/
   2282 /*------------------------------------------------------------*/
   2283 
   2284 static void jmp_lit( /*MOD*/DisResult* dres,
   2285                      IRJumpKind kind, Addr64 d64 )
   2286 {
   2287    vassert(dres->whatNext    == Dis_Continue);
   2288    vassert(dres->len         == 0);
   2289    vassert(dres->continueAt  == 0);
   2290    vassert(dres->jk_StopHere == Ijk_INVALID);
   2291    dres->whatNext    = Dis_StopHere;
   2292    dres->jk_StopHere = kind;
   2293    stmt( IRStmt_Put( OFFB_RIP, mkU64(d64) ) );
   2294 }
   2295 
   2296 static void jmp_treg( /*MOD*/DisResult* dres,
   2297                       IRJumpKind kind, IRTemp t )
   2298 {
   2299    vassert(dres->whatNext    == Dis_Continue);
   2300    vassert(dres->len         == 0);
   2301    vassert(dres->continueAt  == 0);
   2302    vassert(dres->jk_StopHere == Ijk_INVALID);
   2303    dres->whatNext    = Dis_StopHere;
   2304    dres->jk_StopHere = kind;
   2305    stmt( IRStmt_Put( OFFB_RIP, mkexpr(t) ) );
   2306 }
   2307 
/* Terminate the block with a two-way conditional branch: go to
   'd64_true' if 'cond' holds, else fall through to 'd64_false'. */
static
void jcc_01 ( /*MOD*/DisResult* dres,
              AMD64Condcode cond, Addr64 d64_false, Addr64 d64_true )
{
   Bool          invert;
   AMD64Condcode condPos;
   vassert(dres->whatNext    == Dis_Continue);
   vassert(dres->len         == 0);
   vassert(dres->continueAt  == 0);
   vassert(dres->jk_StopHere == Ijk_INVALID);
   dres->whatNext    = Dis_StopHere;
   dres->jk_StopHere = Ijk_Boring;
   /* Evaluate only positive-form conditions; for a negated form,
      swap the roles of the exit and fall-through targets instead. */
   condPos = positiveIse_AMD64Condcode ( cond, &invert );
   if (invert) {
      /* Side-exit on condPos goes to the FALSE target ... */
      stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
                         Ijk_Boring,
                         IRConst_U64(d64_false),
                         OFFB_RIP ) );
      /* ... and the fall-through is the TRUE target. */
      stmt( IRStmt_Put( OFFB_RIP, mkU64(d64_true) ) );
   } else {
      /* Side-exit on condPos goes to the TRUE target ... */
      stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
                         Ijk_Boring,
                         IRConst_U64(d64_true),
                         OFFB_RIP ) );
      /* ... and the fall-through is the FALSE target. */
      stmt( IRStmt_Put( OFFB_RIP, mkU64(d64_false) ) );
   }
}
   2335 
   2336 /* Let new_rsp be the %rsp value after a call/return.  Let nia be the
   2337    guest address of the next instruction to be executed.
   2338 
   2339    This function generates an AbiHint to say that -128(%rsp)
   2340    .. -1(%rsp) should now be regarded as uninitialised.
   2341 */
   2342 static
   2343 void make_redzone_AbiHint ( const VexAbiInfo* vbi,
   2344                             IRTemp new_rsp, IRTemp nia, const HChar* who )
   2345 {
   2346    Int szB = vbi->guest_stack_redzone_size;
   2347    vassert(szB >= 0);
   2348 
   2349    /* A bit of a kludge.  Currently the only AbI we've guested AMD64
   2350       for is ELF.  So just check it's the expected 128 value
   2351       (paranoia). */
   2352    vassert(szB == 128);
   2353 
   2354    if (0) vex_printf("AbiHint: %s\n", who);
   2355    vassert(typeOfIRTemp(irsb->tyenv, new_rsp) == Ity_I64);
   2356    vassert(typeOfIRTemp(irsb->tyenv, nia) == Ity_I64);
   2357    if (szB > 0)
   2358       stmt( IRStmt_AbiHint(
   2359                binop(Iop_Sub64, mkexpr(new_rsp), mkU64(szB)),
   2360                szB,
   2361                mkexpr(nia)
   2362             ));
   2363 }
   2364 
   2365 
   2366 /*------------------------------------------------------------*/
   2367 /*--- Disassembling addressing modes                       ---*/
   2368 /*------------------------------------------------------------*/
   2369 
   2370 static
   2371 const HChar* segRegTxt ( Prefix pfx )
   2372 {
   2373    if (pfx & PFX_CS) return "%cs:";
   2374    if (pfx & PFX_DS) return "%ds:";
   2375    if (pfx & PFX_ES) return "%es:";
   2376    if (pfx & PFX_FS) return "%fs:";
   2377    if (pfx & PFX_GS) return "%gs:";
   2378    if (pfx & PFX_SS) return "%ss:";
   2379    return ""; /* no override */
   2380 }
   2381 
   2382 
/* 'virtual' is an IRExpr* holding a virtual address.  Convert it to a
   linear address by adding any required segment override as indicated
   by sorb, and also dealing with any address size override
   present. */
static
IRExpr* handleAddrOverrides ( const VexAbiInfo* vbi,
                              Prefix pfx, IRExpr* virtual )
{
   /* --- address size override --- */
   /* 0x67 prefix: truncate the address to 32 bits, then zero-extend
      back to 64. */
   if (haveASO(pfx))
      virtual = unop(Iop_32Uto64, unop(Iop_64to32, virtual));

   /* Note that the below are hacks that rely on the assumption
      that %fs or %gs are constant.
      Typically, %fs is always 0x63 on linux (in the main thread, it
      stays at value 0), %gs always 0x60 on Darwin, ... */
   /* --- segment overrides --- */
   if (pfx & PFX_FS) {
      if (vbi->guest_amd64_assume_fs_is_const) {
         /* return virtual + guest_FS_CONST. */
         virtual = binop(Iop_Add64, virtual,
                                    IRExpr_Get(OFFB_FS_CONST, Ity_I64));
      } else {
         unimplemented("amd64 %fs segment override");
      }
   }

   if (pfx & PFX_GS) {
      if (vbi->guest_amd64_assume_gs_is_const) {
         /* return virtual + guest_GS_CONST. */
         virtual = binop(Iop_Add64, virtual,
                                    IRExpr_Get(OFFB_GS_CONST, Ity_I64));
      } else {
         unimplemented("amd64 %gs segment override");
      }
   }

   /* cs, ds, es and ss are simply ignored in 64-bit mode. */

   return virtual;
}
   2424 
   2425 //.. {
   2426 //..    Int    sreg;
   2427 //..    IRType hWordTy;
   2428 //..    IRTemp ldt_ptr, gdt_ptr, seg_selector, r64;
   2429 //..
   2430 //..    if (sorb == 0)
   2431 //..       /* the common case - no override */
   2432 //..       return virtual;
   2433 //..
   2434 //..    switch (sorb) {
   2435 //..       case 0x3E: sreg = R_DS; break;
   2436 //..       case 0x26: sreg = R_ES; break;
   2437 //..       case 0x64: sreg = R_FS; break;
   2438 //..       case 0x65: sreg = R_GS; break;
   2439 //..       default: vpanic("handleAddrOverrides(x86,guest)");
   2440 //..    }
   2441 //..
   2442 //..    hWordTy = sizeof(HWord)==4 ? Ity_I32 : Ity_I64;
   2443 //..
   2444 //..    seg_selector = newTemp(Ity_I32);
   2445 //..    ldt_ptr      = newTemp(hWordTy);
   2446 //..    gdt_ptr      = newTemp(hWordTy);
   2447 //..    r64          = newTemp(Ity_I64);
   2448 //..
   2449 //..    assign( seg_selector, unop(Iop_16Uto32, getSReg(sreg)) );
   2450 //..    assign( ldt_ptr, IRExpr_Get( OFFB_LDT, hWordTy ));
   2451 //..    assign( gdt_ptr, IRExpr_Get( OFFB_GDT, hWordTy ));
   2452 //..
   2453 //..    /*
   2454 //..    Call this to do the translation and limit checks:
   2455 //..    ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
   2456 //..                                  UInt seg_selector, UInt virtual_addr )
   2457 //..    */
   2458 //..    assign(
   2459 //..       r64,
   2460 //..       mkIRExprCCall(
   2461 //..          Ity_I64,
   2462 //..          0/*regparms*/,
   2463 //..          "x86g_use_seg_selector",
   2464 //..          &x86g_use_seg_selector,
   2465 //..          mkIRExprVec_4( mkexpr(ldt_ptr), mkexpr(gdt_ptr),
   2466 //..                         mkexpr(seg_selector), virtual)
   2467 //..       )
   2468 //..    );
   2469 //..
   2470 //..    /* If the high 32 of the result are non-zero, there was a
   2471 //..       failure in address translation.  In which case, make a
   2472 //..       quick exit.
   2473 //..    */
   2474 //..    stmt(
   2475 //..       IRStmt_Exit(
   2476 //..          binop(Iop_CmpNE32, unop(Iop_64HIto32, mkexpr(r64)), mkU32(0)),
   2477 //..          Ijk_MapFail,
   2478 //..          IRConst_U32( guest_eip_curr_instr )
   2479 //..       )
   2480 //..    );
   2481 //..
   2482 //..    /* otherwise, here's the translated result. */
   2483 //..    return unop(Iop_64to32, mkexpr(r64));
   2484 //.. }
   2485 
   2486 
   2487 /* Generate IR to calculate an address indicated by a ModRM and
   2488    following SIB bytes.  The expression, and the number of bytes in
   2489    the address mode, are returned (the latter in *len).  Note that
   2490    this fn should not be called if the R/M part of the address denotes
   2491    a register instead of memory.  If print_codegen is true, text of
   2492    the addressing mode is placed in buf.
   2493 
   2494    The computed address is stored in a new tempreg, and the
   2495    identity of the tempreg is returned.
   2496 
   2497    extra_bytes holds the number of bytes after the amode, as supplied
   2498    by the caller.  This is needed to make sense of %rip-relative
   2499    addresses.  Note that the value that *len is set to is only the
   2500    length of the amode itself and does not include the value supplied
   2501    in extra_bytes.
   2502  */
   2503 
   2504 static IRTemp disAMode_copy2tmp ( IRExpr* addr64 )
   2505 {
   2506    IRTemp tmp = newTemp(Ity_I64);
   2507    assign( tmp, addr64 );
   2508    return tmp;
   2509 }
   2510 
static
IRTemp disAMode ( /*OUT*/Int* len,
                  const VexAbiInfo* vbi, Prefix pfx, Long delta,
                  /*OUT*/HChar* buf, Int extra_bytes )
{
   /* Decode the ModRM (and possibly SIB) bytes at 'delta' into an IR
      address computation.  *len gets the byte length of the amode
      itself; 'buf' gets its disassembly text.  See the big comment
      above for the meaning of extra_bytes (%rip-relative modes). */
   UChar mod_reg_rm = getUChar(delta);
   delta++;

   buf[0] = (UChar)0;
   vassert(extra_bytes >= 0 && extra_bytes < 10);

   /* squeeze out the reg field from mod_reg_rm, since a 256-entry
      jump table seems a bit excessive.
   */
   /* XXRRRYYY -> 000XXYYY: the two mod bits land next to the three
      r/m bits, giving the 32 case values used below. */
   mod_reg_rm &= 0xC7;                         /* is now XX000YYY */
   mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
                                               /* is now XX0XXYYY */
   mod_reg_rm &= 0x1F;                         /* is now 000XXYYY */
   switch (mod_reg_rm) {

      /* REX.B==0: (%rax) .. (%rdi), not including (%rsp) or (%rbp).
         REX.B==1: (%r8)  .. (%r15), not including (%r12) or (%r13).
      */
      case 0x00: case 0x01: case 0x02: case 0x03:
      /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
         { UChar rm = toUChar(mod_reg_rm & 7);
           DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,rm));
           *len = 1;
           return disAMode_copy2tmp(
                  handleAddrOverrides(vbi, pfx, getIRegRexB(8,pfx,rm)));
         }

      /* REX.B==0: d8(%rax) ... d8(%rdi), not including d8(%rsp)
         REX.B==1: d8(%r8)  ... d8(%r15), not including d8(%r12)
      */
      case 0x08: case 0x09: case 0x0A: case 0x0B:
      /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
         { UChar rm = toUChar(mod_reg_rm & 7);
           Long d   = getSDisp8(delta);
           /* Suppress a "0(...)" displacement in the disassembly text;
              the IR below still adds d (== 0) unconditionally. */
           if (d == 0) {
              DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,rm));
           } else {
              DIS(buf, "%s%lld(%s)", segRegTxt(pfx), d, nameIRegRexB(8,pfx,rm));
           }
           *len = 2;
           return disAMode_copy2tmp(
                  handleAddrOverrides(vbi, pfx,
                     binop(Iop_Add64,getIRegRexB(8,pfx,rm),mkU64(d))));
         }

      /* REX.B==0: d32(%rax) ... d32(%rdi), not including d32(%rsp)
         REX.B==1: d32(%r8)  ... d32(%r15), not including d32(%r12)
      */
      case 0x10: case 0x11: case 0x12: case 0x13:
      /* ! 14 */ case 0x15: case 0x16: case 0x17:
         { UChar rm = toUChar(mod_reg_rm & 7);
           Long  d  = getSDisp32(delta);
           DIS(buf, "%s%lld(%s)", segRegTxt(pfx), d, nameIRegRexB(8,pfx,rm));
           *len = 5;
           return disAMode_copy2tmp(
                  handleAddrOverrides(vbi, pfx,
                     binop(Iop_Add64,getIRegRexB(8,pfx,rm),mkU64(d))));
         }

      /* REX.B==0: a register, %rax .. %rdi.  This shouldn't happen. */
      /* REX.B==1: a register, %r8  .. %r16.  This shouldn't happen. */
      case 0x18: case 0x19: case 0x1A: case 0x1B:
      case 0x1C: case 0x1D: case 0x1E: case 0x1F:
         vpanic("disAMode(amd64): not an addr!");

      /* RIP + disp32.  This assumes that guest_RIP_curr_instr is set
         correctly at the start of handling each instruction. */
      case 0x05:
         { Long d = getSDisp32(delta);
           *len = 5;
           DIS(buf, "%s%lld(%%rip)", segRegTxt(pfx), d);
           /* We need to know the next instruction's start address.
              Try and figure out what it is, record the guess, and ask
              the top-level driver logic (bbToIR_AMD64) to check we
              guessed right, after the instruction is completely
              decoded. */
           /* delta+4 skips the disp32 just read; extra_bytes covers
              any immediate that follows the amode (caller-supplied). */
           guest_RIP_next_mustcheck = True;
           guest_RIP_next_assumed = guest_RIP_bbstart
                                    + delta+4 + extra_bytes;
           return disAMode_copy2tmp(
                     handleAddrOverrides(vbi, pfx,
                        binop(Iop_Add64, mkU64(guest_RIP_next_assumed),
                                         mkU64(d))));
         }

      case 0x04: {
         /* SIB, with no displacement.  Special cases:
            -- %rsp cannot act as an index value.
               If index_r indicates %rsp, zero is used for the index.
            -- when mod is zero and base indicates RBP or R13, base is
               instead a 32-bit sign-extended literal.
            It's all madness, I tell you.  Extract %index, %base and
            scale from the SIB byte.  The value denoted is then:
               | %index == %RSP && (%base == %RBP || %base == %R13)
               = d32 following SIB byte
               | %index == %RSP && !(%base == %RBP || %base == %R13)
               = %base
               | %index != %RSP && (%base == %RBP || %base == %R13)
               = d32 following SIB byte + (%index << scale)
               | %index != %RSP && !(%base == %RBP || %base == %R13)
               = %base + (%index << scale)
         */
         UChar sib     = getUChar(delta);
         UChar scale   = toUChar((sib >> 6) & 3);
         UChar index_r = toUChar((sib >> 3) & 7);
         UChar base_r  = toUChar(sib & 7);
         /* correct since #(R13) == 8 + #(RBP) */
         Bool  base_is_BPor13 = toBool(base_r == R_RBP);
         /* %rsp can never be an index, but %r12 (same low 3 bits, REX.X
            set) can -- hence the getRexX check. */
         Bool  index_is_SP    = toBool(index_r == R_RSP && 0==getRexX(pfx));
         delta++;

         /* The four cases enumerated in the comment above, in order. */
         if ((!index_is_SP) && (!base_is_BPor13)) {
            if (scale == 0) {
               DIS(buf, "%s(%s,%s)", segRegTxt(pfx),
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r));
            } else {
               DIS(buf, "%s(%s,%s,%d)", segRegTxt(pfx),
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r), 1<<scale);
            }
            *len = 2;
            return
               disAMode_copy2tmp(
               handleAddrOverrides(vbi, pfx,
                  binop(Iop_Add64,
                        getIRegRexB(8,pfx,base_r),
                        binop(Iop_Shl64, getIReg64rexX(pfx,index_r),
                              mkU8(scale)))));
         }

         if ((!index_is_SP) && base_is_BPor13) {
            Long d = getSDisp32(delta);
            DIS(buf, "%s%lld(,%s,%d)", segRegTxt(pfx), d,
                      nameIReg64rexX(pfx,index_r), 1<<scale);
            *len = 6;
            return
               disAMode_copy2tmp(
               handleAddrOverrides(vbi, pfx,
                  binop(Iop_Add64,
                        binop(Iop_Shl64, getIReg64rexX(pfx,index_r),
                                         mkU8(scale)),
                        mkU64(d))));
         }

         if (index_is_SP && (!base_is_BPor13)) {
            DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,base_r));
            *len = 2;
            return disAMode_copy2tmp(
                   handleAddrOverrides(vbi, pfx, getIRegRexB(8,pfx,base_r)));
         }

         if (index_is_SP && base_is_BPor13) {
            Long d = getSDisp32(delta);
            DIS(buf, "%s%lld", segRegTxt(pfx), d);
            *len = 6;
            return disAMode_copy2tmp(
                   handleAddrOverrides(vbi, pfx, mkU64(d)));
         }

         vassert(0);
      }

      /* SIB, with 8-bit displacement.  Special cases:
         -- %esp cannot act as an index value.
            If index_r indicates %esp, zero is used for the index.
         Denoted value is:
            | %index == %ESP
            = d8 + %base
            | %index != %ESP
            = d8 + %base + (%index << scale)
      */
      case 0x0C: {
         UChar sib     = getUChar(delta);
         UChar scale   = toUChar((sib >> 6) & 3);
         UChar index_r = toUChar((sib >> 3) & 7);
         UChar base_r  = toUChar(sib & 7);
         /* Displacement follows the SIB byte, hence delta+1. */
         Long d        = getSDisp8(delta+1);

         if (index_r == R_RSP && 0==getRexX(pfx)) {
            DIS(buf, "%s%lld(%s)", segRegTxt(pfx),
                                   d, nameIRegRexB(8,pfx,base_r));
            *len = 3;
            return disAMode_copy2tmp(
                   handleAddrOverrides(vbi, pfx,
                      binop(Iop_Add64, getIRegRexB(8,pfx,base_r), mkU64(d)) ));
         } else {
            if (scale == 0) {
               DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r));
            } else {
               DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r), 1<<scale);
            }
            *len = 3;
            return
                disAMode_copy2tmp(
                handleAddrOverrides(vbi, pfx,
                  binop(Iop_Add64,
                        binop(Iop_Add64,
                              getIRegRexB(8,pfx,base_r),
                              binop(Iop_Shl64,
                                    getIReg64rexX(pfx,index_r), mkU8(scale))),
                        mkU64(d))));
         }
         vassert(0); /*NOTREACHED*/
      }

      /* SIB, with 32-bit displacement.  Special cases:
         -- %rsp cannot act as an index value.
            If index_r indicates %rsp, zero is used for the index.
         Denoted value is:
            | %index == %RSP
            = d32 + %base
            | %index != %RSP
            = d32 + %base + (%index << scale)
      */
      case 0x14: {
         UChar sib     = getUChar(delta);
         UChar scale   = toUChar((sib >> 6) & 3);
         UChar index_r = toUChar((sib >> 3) & 7);
         UChar base_r  = toUChar(sib & 7);
         /* Displacement follows the SIB byte, hence delta+1. */
         Long d        = getSDisp32(delta+1);

         if (index_r == R_RSP && 0==getRexX(pfx)) {
            DIS(buf, "%s%lld(%s)", segRegTxt(pfx),
                                   d, nameIRegRexB(8,pfx,base_r));
            *len = 6;
            return disAMode_copy2tmp(
                   handleAddrOverrides(vbi, pfx,
                      binop(Iop_Add64, getIRegRexB(8,pfx,base_r), mkU64(d)) ));
         } else {
            if (scale == 0) {
               DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r));
            } else {
               DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r), 1<<scale);
            }
            *len = 6;
            return
                disAMode_copy2tmp(
                handleAddrOverrides(vbi, pfx,
                  binop(Iop_Add64,
                        binop(Iop_Add64,
                              getIRegRexB(8,pfx,base_r),
                              binop(Iop_Shl64,
                                    getIReg64rexX(pfx,index_r), mkU8(scale))),
                        mkU64(d))));
         }
         vassert(0); /*NOTREACHED*/
      }

      default:
         vpanic("disAMode(amd64)");
         return 0; /*notreached*/
   }
}
   2778 
   2779 
   2780 /* Similarly for VSIB addressing.  This returns just the addend,
   2781    and fills in *rI and *vscale with the register number of the vector
   2782    index and its multiplicand.  */
static
IRTemp disAVSIBMode ( /*OUT*/Int* len,
                      const VexAbiInfo* vbi, Prefix pfx, Long delta,
                      /*OUT*/HChar* buf, /*OUT*/UInt* rI,
                      IRType ty, /*OUT*/Int* vscale )
{
   /* Decode a VSIB amode (used by gather insns).  Returns a temp
      holding only the base+displacement addend; the vector index
      register number goes to *rI and its multiplier to *vscale.
      Returns IRTemp_INVALID if the encoding is not a valid VSIB
      form (no SIB byte, or a register operand). */
   UChar mod_reg_rm = getUChar(delta);
   const HChar *vindex;

   *len = 0;
   *rI = 0;
   *vscale = 0;
   buf[0] = (UChar)0;
   /* VSIB requires a memory operand with an SIB byte (r/m == 4). */
   if ((mod_reg_rm & 7) != 4 || epartIsReg(mod_reg_rm))
      return IRTemp_INVALID;

   UChar sib     = getUChar(delta+1);
   UChar scale   = toUChar((sib >> 6) & 3);
   UChar index_r = toUChar((sib >> 3) & 7);
   UChar base_r  = toUChar(sib & 7);
   Long  d       = 0;
   /* correct since #(R13) == 8 + #(RBP) */
   Bool  base_is_BPor13 = toBool(base_r == R_RBP);
   delta += 2;
   *len = 2;

   /* The vector index takes REX.X as its 4th bit. */
   *rI = index_r | (getRexX(pfx) << 3);
   if (ty == Ity_V128)
      vindex = nameXMMReg(*rI);
   else
      vindex = nameYMMReg(*rI);
   *vscale = 1<<scale;

   /* mod field: 0 = no disp (unless base is BP/R13, then d32 only),
      1 = d8, 2 = d32. */
   switch (mod_reg_rm >> 6) {
   case 0:
      if (base_is_BPor13) {
         /* No base register at all: address is just d32. */
         d = getSDisp32(delta);
         *len += 4;
         if (scale == 0) {
            DIS(buf, "%s%lld(,%s)", segRegTxt(pfx), d, vindex);
         } else {
            DIS(buf, "%s%lld(,%s,%d)", segRegTxt(pfx), d, vindex, 1<<scale);
         }
         return disAMode_copy2tmp( mkU64(d) );
      } else {
         if (scale == 0) {
            DIS(buf, "%s(%s,%s)", segRegTxt(pfx),
                     nameIRegRexB(8,pfx,base_r), vindex);
         } else {
            DIS(buf, "%s(%s,%s,%d)", segRegTxt(pfx),
                     nameIRegRexB(8,pfx,base_r), vindex, 1<<scale);
         }
      }
      break;
   case 1:
      d = getSDisp8(delta);
      *len += 1;
      /* Share the disassembly-text formatting with the d32 case. */
      goto have_disp;
   case 2:
      d = getSDisp32(delta);
      *len += 4;
   have_disp:
      if (scale == 0) {
         DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
                  nameIRegRexB(8,pfx,base_r), vindex);
      } else {
         DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
                  nameIRegRexB(8,pfx,base_r), vindex, 1<<scale);
      }
      break;
   }

   /* d == 0 also covers the mod==0, base != BP/R13 path; skipping the
      add of a zero displacement is just an IR simplification. */
   if (!d)
      return disAMode_copy2tmp( getIRegRexB(8,pfx,base_r) );
   return disAMode_copy2tmp( binop(Iop_Add64, getIRegRexB(8,pfx,base_r),
                                   mkU64(d)) );
}
   2860 
   2861 
   2862 /* Figure out the number of (insn-stream) bytes constituting the amode
   2863    beginning at delta.  Is useful for getting hold of literals beyond
   2864    the end of the amode before it has been disassembled.  */
   2865 
   2866 static UInt lengthAMode ( Prefix pfx, Long delta )
   2867 {
   2868    UChar mod_reg_rm = getUChar(delta);
   2869    delta++;
   2870 
   2871    /* squeeze out the reg field from mod_reg_rm, since a 256-entry
   2872       jump table seems a bit excessive.
   2873    */
   2874    mod_reg_rm &= 0xC7;                         /* is now XX000YYY */
   2875    mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
   2876                                                /* is now XX0XXYYY */
   2877    mod_reg_rm &= 0x1F;                         /* is now 000XXYYY */
   2878    switch (mod_reg_rm) {
   2879 
   2880       /* REX.B==0: (%rax) .. (%rdi), not including (%rsp) or (%rbp).
   2881          REX.B==1: (%r8)  .. (%r15), not including (%r12) or (%r13).
   2882       */
   2883       case 0x00: case 0x01: case 0x02: case 0x03:
   2884       /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
   2885          return 1;
   2886 
   2887       /* REX.B==0: d8(%rax) ... d8(%rdi), not including d8(%rsp)
   2888          REX.B==1: d8(%r8)  ... d8(%r15), not including d8(%r12)
   2889       */
   2890       case 0x08: case 0x09: case 0x0A: case 0x0B:
   2891       /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
   2892          return 2;
   2893 
   2894       /* REX.B==0: d32(%rax) ... d32(%rdi), not including d32(%rsp)
   2895          REX.B==1: d32(%r8)  ... d32(%r15), not including d32(%r12)
   2896       */
   2897       case 0x10: case 0x11: case 0x12: case 0x13:
   2898       /* ! 14 */ case 0x15: case 0x16: case 0x17:
   2899          return 5;
   2900 
   2901       /* REX.B==0: a register, %rax .. %rdi.  This shouldn't happen. */
   2902       /* REX.B==1: a register, %r8  .. %r16.  This shouldn't happen. */
   2903       /* Not an address, but still handled. */
   2904       case 0x18: case 0x19: case 0x1A: case 0x1B:
   2905       case 0x1C: case 0x1D: case 0x1E: case 0x1F:
   2906          return 1;
   2907 
   2908       /* RIP + disp32. */
   2909       case 0x05:
   2910          return 5;
   2911 
   2912       case 0x04: {
   2913          /* SIB, with no displacement. */
   2914          UChar sib     = getUChar(delta);
   2915          UChar base_r  = toUChar(sib & 7);
   2916          /* correct since #(R13) == 8 + #(RBP) */
   2917          Bool  base_is_BPor13 = toBool(base_r == R_RBP);
   2918 
   2919          if (base_is_BPor13) {
   2920             return 6;
   2921          } else {
   2922             return 2;
   2923          }
   2924       }
   2925 
   2926       /* SIB, with 8-bit displacement. */
   2927       case 0x0C:
   2928          return 3;
   2929 
   2930       /* SIB, with 32-bit displacement. */
   2931       case 0x14:
   2932          return 6;
   2933 
   2934       default:
   2935          vpanic("lengthAMode(amd64)");
   2936          return 0; /*notreached*/
   2937    }
   2938 }
   2939 
   2940 
   2941 /*------------------------------------------------------------*/
   2942 /*--- Disassembling common idioms                          ---*/
   2943 /*------------------------------------------------------------*/
   2944 
/* Says whether, and how, a binary integer op folds in an existing
   flag: WithFlagNone = plain op; WithFlagCarry = add/sub with carry
   (ADC/SBB); WithFlagCarryX / WithFlagOverX = the ADCX/ADOX forms
   which read and write only C resp. O.  First value is 2, presumably
   to keep the members distinct from boolean 0/1 -- TODO confirm. */
typedef
  enum { WithFlagNone=2, WithFlagCarry, WithFlagCarryX, WithFlagOverX }
  WithFlag;
   2948 
   2949 /* Handle binary integer instructions of the form
   2950       op E, G  meaning
   2951       op reg-or-mem, reg
   2952    Is passed the a ptr to the modRM byte, the actual operation, and the
   2953    data size.  Returns the address advanced completely over this
   2954    instruction.
   2955 
   2956    E(src) is reg-or-mem
   2957    G(dst) is reg.
   2958 
   2959    If E is reg, -->    GET %G,  tmp
   2960                        OP %E,   tmp
   2961                        PUT tmp, %G
   2962 
   2963    If E is mem and OP is not reversible,
   2964                 -->    (getAddr E) -> tmpa
   2965                        LD (tmpa), tmpa
   2966                        GET %G, tmp2
   2967                        OP tmpa, tmp2
   2968                        PUT tmp2, %G
   2969 
   2970    If E is mem and OP is reversible
   2971                 -->    (getAddr E) -> tmpa
   2972                        LD (tmpa), tmpa
   2973                        OP %G, tmpa
   2974                        PUT tmpa, %G
   2975 */
   2976 static
   2977 ULong dis_op2_E_G ( const VexAbiInfo* vbi,
   2978                     Prefix      pfx,
   2979                     IROp        op8,
   2980                     WithFlag    flag,
   2981                     Bool        keep,
   2982                     Int         size,
   2983                     Long        delta0,
   2984                     const HChar* t_amd64opc )
   2985 {
   2986    HChar   dis_buf[50];
   2987    Int     len;
   2988    IRType  ty   = szToITy(size);
   2989    IRTemp  dst1 = newTemp(ty);
   2990    IRTemp  src  = newTemp(ty);
   2991    IRTemp  dst0 = newTemp(ty);
   2992    UChar   rm   = getUChar(delta0);
   2993    IRTemp  addr = IRTemp_INVALID;
   2994 
   2995    /* Stay sane -- check for valid (op8, flag, keep) combinations. */
   2996    switch (op8) {
   2997       case Iop_Add8:
   2998          switch (flag) {
   2999             case WithFlagNone: case WithFlagCarry:
   3000             case WithFlagCarryX: case WithFlagOverX:
   3001                vassert(keep);
   3002                break;
   3003             default:
   3004                vassert(0);
   3005          }
   3006          break;
   3007       case Iop_Sub8:
   3008          vassert(flag == WithFlagNone || flag == WithFlagCarry);
   3009          if (flag == WithFlagCarry) vassert(keep);
   3010          break;
   3011       case Iop_And8:
   3012          vassert(flag == WithFlagNone);
   3013          break;
   3014       case Iop_Or8: case Iop_Xor8:
   3015          vassert(flag == WithFlagNone);
   3016          vassert(keep);
   3017          break;
   3018       default:
   3019          vassert(0);
   3020    }
   3021 
   3022    if (epartIsReg(rm)) {
   3023       /* Specially handle XOR reg,reg, because that doesn't really
   3024          depend on reg, and doing the obvious thing potentially
   3025          generates a spurious value check failure due to the bogus
   3026          dependency.  Ditto SUB/SBB reg,reg. */
   3027       if ((op8 == Iop_Xor8 || ((op8 == Iop_Sub8) && keep))
   3028           && offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
   3029          putIRegG(size,pfx,rm, mkU(ty,0));
   3030       }
   3031 
   3032       assign( dst0, getIRegG(size,pfx,rm) );
   3033       assign( src,  getIRegE(size,pfx,rm) );
   3034 
   3035       if (op8 == Iop_Add8 && flag == WithFlagCarry) {
   3036          helper_ADC( size, dst1, dst0, src,
   3037                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   3038          putIRegG(size, pfx, rm, mkexpr(dst1));
   3039       } else
   3040       if (op8 == Iop_Sub8 && flag == WithFlagCarry) {
   3041          helper_SBB( size, dst1, dst0, src,
   3042                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   3043          putIRegG(size, pfx, rm, mkexpr(dst1));
   3044       } else
   3045       if (op8 == Iop_Add8 && flag == WithFlagCarryX) {
   3046          helper_ADCX_ADOX( True/*isADCX*/, size, dst1, dst0, src );
   3047          putIRegG(size, pfx, rm, mkexpr(dst1));
   3048       } else
   3049       if (op8 == Iop_Add8 && flag == WithFlagOverX) {
   3050          helper_ADCX_ADOX( False/*!isADCX*/, size, dst1, dst0, src );
   3051          putIRegG(size, pfx, rm, mkexpr(dst1));
   3052       } else {
   3053          assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   3054          if (isAddSub(op8))
   3055             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   3056          else
   3057             setFlags_DEP1(op8, dst1, ty);
   3058          if (keep)
   3059             putIRegG(size, pfx, rm, mkexpr(dst1));
   3060       }
   3061 
   3062       DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
   3063                           nameIRegE(size,pfx,rm),
   3064                           nameIRegG(size,pfx,rm));
   3065       return 1+delta0;
   3066    } else {
   3067       /* E refers to memory */
   3068       addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   3069       assign( dst0, getIRegG(size,pfx,rm) );
   3070       assign( src,  loadLE(szToITy(size), mkexpr(addr)) );
   3071 
   3072       if (op8 == Iop_Add8 && flag == WithFlagCarry) {
   3073          helper_ADC( size, dst1, dst0, src,
   3074                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   3075          putIRegG(size, pfx, rm, mkexpr(dst1));
   3076       } else
   3077       if (op8 == Iop_Sub8 && flag == WithFlagCarry) {
   3078          helper_SBB( size, dst1, dst0, src,
   3079                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   3080          putIRegG(size, pfx, rm, mkexpr(dst1));
   3081       } else
   3082       if (op8 == Iop_Add8 && flag == WithFlagCarryX) {
   3083          /* normal store */
   3084          helper_ADCX_ADOX( True/*isADCX*/, size, dst1, dst0, src );
   3085       } else
   3086       if (op8 == Iop_Add8 && flag == WithFlagOverX) {
   3087          /* normal store */
   3088          helper_ADCX_ADOX( False/*!isADCX*/, size, dst1, dst0, src );
   3089       } else {
   3090          assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   3091          if (isAddSub(op8))
   3092             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   3093          else
   3094             setFlags_DEP1(op8, dst1, ty);
   3095          if (keep)
   3096             putIRegG(size, pfx, rm, mkexpr(dst1));
   3097       }
   3098 
   3099       DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
   3100                           dis_buf, nameIRegG(size, pfx, rm));
   3101       return len+delta0;
   3102    }
   3103 }
   3104 
   3105 
   3106 
   3107 /* Handle binary integer instructions of the form
   3108       op G, E  meaning
   3109       op reg, reg-or-mem
   3110    Is passed the a ptr to the modRM byte, the actual operation, and the
   3111    data size.  Returns the address advanced completely over this
   3112    instruction.
   3113 
   3114    G(src) is reg.
   3115    E(dst) is reg-or-mem
   3116 
   3117    If E is reg, -->    GET %E,  tmp
   3118                        OP %G,   tmp
   3119                        PUT tmp, %E
   3120 
   3121    If E is mem, -->    (getAddr E) -> tmpa
   3122                        LD (tmpa), tmpv
   3123                        OP %G, tmpv
   3124                        ST tmpv, (tmpa)
   3125 */
/* op8 names the operation at its 8-bit width; mkSizedOp scales it up
   to 'size'.  'keep'==False means the result is computed only for its
   flags and is not written back to E.  'flag' selects the plain op vs
   the with-carry/borrow variants (ADC/SBB), which are routed through
   helper_ADC/helper_SBB since those also consume the old carry flag
   and build the flags thunk themselves. */
static
ULong dis_op2_G_E ( const VexAbiInfo* vbi,
                    Prefix      pfx,
                    IROp        op8,
                    WithFlag    flag,
                    Bool        keep,
                    Int         size,
                    Long        delta0,
                    const HChar* t_amd64opc )
{
   HChar   dis_buf[50];
   Int     len;
   IRType  ty   = szToITy(size);
   IRTemp  dst1 = newTemp(ty);          /* result of the operation */
   IRTemp  src  = newTemp(ty);          /* value of G (the source) */
   IRTemp  dst0 = newTemp(ty);          /* original value of E (the dest) */
   UChar   rm   = getUChar(delta0);
   IRTemp  addr = IRTemp_INVALID;

   /* Stay sane -- check for valid (op8, flag, keep) combinations. */
   switch (op8) {
      case Iop_Add8:
         vassert(flag == WithFlagNone || flag == WithFlagCarry);
         vassert(keep);
         break;
      case Iop_Sub8:
         vassert(flag == WithFlagNone || flag == WithFlagCarry);
         if (flag == WithFlagCarry) vassert(keep);
         break;
      case Iop_And8: case Iop_Or8: case Iop_Xor8:
         vassert(flag == WithFlagNone);
         vassert(keep);
         break;
      default:
         vassert(0);
   }

   /* flag != WithFlagNone is only allowed for Add and Sub and indicates the
      intended operation is add-with-carry or subtract-with-borrow. */

   if (epartIsReg(rm)) {
      /* Specially handle XOR reg,reg, because that doesn't really
         depend on reg, and doing the obvious thing potentially
         generates a spurious value check failure due to the bogus
         dependency.  Ditto SUB/SBB reg,reg. */
      if ((op8 == Iop_Xor8 || ((op8 == Iop_Sub8) && keep))
          && offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
         putIRegE(size,pfx,rm, mkU(ty,0));
      }

      assign(dst0, getIRegE(size,pfx,rm));
      assign(src,  getIRegG(size,pfx,rm));

      if (op8 == Iop_Add8 && flag == WithFlagCarry) {
         /* ADC: the helper computes result and flags thunk together. */
         helper_ADC( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegE(size, pfx, rm, mkexpr(dst1));
      } else
      if (op8 == Iop_Sub8 && flag == WithFlagCarry) {
         /* SBB: likewise. */
         helper_SBB( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegE(size, pfx, rm, mkexpr(dst1));
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
         if (keep)
            putIRegE(size, pfx, rm, mkexpr(dst1));
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          nameIRegG(size,pfx,rm),
                          nameIRegE(size,pfx,rm));
      return 1+delta0;   /* just the modrm byte was consumed */
   }

   /* E refers to memory */
   {
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign(dst0, loadLE(ty,mkexpr(addr)));
      assign(src,  getIRegG(size,pfx,rm));

      if (op8 == Iop_Add8 && flag == WithFlagCarry) {
         if (haveLOCK(pfx)) {
            /* cas-style store */
            helper_ADC( size, dst1, dst0, src,
                        /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_ADC( size, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else
      if (op8 == Iop_Sub8 && flag == WithFlagCarry) {
         if (haveLOCK(pfx)) {
            /* cas-style store */
            helper_SBB( size, dst1, dst0, src,
                        /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_SBB( size, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (keep) {
            if (haveLOCK(pfx)) {
               /* LOCK prefix present: write back via compare-and-swap
                  so the read-modify-write appears atomic. */
               if (0) vex_printf("locked case\n" );
               casLE( mkexpr(addr),
                      mkexpr(dst0)/*expval*/,
                      mkexpr(dst1)/*newval*/, guest_RIP_curr_instr );
            } else {
               if (0) vex_printf("nonlocked case\n");
               storeLE(mkexpr(addr), mkexpr(dst1));
            }
         }
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          nameIRegG(size,pfx,rm), dis_buf);
      return len+delta0;
   }
}
   3255 
   3256 
   3257 /* Handle move instructions of the form
   3258       mov E, G  meaning
   3259       mov reg-or-mem, reg
   3260    Is passed a ptr to the modRM byte, and the data size.  Returns
   3261    the address advanced completely over this instruction.
   3262 
   3263    E(src) is reg-or-mem
   3264    G(dst) is reg.
   3265 
   3266    If E is reg, -->    GET %E,  tmpv
   3267                        PUT tmpv, %G
   3268 
   3269    If E is mem  -->    (getAddr E) -> tmpa
   3270                        LD (tmpa), tmpb
   3271                        PUT tmpb, %G
   3272 */
   3273 static
   3274 ULong dis_mov_E_G ( const VexAbiInfo* vbi,
   3275                     Prefix      pfx,
   3276                     Int         size,
   3277                     Long        delta0 )
   3278 {
   3279    Int len;
   3280    UChar rm = getUChar(delta0);
   3281    HChar dis_buf[50];
   3282 
   3283    if (epartIsReg(rm)) {
   3284       putIRegG(size, pfx, rm, getIRegE(size, pfx, rm));
   3285       DIP("mov%c %s,%s\n", nameISize(size),
   3286                            nameIRegE(size,pfx,rm),
   3287                            nameIRegG(size,pfx,rm));
   3288       return 1+delta0;
   3289    }
   3290 
   3291    /* E refers to memory */
   3292    {
   3293       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   3294       putIRegG(size, pfx, rm, loadLE(szToITy(size), mkexpr(addr)));
   3295       DIP("mov%c %s,%s\n", nameISize(size),
   3296                            dis_buf,
   3297                            nameIRegG(size,pfx,rm));
   3298       return delta0+len;
   3299    }
   3300 }
   3301 
   3302 
   3303 /* Handle move instructions of the form
   3304       mov G, E  meaning
   3305       mov reg, reg-or-mem
   3306    Is passed a ptr to the modRM byte, and the data size.  Returns
   3307    the address advanced completely over this instruction.
   3308    We have to decide here whether F2 or F3 are acceptable.  F2 never is.
   3309 
   3310    G(src) is reg.
   3311    E(dst) is reg-or-mem
   3312 
   3313    If E is reg, -->    GET %G,  tmp
   3314                        PUT tmp, %E
   3315 
   3316    If E is mem, -->    (getAddr E) -> tmpa
   3317                        GET %G, tmpv
   3318                        ST tmpv, (tmpa)
   3319 */
   3320 static
   3321 ULong dis_mov_G_E ( const VexAbiInfo*  vbi,
   3322                     Prefix       pfx,
   3323                     Int          size,
   3324                     Long         delta0,
   3325                     /*OUT*/Bool* ok )
   3326 {
   3327    Int   len;
   3328    UChar rm = getUChar(delta0);
   3329    HChar dis_buf[50];
   3330 
   3331    *ok = True;
   3332 
   3333    if (epartIsReg(rm)) {
   3334       if (haveF2orF3(pfx)) { *ok = False; return delta0; }
   3335       putIRegE(size, pfx, rm, getIRegG(size, pfx, rm));
   3336       DIP("mov%c %s,%s\n", nameISize(size),
   3337                            nameIRegG(size,pfx,rm),
   3338                            nameIRegE(size,pfx,rm));
   3339       return 1+delta0;
   3340    }
   3341 
   3342    /* E refers to memory */
   3343    {
   3344       if (haveF2(pfx)) { *ok = False; return delta0; }
   3345       /* F3(XRELEASE) is acceptable, though. */
   3346       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   3347       storeLE( mkexpr(addr), getIRegG(size, pfx, rm) );
   3348       DIP("mov%c %s,%s\n", nameISize(size),
   3349                            nameIRegG(size,pfx,rm),
   3350                            dis_buf);
   3351       return len+delta0;
   3352    }
   3353 }
   3354 
   3355 
   3356 /* op $immediate, AL/AX/EAX/RAX. */
   3357 static
   3358 ULong dis_op_imm_A ( Int    size,
   3359                      Bool   carrying,
   3360                      IROp   op8,
   3361                      Bool   keep,
   3362                      Long   delta,
   3363                      const HChar* t_amd64opc )
   3364 {
   3365    Int    size4 = imin(size,4);
   3366    IRType ty    = szToITy(size);
   3367    IRTemp dst0  = newTemp(ty);
   3368    IRTemp src   = newTemp(ty);
   3369    IRTemp dst1  = newTemp(ty);
   3370    Long  lit    = getSDisp(size4,delta);
   3371    assign(dst0, getIRegRAX(size));
   3372    assign(src,  mkU(ty,lit & mkSizeMask(size)));
   3373 
   3374    if (isAddSub(op8) && !carrying) {
   3375       assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   3376       setFlags_DEP1_DEP2(op8, dst0, src, ty);
   3377    }
   3378    else
   3379    if (isLogic(op8)) {
   3380       vassert(!carrying);
   3381       assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   3382       setFlags_DEP1(op8, dst1, ty);
   3383    }
   3384    else
   3385    if (op8 == Iop_Add8 && carrying) {
   3386       helper_ADC( size, dst1, dst0, src,
   3387                   /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   3388    }
   3389    else
   3390    if (op8 == Iop_Sub8 && carrying) {
   3391       helper_SBB( size, dst1, dst0, src,
   3392                   /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   3393    }
   3394    else
   3395       vpanic("dis_op_imm_A(amd64,guest)");
   3396 
   3397    if (keep)
   3398       putIRegRAX(size, mkexpr(dst1));
   3399 
   3400    DIP("%s%c $%lld, %s\n", t_amd64opc, nameISize(size),
   3401                            lit, nameIRegRAX(size));
   3402    return delta+size4;
   3403 }
   3404 
   3405 
   3406 /* Sign- and Zero-extending moves. */
   3407 static
   3408 ULong dis_movx_E_G ( const VexAbiInfo* vbi,
   3409                      Prefix pfx,
   3410                      Long delta, Int szs, Int szd, Bool sign_extend )
   3411 {
   3412    UChar rm = getUChar(delta);
   3413    if (epartIsReg(rm)) {
   3414       putIRegG(szd, pfx, rm,
   3415                     doScalarWidening(
   3416                        szs,szd,sign_extend,
   3417                        getIRegE(szs,pfx,rm)));
   3418       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
   3419                                nameISize(szs),
   3420                                nameISize(szd),
   3421                                nameIRegE(szs,pfx,rm),
   3422                                nameIRegG(szd,pfx,rm));
   3423       return 1+delta;
   3424    }
   3425 
   3426    /* E refers to memory */
   3427    {
   3428       Int    len;
   3429       HChar  dis_buf[50];
   3430       IRTemp addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
   3431       putIRegG(szd, pfx, rm,
   3432                     doScalarWidening(
   3433                        szs,szd,sign_extend,
   3434                        loadLE(szToITy(szs),mkexpr(addr))));
   3435       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
   3436                                nameISize(szs),
   3437                                nameISize(szd),
   3438                                dis_buf,
   3439                                nameIRegG(szd,pfx,rm));
   3440       return len+delta;
   3441    }
   3442 }
   3443 
   3444 
   3445 /* Generate code to divide ArchRegs RDX:RAX / EDX:EAX / DX:AX / AX by
   3446    the 64 / 32 / 16 / 8 bit quantity in the given IRTemp.  */
static
void codegen_div ( Int sz, IRTemp t, Bool signed_divide )
{
   /* special-case the 64-bit case */
   if (sz == 8) {
      IROp   op     = signed_divide ? Iop_DivModS128to64
                                    : Iop_DivModU128to64;
      IRTemp src128 = newTemp(Ity_I128);
      IRTemp dst128 = newTemp(Ity_I128);
      /* 128-bit dividend is RDX:RAX. */
      assign( src128, binop(Iop_64HLto128,
                            getIReg64(R_RDX),
                            getIReg64(R_RAX)) );
      assign( dst128, binop(op, mkexpr(src128), mkexpr(t)) );
      /* DivMod yields quotient in the low half, remainder in the
         high half; x86 DIV/IDIV puts quotient in RAX, remainder
         in RDX. */
      putIReg64( R_RAX, unop(Iop_128to64,mkexpr(dst128)) );
      putIReg64( R_RDX, unop(Iop_128HIto64,mkexpr(dst128)) );
   } else {
      /* 8/16/32-bit cases all go via a single 64-bit DivMod, with
         dividend and divisor widened first as needed. */
      IROp   op    = signed_divide ? Iop_DivModS64to32
                                   : Iop_DivModU64to32;
      IRTemp src64 = newTemp(Ity_I64);
      IRTemp dst64 = newTemp(Ity_I64);
      switch (sz) {
      case 4:
         /* Dividend EDX:EAX fits 64 bits exactly; no widening needed. */
         assign( src64,
                 binop(Iop_32HLto64, getIRegRDX(4), getIRegRAX(4)) );
         assign( dst64,
                 binop(op, mkexpr(src64), mkexpr(t)) );
         putIRegRAX( 4, unop(Iop_64to32,mkexpr(dst64)) );
         putIRegRDX( 4, unop(Iop_64HIto32,mkexpr(dst64)) );
         break;
      case 2: {
         /* Widen the DX:AX dividend (and the divisor) according to the
            division's signedness, then narrow the results back. */
         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
         assign( src64, unop(widen3264,
                             binop(Iop_16HLto32,
                                   getIRegRDX(2),
                                   getIRegRAX(2))) );
         assign( dst64, binop(op, mkexpr(src64), unop(widen1632,mkexpr(t))) );
         putIRegRAX( 2, unop(Iop_32to16,unop(Iop_64to32,mkexpr(dst64))) );
         putIRegRDX( 2, unop(Iop_32to16,unop(Iop_64HIto32,mkexpr(dst64))) );
         break;
      }
      case 1: {
         /* 8-bit divide: the dividend is AX; quotient goes to AL and
            remainder to AH. */
         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
         IROp widen816  = signed_divide ? Iop_8Sto16  : Iop_8Uto16;
         assign( src64, unop(widen3264,
                        unop(widen1632, getIRegRAX(2))) );
         assign( dst64,
                 binop(op, mkexpr(src64),
                           unop(widen1632, unop(widen816, mkexpr(t)))) );
         putIRegRAX( 1, unop(Iop_16to8,
                        unop(Iop_32to16,
                        unop(Iop_64to32,mkexpr(dst64)))) );
         putIRegAH( unop(Iop_16to8,
                    unop(Iop_32to16,
                    unop(Iop_64HIto32,mkexpr(dst64)))) );
         break;
      }
      default:
         vpanic("codegen_div(amd64)");
      }
   }
}
   3510 
/* Group 1 extended opcodes: immediate-operand ADD/OR/ADC/SBB/AND/SUB/
   XOR/CMP selected by bits 5:3 of the modRM byte.  Case 7 maps to
   Iop_Sub8 but (see the "< 7" guards below) the result is discarded --
   only the flags are kept.  am_sz is the size of the address mode,
   d_sz the size of the immediate, sz the operand size, and d64 the
   immediate value itself. */
static
ULong dis_Grp1 ( const VexAbiInfo* vbi,
                 Prefix pfx,
                 Long delta, UChar modrm,
                 Int am_sz, Int d_sz, Int sz, Long d64 )
{
   Int     len;
   HChar   dis_buf[50];
   IRType  ty   = szToITy(sz);
   IRTemp  dst1 = newTemp(ty);   /* result of the operation */
   IRTemp  src  = newTemp(ty);   /* the immediate, masked to size */
   IRTemp  dst0 = newTemp(ty);   /* original value of E */
   IRTemp  addr = IRTemp_INVALID;
   IROp    op8  = Iop_INVALID;
   ULong   mask = mkSizeMask(sz);

   switch (gregLO3ofRM(modrm)) {
      case 0: op8 = Iop_Add8; break;  case 1: op8 = Iop_Or8;  break;
      case 2: break;  // ADC
      case 3: break;  // SBB
      case 4: op8 = Iop_And8; break;  case 5: op8 = Iop_Sub8; break;
      case 6: op8 = Iop_Xor8; break;  case 7: op8 = Iop_Sub8; break;  // 7 = CMP
      /*NOTREACHED*/
      default: vpanic("dis_Grp1(amd64): unhandled case");
   }

   if (epartIsReg(modrm)) {
      vassert(am_sz == 1);

      assign(dst0, getIRegE(sz,pfx,modrm));
      assign(src,  mkU(ty,d64 & mask));

      if (gregLO3ofRM(modrm) == 2 /* ADC */) {
         /* The ADC/SBB helpers compute result and flags together. */
         helper_ADC( sz, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
      } else
      if (gregLO3ofRM(modrm) == 3 /* SBB */) {
         helper_SBB( sz, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      /* CMP (case 7) sets flags only; skip the write-back. */
      if (gregLO3ofRM(modrm) < 7)
         putIRegE(sz, pfx, modrm, mkexpr(dst1));

      delta += (am_sz + d_sz);
      DIP("%s%c $%lld, %s\n",
          nameGrp1(gregLO3ofRM(modrm)), nameISize(sz), d64,
          nameIRegE(sz,pfx,modrm));
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /*xtra*/d_sz );

      assign(dst0, loadLE(ty,mkexpr(addr)));
      assign(src, mkU(ty,d64 & mask));

      if (gregLO3ofRM(modrm) == 2 /* ADC */) {
         if (haveLOCK(pfx)) {
            /* cas-style store */
            helper_ADC( sz, dst1, dst0, src,
                       /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_ADC( sz, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else
      if (gregLO3ofRM(modrm) == 3 /* SBB */) {
         if (haveLOCK(pfx)) {
            /* cas-style store */
            helper_SBB( sz, dst1, dst0, src,
                       /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_SBB( sz, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (gregLO3ofRM(modrm) < 7) {
            if (haveLOCK(pfx)) {
               /* LOCKed: store via compare-and-swap for atomicity. */
               casLE( mkexpr(addr), mkexpr(dst0)/*expVal*/,
                                    mkexpr(dst1)/*newVal*/,
                                    guest_RIP_curr_instr );
            } else {
               storeLE(mkexpr(addr), mkexpr(dst1));
            }
         }
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      delta += (len+d_sz);
      DIP("%s%c $%lld, %s\n",
          nameGrp1(gregLO3ofRM(modrm)), nameISize(sz),
          d64, dis_buf);
   }
   return delta;
}
   3616 
   3617 
   3618 /* Group 2 extended opcodes.  shift_expr must be an 8-bit typed
   3619    expression. */
   3620 
static
ULong dis_Grp2 ( const VexAbiInfo* vbi,
                 Prefix pfx,
                 Long delta, UChar modrm,
                 Int am_sz, Int d_sz, Int sz, IRExpr* shift_expr,
                 const HChar* shift_expr_txt, Bool* decode_OK )
{
   /* delta on entry points at the modrm byte. */
   HChar  dis_buf[50];
   Int    len;
   Bool   isShift, isRotate, isRotateC;
   IRType ty    = szToITy(sz);
   IRTemp dst0  = newTemp(ty);   /* value to be shifted/rotated */
   IRTemp dst1  = newTemp(ty);   /* result */
   IRTemp addr  = IRTemp_INVALID;

   *decode_OK = True;

   vassert(sz == 1 || sz == 2 || sz == 4 || sz == 8);

   /* Put value to shift/rotate in dst0. */
   if (epartIsReg(modrm)) {
      assign(dst0, getIRegE(sz, pfx, modrm));
      delta += (am_sz + d_sz);
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /*xtra*/d_sz );
      assign(dst0, loadLE(ty,mkexpr(addr)));
      delta += len + d_sz;
   }

   /* Classify by the reg field of the modrm byte:
      0/1 = ROL/ROR, 2/3 = RCL/RCR, 4..7 = SHL/SHR/SAL/SAR. */
   isShift = False;
   switch (gregLO3ofRM(modrm)) { case 4: case 5: case 6: case 7: isShift = True; }

   isRotate = False;
   switch (gregLO3ofRM(modrm)) { case 0: case 1: isRotate = True; }

   isRotateC = False;
   switch (gregLO3ofRM(modrm)) { case 2: case 3: isRotateC = True; }

   if (!isShift && !isRotate && !isRotateC) {
      /*NOTREACHED*/
      vpanic("dis_Grp2(Reg): unhandled case(amd64)");
   }

   if (isRotateC) {
      /* Call a helper; this insn is so ridiculous it does not deserve
         better.  One problem is, the helper has to calculate both the
         new value and the new flags.  This is more than 64 bits, and
         there is no way to return more than 64 bits from the helper.
         Hence the crude and obvious solution is to call it twice,
         using the sign of the sz field to indicate whether it is the
         value or rflags result we want.
      */
      Bool     left = toBool(gregLO3ofRM(modrm) == 2);
      IRExpr** argsVALUE;
      IRExpr** argsRFLAGS;

      IRTemp new_value  = newTemp(Ity_I64);
      IRTemp new_rflags = newTemp(Ity_I64);
      IRTemp old_rflags = newTemp(Ity_I64);

      assign( old_rflags, widenUto64(mk_amd64g_calculate_rflags_all()) );

      /* First call: positive sz requests the rotated value. */
      argsVALUE
         = mkIRExprVec_4( widenUto64(mkexpr(dst0)), /* thing to rotate */
                          widenUto64(shift_expr),   /* rotate amount */
                          mkexpr(old_rflags),
                          mkU64(sz) );
      assign( new_value,
                 mkIRExprCCall(
                    Ity_I64,
                    0/*regparm*/,
                    left ? "amd64g_calculate_RCL" : "amd64g_calculate_RCR",
                    left ? &amd64g_calculate_RCL  : &amd64g_calculate_RCR,
                    argsVALUE
                 )
            );

      /* Second call: negated sz requests the new rflags instead. */
      argsRFLAGS
         = mkIRExprVec_4( widenUto64(mkexpr(dst0)), /* thing to rotate */
                          widenUto64(shift_expr),   /* rotate amount */
                          mkexpr(old_rflags),
                          mkU64(-sz) );
      assign( new_rflags,
                 mkIRExprCCall(
                    Ity_I64,
                    0/*regparm*/,
                    left ? "amd64g_calculate_RCL" : "amd64g_calculate_RCR",
                    left ? &amd64g_calculate_RCL  : &amd64g_calculate_RCR,
                    argsRFLAGS
                 )
            );

      assign( dst1, narrowTo(ty, mkexpr(new_value)) );
      /* Flags were fully computed by the helper; install them with
         the COPY thunk. */
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(new_rflags) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   }

   else
   if (isShift) {

      IRTemp pre64     = newTemp(Ity_I64);
      IRTemp res64     = newTemp(Ity_I64);
      IRTemp res64ss   = newTemp(Ity_I64);
      IRTemp shift_amt = newTemp(Ity_I8);
      UChar  mask      = toUChar(sz==8 ? 63 : 31);
      IROp   op64;

      switch (gregLO3ofRM(modrm)) {
         case 4: op64 = Iop_Shl64; break;
         case 5: op64 = Iop_Shr64; break;
         case 6: op64 = Iop_Shl64; break;   /* SAL is identical to SHL */
         case 7: op64 = Iop_Sar64; break;
         /*NOTREACHED*/
         default: vpanic("dis_Grp2:shift"); break;
      }

      /* Widen the value to be shifted to 64 bits, do the shift, and
         narrow back down.  This seems surprisingly long-winded, but
         unfortunately the AMD semantics requires that 8/16/32-bit
         shifts give defined results for shift values all the way up
         to 32, and this seems the simplest way to do it.  It has the
         advantage that the only IR level shifts generated are of 64
         bit values, and the shift amount is guaranteed to be in the
         range 0 .. 63, thereby observing the IR semantics requiring
         all shift values to be in the range 0 .. 2^word_size-1.

         Therefore the shift amount is masked with 63 for 64-bit shifts
         and 31 for all others.
      */
      /* shift_amt = shift_expr & MASK, regardless of operation size */
      assign( shift_amt, binop(Iop_And8, shift_expr, mkU8(mask)) );

      /* suitably widen the value to be shifted to 64 bits. */
      assign( pre64, op64==Iop_Sar64 ? widenSto64(mkexpr(dst0))
                                     : widenUto64(mkexpr(dst0)) );

      /* res64 = pre64 `shift` shift_amt */
      assign( res64, binop(op64, mkexpr(pre64), mkexpr(shift_amt)) );

      /* res64ss = pre64 `shift` ((shift_amt - 1) & MASK)
         -- the "shifted one less" value, needed by the flags thunk. */
      assign( res64ss,
              binop(op64,
                    mkexpr(pre64),
                    binop(Iop_And8,
                          binop(Iop_Sub8,
                                mkexpr(shift_amt), mkU8(1)),
                          mkU8(mask))) );

      /* Build the flags thunk. */
      setFlags_DEP1_DEP2_shift(op64, res64, res64ss, ty, shift_amt);

      /* Narrow the result back down. */
      assign( dst1, narrowTo(ty, mkexpr(res64)) );

   } /* if (isShift) */

   else
   if (isRotate) {
      Int    ccOp      = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1
                                        : (ty==Ity_I32 ? 2 : 3));
      Bool   left      = toBool(gregLO3ofRM(modrm) == 0);
      IRTemp rot_amt   = newTemp(Ity_I8);
      IRTemp rot_amt64 = newTemp(Ity_I8);
      IRTemp oldFlags  = newTemp(Ity_I64);
      UChar  mask      = toUChar(sz==8 ? 63 : 31);

      /* rot_amt = shift_expr & mask */
      /* By masking the rotate amount thusly, the IR-level Shl/Shr
         expressions never shift beyond the word size and thus remain
         well defined. */
      assign(rot_amt64, binop(Iop_And8, shift_expr, mkU8(mask)));

      if (ty == Ity_I64)
         assign(rot_amt, mkexpr(rot_amt64));
      else
         assign(rot_amt, binop(Iop_And8, mkexpr(rot_amt64), mkU8(8*sz-1)));

      if (left) {

         /* dst1 = (dst0 << rot_amt) | (dst0 >>u (wordsize-rot_amt)) */
         assign(dst1,
            binop( mkSizedOp(ty,Iop_Or8),
                   binop( mkSizedOp(ty,Iop_Shl8),
                          mkexpr(dst0),
                          mkexpr(rot_amt)
                   ),
                   binop( mkSizedOp(ty,Iop_Shr8),
                          mkexpr(dst0),
                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
                   )
            )
         );
         ccOp += AMD64G_CC_OP_ROLB;

      } else { /* right */

         /* dst1 = (dst0 >>u rot_amt) | (dst0 << (wordsize-rot_amt)) */
         assign(dst1,
            binop( mkSizedOp(ty,Iop_Or8),
                   binop( mkSizedOp(ty,Iop_Shr8),
                          mkexpr(dst0),
                          mkexpr(rot_amt)
                   ),
                   binop( mkSizedOp(ty,Iop_Shl8),
                          mkexpr(dst0),
                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
                   )
            )
         );
         ccOp += AMD64G_CC_OP_RORB;

      }

      /* dst1 now holds the rotated value.  Build flag thunk.  We
         need the resulting value for this, and the previous flags.
         Except don't set it if the rotate count is zero. */

      assign(oldFlags, mk_amd64g_calculate_rflags_all());

      /* rot_amt64 :: Ity_I8.  We need to convert it to I1. */
      IRTemp rot_amt64b = newTemp(Ity_I1);
      assign(rot_amt64b, binop(Iop_CmpNE8, mkexpr(rot_amt64), mkU8(0)) );

      /* CC_DEP1 is the rotated value.  CC_NDEP is flags before.
         Each field is conditionally updated (ITE) so a zero rotate
         count leaves the existing thunk untouched. */
      stmt( IRStmt_Put( OFFB_CC_OP,
                        IRExpr_ITE( mkexpr(rot_amt64b),
                                    mkU64(ccOp),
                                    IRExpr_Get(OFFB_CC_OP,Ity_I64) ) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1,
                        IRExpr_ITE( mkexpr(rot_amt64b),
                                    widenUto64(mkexpr(dst1)),
                                    IRExpr_Get(OFFB_CC_DEP1,Ity_I64) ) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2,
                        IRExpr_ITE( mkexpr(rot_amt64b),
                                    mkU64(0),
                                    IRExpr_Get(OFFB_CC_DEP2,Ity_I64) ) ));
      stmt( IRStmt_Put( OFFB_CC_NDEP,
                        IRExpr_ITE( mkexpr(rot_amt64b),
                                    mkexpr(oldFlags),
                                    IRExpr_Get(OFFB_CC_NDEP,Ity_I64) ) ));
   } /* if (isRotate) */

   /* Save result, and finish up. */
   if (epartIsReg(modrm)) {
      putIRegE(sz, pfx, modrm, mkexpr(dst1));
      if (vex_traceflags & VEX_TRACE_FE) {
         vex_printf("%s%c ",
                    nameGrp2(gregLO3ofRM(modrm)), nameISize(sz) );
         if (shift_expr_txt)
            vex_printf("%s", shift_expr_txt);
         else
            ppIRExpr(shift_expr);
         vex_printf(", %s\n", nameIRegE(sz,pfx,modrm));
      }
   } else {
      storeLE(mkexpr(addr), mkexpr(dst1));
      if (vex_traceflags & VEX_TRACE_FE) {
         vex_printf("%s%c ",
                    nameGrp2(gregLO3ofRM(modrm)), nameISize(sz) );
         if (shift_expr_txt)
            vex_printf("%s", shift_expr_txt);
         else
            ppIRExpr(shift_expr);
         vex_printf(", %s\n", dis_buf);
      }
   }
   return delta;
}
   3892 
   3893 
/* Group 8 extended opcodes (but BT/BTS/BTC/BTR only). */
/* Handles "0F BA /4../7 ib" -- bit test/modify with an immediate bit
   offset: /4 = BT, /5 = BTS, /6 = BTR, /7 = BTC.  Fetches the operand
   (register or memory), optionally modifies the selected bit -- using
   a CAS when a LOCK prefix is present on the memory form -- writes it
   back, and copies the originally-selected bit into the carry flag.
   On entry 'delta' points at the modrm byte; returns the updated
   delta.  Sets *decode_OK to False for any undecodable form. */
static
ULong dis_Grp8_Imm ( const VexAbiInfo* vbi,
                     Prefix pfx,
                     Long delta, UChar modrm,
                     Int am_sz, Int sz, ULong src_val,
                     Bool* decode_OK )
{
   /* src_val denotes a d8.
      And delta on entry points at the modrm byte. */

   IRType ty     = szToITy(sz);
   IRTemp t2     = newTemp(Ity_I64);    /* old value, widened to 64 bits */
   IRTemp t2m    = newTemp(Ity_I64);    /* modified value (non-BT cases) */
   IRTemp t_addr = IRTemp_INVALID;      /* guest address, memory form only */
   HChar  dis_buf[50];
   ULong  mask;

   /* we're optimists :-) */
   *decode_OK = True;

   /* Check whether F2 or F3 are acceptable. */
   if (epartIsReg(modrm)) {
      /* F2 or F3 are not allowed in the register case. */
      if (haveF2orF3(pfx)) {
         *decode_OK = False;
         return delta;
     }
   } else {
      /* F2 or F3 (but not both) are allowable provided LOCK is also
         present. */
      if (haveF2orF3(pfx)) {
         if (haveF2andF3(pfx) || !haveLOCK(pfx)) {
            *decode_OK = False;
            return delta;
         }
      }
   }

   /* Limit src_val -- the bit offset -- to something within a word.
      The Intel docs say that literal offsets larger than a word are
      masked in this way. */
   switch (sz) {
      case 2:  src_val &= 15; break;
      case 4:  src_val &= 31; break;
      case 8:  src_val &= 63; break;
      default: *decode_OK = False; return delta;
   }

   /* Invent a mask suitable for the operation. */
   switch (gregLO3ofRM(modrm)) {
      case 4: /* BT */  mask = 0;                  break;
      case 5: /* BTS */ mask = 1ULL << src_val;    break;
      case 6: /* BTR */ mask = ~(1ULL << src_val); break;
      case 7: /* BTC */ mask = 1ULL << src_val;    break;
         /* If this needs to be extended, probably simplest to make a
            new function to handle the other cases (0 .. 3).  The
            Intel docs do however not indicate any use for 0 .. 3, so
            we don't expect this to happen. */
      default: *decode_OK = False; return delta;
   }

   /* Fetch the value to be tested and modified into t2, which is
      64-bits wide regardless of sz. */
   if (epartIsReg(modrm)) {
      vassert(am_sz == 1);
      assign( t2, widenUto64(getIRegE(sz, pfx, modrm)) );
      delta += (am_sz + 1);   /* modrm byte + imm8 */
      DIP("%s%c $0x%llx, %s\n", nameGrp8(gregLO3ofRM(modrm)),
                                nameISize(sz),
                                src_val, nameIRegE(sz,pfx,modrm));
   } else {
      Int len;
      t_addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 1 );
      delta  += (len+1);   /* amode bytes + imm8 */
      assign( t2, widenUto64(loadLE(ty, mkexpr(t_addr))) );
      DIP("%s%c $0x%llx, %s\n", nameGrp8(gregLO3ofRM(modrm)),
                                nameISize(sz),
                                src_val, dis_buf);
   }

   /* Compute the new value into t2m, if non-BT. */
   switch (gregLO3ofRM(modrm)) {
      case 4: /* BT */
         break;
      case 5: /* BTS */
         assign( t2m, binop(Iop_Or64, mkU64(mask), mkexpr(t2)) );
         break;
      case 6: /* BTR */
         /* mask has all bits set except the selected one. */
         assign( t2m, binop(Iop_And64, mkU64(mask), mkexpr(t2)) );
         break;
      case 7: /* BTC */
         assign( t2m, binop(Iop_Xor64, mkU64(mask), mkexpr(t2)) );
         break;
     default:
         /*NOTREACHED*/ /*the previous switch guards this*/
         vassert(0);
   }

   /* Write the result back, if non-BT. */
   if (gregLO3ofRM(modrm) != 4 /* BT */) {
      if (epartIsReg(modrm)) {
        putIRegE(sz, pfx, modrm, narrowTo(ty, mkexpr(t2m)));
      } else {
         if (haveLOCK(pfx)) {
            /* Atomic update: expected old value vs new value. */
            casLE( mkexpr(t_addr),
                   narrowTo(ty, mkexpr(t2))/*expd*/,
                   narrowTo(ty, mkexpr(t2m))/*new*/,
                   guest_RIP_curr_instr );
         } else {
            storeLE(mkexpr(t_addr), narrowTo(ty, mkexpr(t2m)));
         }
      }
   }

   /* Copy relevant bit from t2 into the carry flag. */
   /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop(Iop_And64,
                  binop(Iop_Shr64, mkexpr(t2), mkU8(src_val)),
                  mkU64(1))
       ));
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

   return delta;
}
   4025 
   4026 
   4027 /* Signed/unsigned widening multiply.  Generate IR to multiply the
   4028    value in RAX/EAX/AX/AL by the given IRTemp, and park the result in
   4029    RDX:RAX/EDX:EAX/DX:AX/AX.
   4030 */
   4031 static void codegen_mulL_A_D ( Int sz, Bool syned,
   4032                                IRTemp tmp, const HChar* tmp_txt )
   4033 {
   4034    IRType ty = szToITy(sz);
   4035    IRTemp t1 = newTemp(ty);
   4036 
   4037    assign( t1, getIRegRAX(sz) );
   4038 
   4039    switch (ty) {
   4040       case Ity_I64: {
   4041          IRTemp res128  = newTemp(Ity_I128);
   4042          IRTemp resHi   = newTemp(Ity_I64);
   4043          IRTemp resLo   = newTemp(Ity_I64);
   4044          IROp   mulOp   = syned ? Iop_MullS64 : Iop_MullU64;
   4045          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   4046          setFlags_MUL ( Ity_I64, t1, tmp, tBaseOp );
   4047          assign( res128, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   4048          assign( resHi, unop(Iop_128HIto64,mkexpr(res128)));
   4049          assign( resLo, unop(Iop_128to64,mkexpr(res128)));
   4050          putIReg64(R_RDX, mkexpr(resHi));
   4051          putIReg64(R_RAX, mkexpr(resLo));
   4052          break;
   4053       }
   4054       case Ity_I32: {
   4055          IRTemp res64   = newTemp(Ity_I64);
   4056          IRTemp resHi   = newTemp(Ity_I32);
   4057          IRTemp resLo   = newTemp(Ity_I32);
   4058          IROp   mulOp   = syned ? Iop_MullS32 : Iop_MullU32;
   4059          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   4060          setFlags_MUL ( Ity_I32, t1, tmp, tBaseOp );
   4061          assign( res64, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   4062          assign( resHi, unop(Iop_64HIto32,mkexpr(res64)));
   4063          assign( resLo, unop(Iop_64to32,mkexpr(res64)));
   4064          putIRegRDX(4, mkexpr(resHi));
   4065          putIRegRAX(4, mkexpr(resLo));
   4066          break;
   4067       }
   4068       case Ity_I16: {
   4069          IRTemp res32   = newTemp(Ity_I32);
   4070          IRTemp resHi   = newTemp(Ity_I16);
   4071          IRTemp resLo   = newTemp(Ity_I16);
   4072          IROp   mulOp   = syned ? Iop_MullS16 : Iop_MullU16;
   4073          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   4074          setFlags_MUL ( Ity_I16, t1, tmp, tBaseOp );
   4075          assign( res32, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   4076          assign( resHi, unop(Iop_32HIto16,mkexpr(res32)));
   4077          assign( resLo, unop(Iop_32to16,mkexpr(res32)));
   4078          putIRegRDX(2, mkexpr(resHi));
   4079          putIRegRAX(2, mkexpr(resLo));
   4080          break;
   4081       }
   4082       case Ity_I8: {
   4083          IRTemp res16   = newTemp(Ity_I16);
   4084          IRTemp resHi   = newTemp(Ity_I8);
   4085          IRTemp resLo   = newTemp(Ity_I8);
   4086          IROp   mulOp   = syned ? Iop_MullS8 : Iop_MullU8;
   4087          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   4088          setFlags_MUL ( Ity_I8, t1, tmp, tBaseOp );
   4089          assign( res16, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   4090          assign( resHi, unop(Iop_16HIto8,mkexpr(res16)));
   4091          assign( resLo, unop(Iop_16to8,mkexpr(res16)));
   4092          putIRegRAX(2, mkexpr(res16));
   4093          break;
   4094       }
   4095       default:
   4096          ppIRType(ty);
   4097          vpanic("codegen_mulL_A_D(amd64)");
   4098    }
   4099    DIP("%s%c %s\n", syned ? "imul" : "mul", nameISize(sz), tmp_txt);
   4100 }
   4101 
   4102 
/* Group 3 extended opcodes.  We have to decide here whether F2 and F3
   might be valid.*/
/* Handles opcodes F6/F7, whose operation is selected by bits 5:3 of
   the modrm byte: /0 = TEST Ib/Iz, /2 = NOT, /3 = NEG, /4 = MUL,
   /5 = IMUL, /6 = DIV, /7 = IDIV.  /1 is rejected (decode fails).
   NOT and NEG on memory honour a LOCK prefix via a CAS.  On entry
   'delta' points at the modrm byte; returns the updated delta and
   sets *decode_OK to False for any unhandled form. */
static
ULong dis_Grp3 ( const VexAbiInfo* vbi,
                 Prefix pfx, Int sz, Long delta, Bool* decode_OK )
{
   Long    d64;
   UChar   modrm;
   HChar   dis_buf[50];
   Int     len;
   IRTemp  addr;
   IRType  ty = szToITy(sz);
   IRTemp  t1 = newTemp(ty);
   IRTemp dst1, src, dst0;
   *decode_OK = True;
   modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      /* F2/XACQ and F3/XREL are always invalid in the non-mem case. */
      if (haveF2orF3(pfx)) goto unhandled;
      switch (gregLO3ofRM(modrm)) {
         case 0: { /* TEST */
            delta++;
            /* Immediate is at most 4 bytes, even for 64-bit operands. */
            d64 = getSDisp(imin(4,sz), delta);
            delta += imin(4,sz);
            dst1 = newTemp(ty);
            assign(dst1, binop(mkSizedOp(ty,Iop_And8),
                               getIRegE(sz,pfx,modrm),
                               mkU(ty, d64 & mkSizeMask(sz))));
            setFlags_DEP1( Iop_And8, dst1, ty );
            DIP("test%c $%lld, %s\n",
                nameISize(sz), d64,
                nameIRegE(sz, pfx, modrm));
            break;
         }
         case 1:
            /* Not decoded here. */
            *decode_OK = False;
            return delta;
         case 2: /* NOT */
            delta++;
            putIRegE(sz, pfx, modrm,
                              unop(mkSizedOp(ty,Iop_Not8),
                                   getIRegE(sz, pfx, modrm)));
            DIP("not%c %s\n", nameISize(sz),
                              nameIRegE(sz, pfx, modrm));
            break;
         case 3: /* NEG */
            delta++;
            /* NEG is expressed as 0 - src so the flags thunk can use
               the ordinary SUB machinery. */
            dst0 = newTemp(ty);
            src  = newTemp(ty);
            dst1 = newTemp(ty);
            assign(dst0, mkU(ty,0));
            assign(src,  getIRegE(sz, pfx, modrm));
            assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0),
                                                       mkexpr(src)));
            setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
            putIRegE(sz, pfx, modrm, mkexpr(dst1));
            DIP("neg%c %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm));
            break;
         case 4: /* MUL (unsigned widening) */
            delta++;
            src = newTemp(ty);
            assign(src, getIRegE(sz,pfx,modrm));
            codegen_mulL_A_D ( sz, False, src,
                               nameIRegE(sz,pfx,modrm) );
            break;
         case 5: /* IMUL (signed widening) */
            delta++;
            src = newTemp(ty);
            assign(src, getIRegE(sz,pfx,modrm));
            codegen_mulL_A_D ( sz, True, src,
                               nameIRegE(sz,pfx,modrm) );
            break;
         case 6: /* DIV */
            delta++;
            assign( t1, getIRegE(sz, pfx, modrm) );
            codegen_div ( sz, t1, False );
            DIP("div%c %s\n", nameISize(sz),
                              nameIRegE(sz, pfx, modrm));
            break;
         case 7: /* IDIV */
            delta++;
            assign( t1, getIRegE(sz, pfx, modrm) );
            codegen_div ( sz, t1, True );
            DIP("idiv%c %s\n", nameISize(sz),
                               nameIRegE(sz, pfx, modrm));
            break;
         default:
            /*NOTREACHED*/
            vpanic("Grp3(amd64,R)");
      }
   } else {
      /* Decide if F2/XACQ or F3/XREL might be valid. */
      Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
      if ((gregLO3ofRM(modrm) == 3/*NEG*/ || gregLO3ofRM(modrm) == 2/*NOT*/)
          && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
         validF2orF3 = True;
      }
      if (!validF2orF3) goto unhandled;
      /* */
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf,
                        /* we have to inform disAMode of any immediate
                           bytes used */
                        gregLO3ofRM(modrm)==0/*TEST*/
                           ? imin(4,sz)
                           : 0
                      );
      t1   = newTemp(ty);
      delta += len;
      /* Common load of the memory operand for all the cases below. */
      assign(t1, loadLE(ty,mkexpr(addr)));
      switch (gregLO3ofRM(modrm)) {
         case 0: { /* TEST */
            d64 = getSDisp(imin(4,sz), delta);
            delta += imin(4,sz);
            dst1 = newTemp(ty);
            assign(dst1, binop(mkSizedOp(ty,Iop_And8),
                               mkexpr(t1),
                               mkU(ty, d64 & mkSizeMask(sz))));
            setFlags_DEP1( Iop_And8, dst1, ty );
            DIP("test%c $%lld, %s\n", nameISize(sz), d64, dis_buf);
            break;
         }
         case 1:
            /* Not decoded here. */
            *decode_OK = False;
            return delta;
         case 2: /* NOT */
            dst1 = newTemp(ty);
            assign(dst1, unop(mkSizedOp(ty,Iop_Not8), mkexpr(t1)));
            if (haveLOCK(pfx)) {
               /* Atomic read-modify-write. */
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
                                    guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(dst1) );
            }
            DIP("not%c %s\n", nameISize(sz), dis_buf);
            break;
         case 3: /* NEG */
            /* As in the register case: NEG == 0 - src. */
            dst0 = newTemp(ty);
            src  = newTemp(ty);
            dst1 = newTemp(ty);
            assign(dst0, mkU(ty,0));
            assign(src,  mkexpr(t1));
            assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0),
                                                       mkexpr(src)));
            if (haveLOCK(pfx)) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
                                    guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(dst1) );
            }
            setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
            DIP("neg%c %s\n", nameISize(sz), dis_buf);
            break;
         case 4: /* MUL (unsigned widening) */
            codegen_mulL_A_D ( sz, False, t1, dis_buf );
            break;
         case 5: /* IMUL */
            codegen_mulL_A_D ( sz, True, t1, dis_buf );
            break;
         case 6: /* DIV */
            codegen_div ( sz, t1, False );
            DIP("div%c %s\n", nameISize(sz), dis_buf);
            break;
         case 7: /* IDIV */
            codegen_div ( sz, t1, True );
            DIP("idiv%c %s\n", nameISize(sz), dis_buf);
            break;
         default:
            /*NOTREACHED*/
            vpanic("Grp3(amd64,M)");
      }
   }
   return delta;
  unhandled:
   *decode_OK = False;
   return delta;
}
   4279 
   4280 
/* Group 4 extended opcodes.  We have to decide here whether F2 and F3
   might be valid. */
/* Handles opcode FE, byte-sized only: /0 = INC Eb, /1 = DEC Eb; all
   other /digit values fail to decode.  The memory form honours a
   LOCK prefix via a CAS.  On entry 'delta' points at the modrm byte;
   returns the updated delta and sets *decode_OK accordingly. */
static
ULong dis_Grp4 ( const VexAbiInfo* vbi,
                 Prefix pfx, Long delta, Bool* decode_OK )
{
   Int   alen;
   UChar modrm;
   HChar dis_buf[50];
   IRType ty = Ity_I8;
   IRTemp t1 = newTemp(ty);   /* old value */
   IRTemp t2 = newTemp(ty);   /* incremented/decremented value */

   *decode_OK = True;

   modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      /* F2/XACQ and F3/XREL are always invalid in the non-mem case. */
      if (haveF2orF3(pfx)) goto unhandled;
      assign(t1, getIRegE(1, pfx, modrm));
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
            putIRegE(1, pfx, modrm, mkexpr(t2));
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
            putIRegE(1, pfx, modrm, mkexpr(t2));
            setFlags_INC_DEC( False, t2, ty );
            break;
         default:
            *decode_OK = False;
            return delta;
      }
      delta++;
      DIP("%sb %s\n", nameGrp4(gregLO3ofRM(modrm)),
                      nameIRegE(1, pfx, modrm));
   } else {
      /* Decide if F2/XACQ or F3/XREL might be valid. */
      Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
      if ((gregLO3ofRM(modrm) == 0/*INC*/ || gregLO3ofRM(modrm) == 1/*DEC*/)
          && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
         validF2orF3 = True;
      }
      if (!validF2orF3) goto unhandled;
      /* */
      IRTemp addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( t1, loadLE(ty, mkexpr(addr)) );
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
            if (haveLOCK(pfx)) {
               /* Atomic read-modify-write. */
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
                      guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(t2) );
            }
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
            if (haveLOCK(pfx)) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
                      guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(t2) );
            }
            setFlags_INC_DEC( False, t2, ty );
            break;
         default:
            *decode_OK = False;
            return delta;
      }
      delta += alen;
      DIP("%sb %s\n", nameGrp4(gregLO3ofRM(modrm)), dis_buf);
   }
   return delta;
  unhandled:
   *decode_OK = False;
   return delta;
}
   4363 
   4364 
/* Group 5 extended opcodes.  We have to decide here whether F2 and F3
   might be valid. */
/* Handles opcode FF: /0 = INC Ev, /1 = DEC Ev, /2 = CALL Ev,
   /4 = JMP Ev, /6 = PUSH Ev; the remaining /digit values fail to
   decode.  CALL and JMP terminate the block (dres is set to
   Dis_StopHere); the memory INC/DEC forms honour LOCK via a CAS;
   F2 on CALL/JMP is accepted as an MPX 'bnd' prefix and otherwise
   ignored.  On entry 'delta' points at the modrm byte; returns the
   updated delta and sets *decode_OK accordingly. */
static
ULong dis_Grp5 ( const VexAbiInfo* vbi,
                 Prefix pfx, Int sz, Long delta,
                 /*MOD*/DisResult* dres, /*OUT*/Bool* decode_OK )
{
   Int     len;
   UChar   modrm;
   HChar   dis_buf[50];
   IRTemp  addr = IRTemp_INVALID;
   IRType  ty = szToITy(sz);
   IRTemp  t1 = newTemp(ty);          /* operand's old value */
   IRTemp  t2 = IRTemp_INVALID;       /* new value / new RSP */
   IRTemp  t3 = IRTemp_INVALID;       /* call/jmp target or push value */
   Bool    showSz = True;             /* print size suffix in DIP output? */

   *decode_OK = True;

   modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      /* F2/XACQ and F3/XREL are always invalid in the non-mem case.
         F2/CALL and F2/JMP may have bnd prefix. */
     if (haveF2orF3(pfx)
         && ! (haveF2(pfx)
               && (gregLO3ofRM(modrm) == 2 || gregLO3ofRM(modrm) == 4)))
        goto unhandledR;
      assign(t1, getIRegE(sz,pfx,modrm));
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Add8),
                             mkexpr(t1), mkU(ty,1)));
            setFlags_INC_DEC( True, t2, ty );
            putIRegE(sz,pfx,modrm, mkexpr(t2));
            break;
         case 1: /* DEC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
                             mkexpr(t1), mkU(ty,1)));
            setFlags_INC_DEC( False, t2, ty );
            putIRegE(sz,pfx,modrm, mkexpr(t2));
            break;
         case 2: /* call Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandledR;
            if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, getIRegE(sz,pfx,modrm));
            /* Push the return address and transfer control. */
            t2 = newTemp(Ity_I64);
            assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
            putIReg64(R_RSP, mkexpr(t2));
            storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+1));
            make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(reg)");
            jmp_treg(dres, Ijk_Call, t3);
            vassert(dres->whatNext == Dis_StopHere);
            showSz = False;
            break;
         case 4: /* jmp Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandledR;
            if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, getIRegE(sz,pfx,modrm));
            jmp_treg(dres, Ijk_Boring, t3);
            vassert(dres->whatNext == Dis_StopHere);
            showSz = False;
            break;
         case 6: /* PUSH Ev */
            /* There is no encoding for 32-bit operand size; hence ... */
            if (sz == 4) sz = 8;
            if (sz == 8 || sz == 2) {
               ty = szToITy(sz); /* redo it, since sz might have changed */
               t3 = newTemp(ty);
               assign(t3, getIRegE(sz,pfx,modrm));
               t2 = newTemp(Ity_I64);
               assign( t2, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
               putIReg64(R_RSP, mkexpr(t2) );
               storeLE( mkexpr(t2), mkexpr(t3) );
               break;
            } else {
               goto unhandledR; /* awaiting test case */
            }
         default:
         unhandledR:
            *decode_OK = False;
            return delta;
      }
      delta++;
      DIP("%s%c %s\n", nameGrp5(gregLO3ofRM(modrm)),
                       showSz ? nameISize(sz) : ' ',
                       nameIRegE(sz, pfx, modrm));
   } else {
      /* Decide if F2/XACQ, F3/XREL, F2/CALL or F2/JMP might be valid. */
      Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
      if ((gregLO3ofRM(modrm) == 0/*INC*/ || gregLO3ofRM(modrm) == 1/*DEC*/)
          && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
         validF2orF3 = True;
      } else if ((gregLO3ofRM(modrm) == 2 || gregLO3ofRM(modrm) == 4)
                 && (haveF2(pfx) && !haveF3(pfx))) {
         validF2orF3 = True;
      }
      if (!validF2orF3) goto unhandledM;
      /* */
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
      /* CALL/JMP/PUSH load the operand themselves below. */
      if (gregLO3ofRM(modrm) != 2 && gregLO3ofRM(modrm) != 4
                                  && gregLO3ofRM(modrm) != 6) {
         assign(t1, loadLE(ty,mkexpr(addr)));
      }
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Add8),
                             mkexpr(t1), mkU(ty,1)));
            if (haveLOCK(pfx)) {
               /* Atomic read-modify-write. */
               casLE( mkexpr(addr),
                      mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
            } else {
               storeLE(mkexpr(addr),mkexpr(t2));
            }
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
                             mkexpr(t1), mkU(ty,1)));
            if (haveLOCK(pfx)) {
               casLE( mkexpr(addr),
                      mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
            } else {
               storeLE(mkexpr(addr),mkexpr(t2));
            }
            setFlags_INC_DEC( False, t2, ty );
            break;
         case 2: /* call Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandledM;
            if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, loadLE(Ity_I64,mkexpr(addr)));
            t2 = newTemp(Ity_I64);
            assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
            putIReg64(R_RSP, mkexpr(t2));
            storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+len));
            make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(mem)");
            jmp_treg(dres, Ijk_Call, t3);
            vassert(dres->whatNext == Dis_StopHere);
            showSz = False;
            break;
         case 4: /* JMP Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandledM;
            if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, loadLE(Ity_I64,mkexpr(addr)));
            jmp_treg(dres, Ijk_Boring, t3);
            vassert(dres->whatNext == Dis_StopHere);
            showSz = False;
            break;
         case 6: /* PUSH Ev */
            /* There is no encoding for 32-bit operand size; hence ... */
            if (sz == 4) sz = 8;
            if (sz == 8 || sz == 2) {
               ty = szToITy(sz); /* redo it, since sz might have changed */
               t3 = newTemp(ty);
               assign(t3, loadLE(ty,mkexpr(addr)));
               t2 = newTemp(Ity_I64);
               assign( t2, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
               putIReg64(R_RSP, mkexpr(t2) );
               storeLE( mkexpr(t2), mkexpr(t3) );
               break;
            } else {
               goto unhandledM; /* awaiting test case */
            }
         default:
         unhandledM:
            *decode_OK = False;
            return delta;
      }
      delta += len;
      DIP("%s%c %s\n", nameGrp5(gregLO3ofRM(modrm)),
                       showSz ? nameISize(sz) : ' ',
                       dis_buf);
   }
   return delta;
}
   4555 
   4556 
   4557 /*------------------------------------------------------------*/
   4558 /*--- Disassembling string ops (including REP prefixes)    ---*/
   4559 /*------------------------------------------------------------*/
   4560 
   4561 /* Code shared by all the string ops */
   4562 static
   4563 void dis_string_op_increment ( Int sz, IRTemp t_inc )
   4564 {
   4565    UChar logSz;
   4566    if (sz == 8 || sz == 4 || sz == 2) {
   4567       logSz = 1;
   4568       if (sz == 4) logSz = 2;
   4569       if (sz == 8) logSz = 3;
   4570       assign( t_inc,
   4571               binop(Iop_Shl64, IRExpr_Get( OFFB_DFLAG, Ity_I64 ),
   4572                                mkU8(logSz) ) );
   4573    } else {
   4574       assign( t_inc,
   4575               IRExpr_Get( OFFB_DFLAG, Ity_I64 ) );
   4576    }
   4577 }
   4578 
   4579 static
   4580 void dis_string_op( void (*dis_OP)( Int, IRTemp, Prefix pfx ),
   4581                     Int sz, const HChar* name, Prefix pfx )
   4582 {
   4583    IRTemp t_inc = newTemp(Ity_I64);
   4584    /* Really we ought to inspect the override prefixes, but we don't.
   4585       The following assertion catches any resulting sillyness. */
   4586    vassert(pfx == clearSegBits(pfx));
   4587    dis_string_op_increment(sz, t_inc);
   4588    dis_OP( sz, t_inc, pfx );
   4589    DIP("%s%c\n", name, nameISize(sz));
   4590 }
   4591 
   4592 static
   4593 void dis_MOVS ( Int sz, IRTemp t_inc, Prefix pfx )
   4594 {
   4595    IRType ty = szToITy(sz);
   4596    IRTemp td = newTemp(Ity_I64);   /* RDI */
   4597    IRTemp ts = newTemp(Ity_I64);   /* RSI */
   4598    IRExpr *incd, *incs;
   4599 
   4600    if (haveASO(pfx)) {
   4601       assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   4602       assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
   4603    } else {
   4604       assign( td, getIReg64(R_RDI) );
   4605       assign( ts, getIReg64(R_RSI) );
   4606    }
   4607 
   4608    storeLE( mkexpr(td), loadLE(ty,mkexpr(ts)) );
   4609 
   4610    incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   4611    incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
   4612    if (haveASO(pfx)) {
   4613       incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   4614       incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
   4615    }
   4616    putIReg64( R_RDI, incd );
   4617    putIReg64( R_RSI, incs );
   4618 }
   4619 
   4620 static
   4621 void dis_LODS ( Int sz, IRTemp t_inc, Prefix pfx )
   4622 {
   4623    IRType ty = szToITy(sz);
   4624    IRTemp ts = newTemp(Ity_I64);   /* RSI */
   4625    IRExpr *incs;
   4626 
   4627    if (haveASO(pfx))
   4628       assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
   4629    else
   4630       assign( ts, getIReg64(R_RSI) );
   4631 
   4632    putIRegRAX ( sz, loadLE(ty, mkexpr(ts)) );
   4633 
   4634    incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
   4635    if (haveASO(pfx))
   4636       incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
   4637    putIReg64( R_RSI, incs );
   4638 }
   4639 
   4640 static
   4641 void dis_STOS ( Int sz, IRTemp t_inc, Prefix pfx )
   4642 {
   4643    IRType ty = szToITy(sz);
   4644    IRTemp ta = newTemp(ty);        /* rAX */
   4645    IRTemp td = newTemp(Ity_I64);   /* RDI */
   4646    IRExpr *incd;
   4647 
   4648    assign( ta, getIRegRAX(sz) );
   4649 
   4650    if (haveASO(pfx))
   4651       assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   4652    else
   4653       assign( td, getIReg64(R_RDI) );
   4654 
   4655    storeLE( mkexpr(td), mkexpr(ta) );
   4656 
   4657    incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   4658    if (haveASO(pfx))
   4659       incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   4660    putIReg64( R_RDI, incd );
   4661 }
   4662 
static
void dis_CMPS ( Int sz, IRTemp t_inc, Prefix pfx )
{
   /* CMPS: set the flags from (*RSI - *RDI) for element size |sz|,
      then advance both pointers by |t_inc|.  Only the flags and the
      two pointer registers are written. */
   IRType ty  = szToITy(sz);
   IRTemp tdv = newTemp(ty);      /* (RDI) */
   IRTemp tsv = newTemp(ty);      /* (RSI) */
   IRTemp td  = newTemp(Ity_I64); /*  RDI  */
   IRTemp ts  = newTemp(Ity_I64); /*  RSI  */
   IRExpr *incd, *incs;

   /* Under an address-size override only the low 32 bits of the
      pointers are significant; zero-extend them to 64 bits. */
   if (haveASO(pfx)) {
      assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
      assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
   } else {
      assign( td, getIReg64(R_RDI) );
      assign( ts, getIReg64(R_RSI) );
   }

   assign( tdv, loadLE(ty,mkexpr(td)) );

   assign( tsv, loadLE(ty,mkexpr(ts)) );

   /* Flags reflect tsv - tdv; the subtraction result itself is
      discarded. */
   setFlags_DEP1_DEP2 ( Iop_Sub8, tsv, tdv, ty );

   incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
   /* Under ASO the pointer updates wrap at 32 bits. */
   if (haveASO(pfx)) {
      incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
      incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
   }
   putIReg64( R_RDI, incd );
   putIReg64( R_RSI, incs );
}
   4696 
static
void dis_SCAS ( Int sz, IRTemp t_inc, Prefix pfx )
{
   /* SCAS: set the flags from (rAX - *RDI) for element size |sz|,
      then advance RDI by |t_inc|.  Only the flags and RDI are
      written. */
   IRType ty  = szToITy(sz);
   IRTemp ta  = newTemp(ty);       /*  rAX  */
   IRTemp td  = newTemp(Ity_I64);  /*  RDI  */
   IRTemp tdv = newTemp(ty);       /* (RDI) */
   IRExpr *incd;

   assign( ta, getIRegRAX(sz) );

   /* Under an address-size override only the low 32 bits of RDI are
      significant. */
   if (haveASO(pfx))
      assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   else
      assign( td, getIReg64(R_RDI) );

   assign( tdv, loadLE(ty,mkexpr(td)) );

   /* Flags reflect ta - tdv; the result itself is discarded. */
   setFlags_DEP1_DEP2 ( Iop_Sub8, ta, tdv, ty );

   incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   /* Under ASO the pointer update wraps at 32 bits. */
   if (haveASO(pfx))
      incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   putIReg64( R_RDI, incd );
}
   4722 
   4723 
   4724 /* Wrap the appropriate string op inside a REP/REPE/REPNE.  We assume
   4725    the insn is the last one in the basic block, and so emit a jump to
   4726    the next insn, rather than just falling through. */
   4727 static
   4728 void dis_REP_op ( /*MOD*/DisResult* dres,
   4729                   AMD64Condcode cond,
   4730                   void (*dis_OP)(Int, IRTemp, Prefix),
   4731                   Int sz, Addr64 rip, Addr64 rip_next, const HChar* name,
   4732                   Prefix pfx )
   4733 {
   4734    IRTemp t_inc = newTemp(Ity_I64);
   4735    IRTemp tc;
   4736    IRExpr* cmp;
   4737 
   4738    /* Really we ought to inspect the override prefixes, but we don't.
   4739       The following assertion catches any resulting sillyness. */
   4740    vassert(pfx == clearSegBits(pfx));
   4741 
   4742    if (haveASO(pfx)) {
   4743       tc = newTemp(Ity_I32);  /*  ECX  */
   4744       assign( tc, getIReg32(R_RCX) );
   4745       cmp = binop(Iop_CmpEQ32, mkexpr(tc), mkU32(0));
   4746    } else {
   4747       tc = newTemp(Ity_I64);  /*  RCX  */
   4748       assign( tc, getIReg64(R_RCX) );
   4749       cmp = binop(Iop_CmpEQ64, mkexpr(tc), mkU64(0));
   4750    }
   4751 
   4752    stmt( IRStmt_Exit( cmp, Ijk_Boring,
   4753                       IRConst_U64(rip_next), OFFB_RIP ) );
   4754 
   4755    if (haveASO(pfx))
   4756       putIReg32(R_RCX, binop(Iop_Sub32, mkexpr(tc), mkU32(1)) );
   4757   else
   4758       putIReg64(R_RCX, binop(Iop_Sub64, mkexpr(tc), mkU64(1)) );
   4759 
   4760    dis_string_op_increment(sz, t_inc);
   4761    dis_OP (sz, t_inc, pfx);
   4762 
   4763    if (cond == AMD64CondAlways) {
   4764       jmp_lit(dres, Ijk_Boring, rip);
   4765       vassert(dres->whatNext == Dis_StopHere);
   4766    } else {
   4767       stmt( IRStmt_Exit( mk_amd64g_calculate_condition(cond),
   4768                          Ijk_Boring,
   4769                          IRConst_U64(rip),
   4770                          OFFB_RIP ) );
   4771       jmp_lit(dres, Ijk_Boring, rip_next);
   4772       vassert(dres->whatNext == Dis_StopHere);
   4773    }
   4774    DIP("%s%c\n", name, nameISize(sz));
   4775 }
   4776 
   4777 
   4778 /*------------------------------------------------------------*/
   4779 /*--- Arithmetic, etc.                                     ---*/
   4780 /*------------------------------------------------------------*/
   4781 
   4782 /* IMUL E, G.  Supplied eip points to the modR/M byte. */
static
ULong dis_mul_E_G ( const VexAbiInfo* vbi,
                    Prefix      pfx,
                    Int         size,
                    Long        delta0 )
{
   /* Two-operand IMUL (G = G * E).  |delta0| points at the modR/M
      byte.  Returns the delta just past the decoded operands. */
   Int    alen;
   HChar  dis_buf[50];
   UChar  rm = getUChar(delta0);
   IRType ty = szToITy(size);
   IRTemp te = newTemp(ty);       /* the E (reg-or-mem) operand */
   IRTemp tg = newTemp(ty);       /* the G (register) operand */
   IRTemp resLo = newTemp(ty);    /* low |size| bytes of the product */

   assign( tg, getIRegG(size, pfx, rm) );
   if (epartIsReg(rm)) {
      assign( te, getIRegE(size, pfx, rm) );
   } else {
      IRTemp addr = disAMode( &alen, vbi, pfx, delta0, dis_buf, 0 );
      assign( te, loadLE(ty,mkexpr(addr)) );
   }

   /* Set the flag thunk for a signed multiply of this size. */
   setFlags_MUL ( ty, te, tg, AMD64G_CC_OP_SMULB );

   assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tg) ) );

   putIRegG(size, pfx, rm, mkexpr(resLo) );

   if (epartIsReg(rm)) {
      DIP("imul%c %s, %s\n", nameISize(size),
                             nameIRegE(size,pfx,rm),
                             nameIRegG(size,pfx,rm));
      return 1+delta0;
   } else {
      DIP("imul%c %s, %s\n", nameISize(size),
                             dis_buf,
                             nameIRegG(size,pfx,rm));
      return alen+delta0;
   }
}
   4823 
   4824 
   4825 /* IMUL I * E -> G.  Supplied rip points to the modR/M byte. */
static
ULong dis_imul_I_E_G ( const VexAbiInfo* vbi,
                       Prefix      pfx,
                       Int         size,
                       Long        delta,
                       Int         litsize )
{
   /* Three-operand IMUL (G = imm * E).  |delta| points at the modR/M
      byte; |litsize| is the nominal immediate size, of which at most
      4 bytes are read (sign-extended).  Returns the delta just past
      the immediate. */
   Long   d64;
   Int    alen;
   HChar  dis_buf[50];
   UChar  rm = getUChar(delta);
   IRType ty = szToITy(size);
   IRTemp te = newTemp(ty);       /* the E (reg-or-mem) operand */
   IRTemp tl = newTemp(ty);       /* the immediate, as an IR constant */
   IRTemp resLo = newTemp(ty);    /* low |size| bytes of the product */

   vassert(/*size == 1 ||*/ size == 2 || size == 4 || size == 8);

   if (epartIsReg(rm)) {
      assign(te, getIRegE(size, pfx, rm));
      delta++;
   } else {
      /* NOTE(review): the last disAMode argument appears to be the
         number of immediate bytes following the amode -- confirm
         against disAMode's definition. */
      IRTemp addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
                                     imin(4,litsize) );
      assign(te, loadLE(ty, mkexpr(addr)));
      delta += alen;
   }
   d64 = getSDisp(imin(4,litsize),delta);
   delta += imin(4,litsize);

   /* Truncate the sign-extended immediate to the operand size. */
   d64 &= mkSizeMask(size);
   assign(tl, mkU(ty,d64));

   assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tl) ));

   /* Set the flag thunk for a signed multiply of this size. */
   setFlags_MUL ( ty, te, tl, AMD64G_CC_OP_SMULB );

   putIRegG(size, pfx, rm, mkexpr(resLo));

   DIP("imul%c $%lld, %s, %s\n",
       nameISize(size), d64,
       ( epartIsReg(rm) ? nameIRegE(size,pfx,rm) : dis_buf ),
       nameIRegG(size,pfx,rm) );
   return delta;
}
   4871 
   4872 
   4873 /* Generate an IR sequence to do a popcount operation on the supplied
   4874    IRTemp, and return a new IRTemp holding the result.  'ty' may be
   4875    Ity_I16, Ity_I32 or Ity_I64 only. */
   4876 static IRTemp gen_POPCOUNT ( IRType ty, IRTemp src )
   4877 {
   4878    Int i;
   4879    if (ty == Ity_I16) {
   4880       IRTemp old = IRTemp_INVALID;
   4881       IRTemp nyu = IRTemp_INVALID;
   4882       IRTemp mask[4], shift[4];
   4883       for (i = 0; i < 4; i++) {
   4884          mask[i]  = newTemp(ty);
   4885          shift[i] = 1 << i;
   4886       }
   4887       assign(mask[0], mkU16(0x5555));
   4888       assign(mask[1], mkU16(0x3333));
   4889       assign(mask[2], mkU16(0x0F0F));
   4890       assign(mask[3], mkU16(0x00FF));
   4891       old = src;
   4892       for (i = 0; i < 4; i++) {
   4893          nyu = newTemp(ty);
   4894          assign(nyu,
   4895                 binop(Iop_Add16,
   4896                       binop(Iop_And16,
   4897                             mkexpr(old),
   4898                             mkexpr(mask[i])),
   4899                       binop(Iop_And16,
   4900                             binop(Iop_Shr16, mkexpr(old), mkU8(shift[i])),
   4901                             mkexpr(mask[i]))));
   4902          old = nyu;
   4903       }
   4904       return nyu;
   4905    }
   4906    if (ty == Ity_I32) {
   4907       IRTemp old = IRTemp_INVALID;
   4908       IRTemp nyu = IRTemp_INVALID;
   4909       IRTemp mask[5], shift[5];
   4910       for (i = 0; i < 5; i++) {
   4911          mask[i]  = newTemp(ty);
   4912          shift[i] = 1 << i;
   4913       }
   4914       assign(mask[0], mkU32(0x55555555));
   4915       assign(mask[1], mkU32(0x33333333));
   4916       assign(mask[2], mkU32(0x0F0F0F0F));
   4917       assign(mask[3], mkU32(0x00FF00FF));
   4918       assign(mask[4], mkU32(0x0000FFFF));
   4919       old = src;
   4920       for (i = 0; i < 5; i++) {
   4921          nyu = newTemp(ty);
   4922          assign(nyu,
   4923                 binop(Iop_Add32,
   4924                       binop(Iop_And32,
   4925                             mkexpr(old),
   4926                             mkexpr(mask[i])),
   4927                       binop(Iop_And32,
   4928                             binop(Iop_Shr32, mkexpr(old), mkU8(shift[i])),
   4929                             mkexpr(mask[i]))));
   4930          old = nyu;
   4931       }
   4932       return nyu;
   4933    }
   4934    if (ty == Ity_I64) {
   4935       IRTemp old = IRTemp_INVALID;
   4936       IRTemp nyu = IRTemp_INVALID;
   4937       IRTemp mask[6], shift[6];
   4938       for (i = 0; i < 6; i++) {
   4939          mask[i]  = newTemp(ty);
   4940          shift[i] = 1 << i;
   4941       }
   4942       assign(mask[0], mkU64(0x5555555555555555ULL));
   4943       assign(mask[1], mkU64(0x3333333333333333ULL));
   4944       assign(mask[2], mkU64(0x0F0F0F0F0F0F0F0FULL));
   4945       assign(mask[3], mkU64(0x00FF00FF00FF00FFULL));
   4946       assign(mask[4], mkU64(0x0000FFFF0000FFFFULL));
   4947       assign(mask[5], mkU64(0x00000000FFFFFFFFULL));
   4948       old = src;
   4949       for (i = 0; i < 6; i++) {
   4950          nyu = newTemp(ty);
   4951          assign(nyu,
   4952                 binop(Iop_Add64,
   4953                       binop(Iop_And64,
   4954                             mkexpr(old),
   4955                             mkexpr(mask[i])),
   4956                       binop(Iop_And64,
   4957                             binop(Iop_Shr64, mkexpr(old), mkU8(shift[i])),
   4958                             mkexpr(mask[i]))));
   4959          old = nyu;
   4960       }
   4961       return nyu;
   4962    }
   4963    /*NOTREACHED*/
   4964    vassert(0);
   4965 }
   4966 
   4967 
   4968 /* Generate an IR sequence to do a count-leading-zeroes operation on
   4969    the supplied IRTemp, and return a new IRTemp holding the result.
   4970    'ty' may be Ity_I16, Ity_I32 or Ity_I64 only.  In the case where
   4971    the argument is zero, return the number of bits in the word (the
   4972    natural semantics). */
   4973 static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
   4974 {
   4975    vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16);
   4976 
   4977    IRTemp src64 = newTemp(Ity_I64);
   4978    assign(src64, widenUto64( mkexpr(src) ));
   4979 
   4980    IRTemp src64x = newTemp(Ity_I64);
   4981    assign(src64x,
   4982           binop(Iop_Shl64, mkexpr(src64),
   4983                            mkU8(64 - 8 * sizeofIRType(ty))));
   4984 
   4985    // Clz64 has undefined semantics when its input is zero, so
   4986    // special-case around that.
   4987    IRTemp res64 = newTemp(Ity_I64);
   4988    assign(res64,
   4989           IRExpr_ITE(
   4990              binop(Iop_CmpEQ64, mkexpr(src64x), mkU64(0)),
   4991              mkU64(8 * sizeofIRType(ty)),
   4992              unop(Iop_Clz64, mkexpr(src64x))
   4993    ));
   4994 
   4995    IRTemp res = newTemp(ty);
   4996    assign(res, narrowTo(ty, mkexpr(res64)));
   4997    return res;
   4998 }
   4999 
   5000 
   5001 /* Generate an IR sequence to do a count-trailing-zeroes operation on
   5002    the supplied IRTemp, and return a new IRTemp holding the result.
   5003    'ty' may be Ity_I16, Ity_I32 or Ity_I64 only.  In the case where
   5004    the argument is zero, return the number of bits in the word (the
   5005    natural semantics). */
   5006 static IRTemp gen_TZCNT ( IRType ty, IRTemp src )
   5007 {
   5008    vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16);
   5009 
   5010    IRTemp src64 = newTemp(Ity_I64);
   5011    assign(src64, widenUto64( mkexpr(src) ));
   5012 
   5013    // Ctz64 has undefined semantics when its input is zero, so
   5014    // special-case around that.
   5015    IRTemp res64 = newTemp(Ity_I64);
   5016    assign(res64,
   5017           IRExpr_ITE(
   5018              binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0)),
   5019              mkU64(8 * sizeofIRType(ty)),
   5020              unop(Iop_Ctz64, mkexpr(src64))
   5021    ));
   5022 
   5023    IRTemp res = newTemp(ty);
   5024    assign(res, narrowTo(ty, mkexpr(res64)));
   5025    return res;
   5026 }
   5027 
   5028 
   5029 /*------------------------------------------------------------*/
   5030 /*---                                                      ---*/
   5031 /*--- x87 FLOATING POINT INSTRUCTIONS                      ---*/
   5032 /*---                                                      ---*/
   5033 /*------------------------------------------------------------*/
   5034 
   5035 /* --- Helper functions for dealing with the register stack. --- */
   5036 
   5037 /* --- Set the emulation-warning pseudo-register. --- */
   5038 
static void put_emwarn ( IRExpr* e /* :: Ity_I32 */ )
{
   /* Record an emulation note/warning in the guest state's EMNOTE
      pseudo-register.  |e| must be of type Ity_I32. */
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   stmt( IRStmt_Put( OFFB_EMNOTE, e ) );
}
   5044 
   5045 /* --- Produce an IRExpr* denoting a 64-bit QNaN. --- */
   5046 
static IRExpr* mkQNaN64 ( void )
{
  /* The canonical QNaN bit pattern:
        sign=0, exponent=2047 (0b11111111111), top mantissa bit=1,
        remaining 51 mantissa bits=0
     == 0x7FF8 0000 0000 0000
   */
   return IRExpr_Const(IRConst_F64i(0x7FF8000000000000ULL));
}
   5055 
   5056 /* --------- Get/put the top-of-stack pointer :: Ity_I32 --------- */
   5057 
static IRExpr* get_ftop ( void )
{
   /* Read the x87 top-of-stack pointer (:: Ity_I32). */
   return IRExpr_Get( OFFB_FTOP, Ity_I32 );
}
   5062 
static void put_ftop ( IRExpr* e )
{
   /* Write the x87 top-of-stack pointer; |e| must be Ity_I32. */
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   stmt( IRStmt_Put( OFFB_FTOP, e ) );
}
   5068 
   5069 /* --------- Get/put the C3210 bits. --------- */
   5070 
static IRExpr*  /* :: Ity_I64 */ get_C3210 ( void )
{
   /* Read the FPU C3..C0 condition-code bits from the guest state. */
   return IRExpr_Get( OFFB_FC3210, Ity_I64 );
}
   5075 
static void put_C3210 ( IRExpr* e  /* :: Ity_I64 */ )
{
   /* Write the FPU C3..C0 condition-code bits; |e| must be Ity_I64. */
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
   stmt( IRStmt_Put( OFFB_FC3210, e ) );
}
   5081 
   5082 /* --------- Get/put the FPU rounding mode. --------- */
static IRExpr* /* :: Ity_I32 */ get_fpround ( void )
{
   /* The guest state stores the FPU rounding mode in 64 bits;
      narrow it to the 32 bits callers expect. */
   return unop(Iop_64to32, IRExpr_Get( OFFB_FPROUND, Ity_I64 ));
}
   5087 
static void put_fpround ( IRExpr* /* :: Ity_I32 */ e )
{
   /* Write the FPU rounding mode, widening the 32-bit value to the
      64-bit guest-state slot. */
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   stmt( IRStmt_Put( OFFB_FPROUND, unop(Iop_32Uto64,e) ) );
}
   5093 
   5094 
   5095 /* --------- Synthesise a 2-bit FPU rounding mode. --------- */
   5096 /* Produces a value in 0 .. 3, which is encoded as per the type
   5097    IRRoundingMode.  Since the guest_FPROUND value is also encoded as
   5098    per IRRoundingMode, we merely need to get it and mask it for
   5099    safety.
   5100 */
static IRExpr* /* :: Ity_I32 */ get_roundingmode ( void )
{
   /* guest_FPROUND is already encoded per IRRoundingMode; mask to
      the low 2 bits purely for safety. */
   return binop( Iop_And32, get_fpround(), mkU32(3) );
}
   5105 
static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
{
   /* Used where the true rounding mode is (incorrectly) ignored:
      just claim round-to-nearest.  See the XXXROUNDINGFIXME markers
      at the call sites. */
   return mkU32(Irrm_NEAREST);
}
   5110 
   5111 
   5112 /* --------- Get/set FP register tag bytes. --------- */
   5113 
   5114 /* Given i, and some expression e, generate 'ST_TAG(i) = e'. */
   5115 
   5116 static void put_ST_TAG ( Int i, IRExpr* value )
   5117 {
   5118    IRRegArray* descr;
   5119    vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_I8);
   5120    descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   5121    stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
   5122 }
   5123 
   5124 /* Given i, generate an expression yielding 'ST_TAG(i)'.  This will be
   5125    zero to indicate "Empty" and nonzero to indicate "NonEmpty".  */
   5126 
static IRExpr* get_ST_TAG ( Int i )
{
   /* Read the Ity_I8 tag of ST(i), addressed relative to FTOP. */
   IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   return IRExpr_GetI( descr, get_ftop(), i );
}
   5132 
   5133 
   5134 /* --------- Get/set FP registers. --------- */
   5135 
   5136 /* Given i, and some expression e, emit 'ST(i) = e' and set the
   5137    register's tag to indicate the register is full.  The previous
   5138    state of the register is not checked. */
   5139 
static void put_ST_UNCHECKED ( Int i, IRExpr* value )
{
   /* Write |value| (:: Ity_F64) to ST(i), addressed relative to
      FTOP, without consulting the slot's previous tag. */
   IRRegArray* descr;
   vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_F64);
   descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
   /* Mark the register as in-use. */
   put_ST_TAG(i, mkU8(1));
}
   5149 
   5150 /* Given i, and some expression e, emit
   5151       ST(i) = is_full(i) ? NaN : e
   5152    and set the tag accordingly.
   5153 */
   5154 
static void put_ST ( Int i, IRExpr* value )
{
   /* If ST(i) is already full, write a QNaN instead of |value|
      (x87 overwrite-of-full-register behaviour); in either case the
      slot ends up tagged full via put_ST_UNCHECKED. */
   put_ST_UNCHECKED(
      i,
      IRExpr_ITE( binop(Iop_CmpNE8, get_ST_TAG(i), mkU8(0)),
                  /* non-0 means full */
                  mkQNaN64(),
                  /* 0 means empty */
                  value
      )
   );
}
   5167 
   5168 
   5169 /* Given i, generate an expression yielding 'ST(i)'. */
   5170 
static IRExpr* get_ST_UNCHECKED ( Int i )
{
   /* Read ST(i), addressed relative to FTOP, without consulting the
      slot's tag. */
   IRRegArray* descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   return IRExpr_GetI( descr, get_ftop(), i );
}
   5176 
   5177 
   5178 /* Given i, generate an expression yielding
   5179   is_full(i) ? ST(i) : NaN
   5180 */
   5181 
static IRExpr* get_ST ( Int i )
{
   /* Read ST(i) if its tag says the slot is full, else produce a
      QNaN (x87 read-of-empty-register behaviour). */
   return
      IRExpr_ITE( binop(Iop_CmpNE8, get_ST_TAG(i), mkU8(0)),
                  /* non-0 means full */
                  get_ST_UNCHECKED(i),
                  /* 0 means empty */
                  mkQNaN64());
}
   5191 
   5192 
   5193 /* Given i, and some expression e, and a condition cond, generate IR
   5194    which has the same effect as put_ST(i,e) when cond is true and has
   5195    no effect when cond is false.  Given the lack of proper
   5196    if-then-else in the IR, this is pretty tricky.
   5197 */
   5198 
static void maybe_put_ST ( IRTemp cond, Int i, IRExpr* value )
{
   /* Conditional put_ST: when |cond| holds, behave exactly like
      put_ST(i, value); when it doesn't, leave both the register and
      its tag unchanged. */

   // new_tag = if cond then FULL else old_tag
   // new_val = if cond then (if old_tag==FULL then NaN else val)
   //                   else old_val

   IRTemp old_tag = newTemp(Ity_I8);
   assign(old_tag, get_ST_TAG(i));
   IRTemp new_tag = newTemp(Ity_I8);
   assign(new_tag,
          IRExpr_ITE(mkexpr(cond), mkU8(1)/*FULL*/, mkexpr(old_tag)));

   IRTemp old_val = newTemp(Ity_F64);
   assign(old_val, get_ST_UNCHECKED(i));
   IRTemp new_val = newTemp(Ity_F64);
   assign(new_val,
          IRExpr_ITE(mkexpr(cond),
                     IRExpr_ITE(binop(Iop_CmpNE8, mkexpr(old_tag), mkU8(0)),
                                /* non-0 means full */
                                mkQNaN64(),
                                /* 0 means empty */
                                value),
                     mkexpr(old_val)));

   put_ST_UNCHECKED(i, mkexpr(new_val));
   // put_ST_UNCHECKED incorrectly sets tag(i) to always be FULL.  So
   // now set it to new_tag instead.
   put_ST_TAG(i, mkexpr(new_tag));
}
   5228 
   5229 /* Adjust FTOP downwards by one register. */
   5230 
static void fp_push ( void )
{
   /* Decrement FTOP by one, exposing a new ST(0) slot. */
   put_ftop( binop(Iop_Sub32, get_ftop(), mkU32(1)) );
}
   5235 
   5236 /* Adjust FTOP downwards by one register when COND is 1:I1.  Else
   5237    don't change it. */
   5238 
static void maybe_fp_push ( IRTemp cond )
{
   /* Subtract 1 from FTOP when |cond| (:: Ity_I1) is 1, else
      subtract 0 -- a branch-free conditional push. */
   put_ftop( binop(Iop_Sub32, get_ftop(), unop(Iop_1Uto32,mkexpr(cond))) );
}
   5243 
   5244 /* Adjust FTOP upwards by one register, and mark the vacated register
   5245    as empty.  */
   5246 
static void fp_pop ( void )
{
   /* Mark ST(0) as Empty, then increment FTOP past it. */
   put_ST_TAG(0, mkU8(0));
   put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
}
   5252 
/* Set the C2 bit of the FPU status register to e[0].  Assumes that
   e[63:1] == 0 (e is a 64-bit value; see the Iop_Shl64 below).
*/
static void set_C2 ( IRExpr* e )
{
   /* Clear the C2 field of C3210, then OR in |e| shifted into the
      C2 bit position. */
   IRExpr* cleared = binop(Iop_And64, get_C3210(), mkU64(~AMD64G_FC_MASK_C2));
   put_C3210( binop(Iop_Or64,
                    cleared,
                    binop(Iop_Shl64, e, mkU8(AMD64G_FC_SHIFT_C2))) );
}
   5263 
   5264 /* Generate code to check that abs(d64) < 2^63 and is finite.  This is
   5265    used to do the range checks for FSIN, FCOS, FSINCOS and FPTAN.  The
   5266    test is simple, but the derivation of it is not so simple.
   5267 
   5268    The exponent field for an IEEE754 double is 11 bits.  That means it
   5269    can take values 0 through 0x7FF.  If the exponent has value 0x7FF,
   5270    the number is either a NaN or an Infinity and so is not finite.
   5271    Furthermore, a finite value of exactly 2^63 is the smallest value
   5272    that has exponent value 0x43E.  Hence, what we need to do is
   5273    extract the exponent, ignoring the sign bit and mantissa, and check
   5274    it is < 0x43E, or <= 0x43D.
   5275 
   5276    To make this easily applicable to 32- and 64-bit targets, a
   5277    roundabout approach is used.  First the number is converted to I64,
   5278    then the top 32 bits are taken.  Shifting them right by 20 bits
   5279    places the sign bit and exponent in the bottom 12 bits.  Anding
   5280    with 0x7FF gets rid of the sign bit, leaving just the exponent
   5281    available for comparison.
   5282 */
static IRTemp math_IS_TRIG_ARG_FINITE_AND_IN_RANGE ( IRTemp d64 )
{
   /* See the derivation in the comment above: |d64| is finite and
      abs(d64) < 2^63 iff its biased exponent is <= 0x43D. */
   IRTemp i64 = newTemp(Ity_I64);
   assign(i64, unop(Iop_ReinterpF64asI64, mkexpr(d64)) );
   /* Take the top 32 bits, shift sign+exponent into the low 12 bits,
      then mask off the sign, leaving the 11-bit exponent. */
   IRTemp exponent = newTemp(Ity_I32);
   assign(exponent,
          binop(Iop_And32,
                binop(Iop_Shr32, unop(Iop_64HIto32, mkexpr(i64)), mkU8(20)),
                mkU32(0x7FF)));
   IRTemp in_range_and_finite = newTemp(Ity_I1);
   assign(in_range_and_finite,
          binop(Iop_CmpLE32U, mkexpr(exponent), mkU32(0x43D)));
   return in_range_and_finite;
}
   5297 
   5298 /* Invent a plausible-looking FPU status word value:
   5299       ((ftop & 7) << 11) | (c3210 & 0x4700)
   5300  */
   5301 static IRExpr* get_FPU_sw ( void )
   5302 {
   5303    return
   5304       unop(Iop_32to16,
   5305            binop(Iop_Or32,
   5306                  binop(Iop_Shl32,
   5307                        binop(Iop_And32, get_ftop(), mkU32(7)),
   5308                              mkU8(11)),
   5309                        binop(Iop_And32, unop(Iop_64to32, get_C3210()),
   5310                                         mkU32(0x4700))
   5311       ));
   5312 }
   5313 
   5314 
   5315 /* Generate a dirty helper call that initialises the x87 state a la
   5316    FINIT.  If |guard| is NULL, it is done unconditionally.  Otherwise
   5317    |guard| is used as a guarding condition.
   5318 */
static void gen_FINIT_SEQUENCE ( IRExpr* guard )
{
   /* Emit a call to the FINIT dirty helper, conditional on |guard|
      if it is non-NULL. */
   /* Uses dirty helper:
         void amd64g_do_FINIT ( VexGuestAMD64State* ) */
   IRDirty* d  = unsafeIRDirty_0_N (
                    0/*regparms*/,
                    "amd64g_dirtyhelper_FINIT",
                    &amd64g_dirtyhelper_FINIT,
                    mkIRExprVec_1( IRExpr_GSPTR() )
                 );

   /* declare we're writing guest state: the five x87-related
      fields, matching the fxState entries below */
   d->nFxState = 5;
   vex_bzero(&d->fxState, sizeof(d->fxState));

   d->fxState[0].fx     = Ifx_Write;
   d->fxState[0].offset = OFFB_FTOP;
   d->fxState[0].size   = sizeof(UInt);

   d->fxState[1].fx     = Ifx_Write;
   d->fxState[1].offset = OFFB_FPREGS;
   d->fxState[1].size   = 8 * sizeof(ULong);

   d->fxState[2].fx     = Ifx_Write;
   d->fxState[2].offset = OFFB_FPTAGS;
   d->fxState[2].size   = 8 * sizeof(UChar);

   d->fxState[3].fx     = Ifx_Write;
   d->fxState[3].offset = OFFB_FPROUND;
   d->fxState[3].size   = sizeof(ULong);

   d->fxState[4].fx     = Ifx_Write;
   d->fxState[4].offset = OFFB_FC3210;
   d->fxState[4].size   = sizeof(ULong);

   /* NULL guard means the call is unconditional. */
   if (guard)
      d->guard = guard;

   stmt( IRStmt_Dirty(d) );
}
   5359 
   5360 
   5361 /* ------------------------------------------------------- */
   5362 /* Given all that stack-mangling junk, we can now go ahead
   5363    and describe FP instructions.
   5364 */
   5365 
   5366 /* ST(0) = ST(0) `op` mem64/32(addr)
   5367    Need to check ST(0)'s tag on read, but not on write.
   5368 */
   5369 static
   5370 void fp_do_op_mem_ST_0 ( IRTemp addr, const HChar* op_txt, HChar* dis_buf,
   5371                          IROp op, Bool dbl )
   5372 {
   5373    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   5374    if (dbl) {
   5375       put_ST_UNCHECKED(0,
   5376          triop( op,
   5377                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5378                 get_ST(0),
   5379                 loadLE(Ity_F64,mkexpr(addr))
   5380          ));
   5381    } else {
   5382       put_ST_UNCHECKED(0,
   5383          triop( op,
   5384                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5385                 get_ST(0),
   5386                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr)))
   5387          ));
   5388    }
   5389 }
   5390 
   5391 
   5392 /* ST(0) = mem64/32(addr) `op` ST(0)
   5393    Need to check ST(0)'s tag on read, but not on write.
   5394 */
   5395 static
   5396 void fp_do_oprev_mem_ST_0 ( IRTemp addr, const HChar* op_txt, HChar* dis_buf,
   5397                             IROp op, Bool dbl )
   5398 {
   5399    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   5400    if (dbl) {
   5401       put_ST_UNCHECKED(0,
   5402          triop( op,
   5403                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5404                 loadLE(Ity_F64,mkexpr(addr)),
   5405                 get_ST(0)
   5406          ));
   5407    } else {
   5408       put_ST_UNCHECKED(0,
   5409          triop( op,
   5410                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5411                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr))),
   5412                 get_ST(0)
   5413          ));
   5414    }
   5415 }
   5416 
   5417 
   5418 /* ST(dst) = ST(dst) `op` ST(src).
   5419    Check dst and src tags when reading but not on write.
   5420 */
static
void fp_do_op_ST_ST ( const HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
                      Bool pop_after )
{
   /* ST(dst) = ST(dst) `op` ST(src), optionally popping the stack
      afterwards.  Tags are checked on the reads (via get_ST) but not
      on the write. */
   DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"", st_src, st_dst );
   put_ST_UNCHECKED(
      st_dst,
      triop( op,
             get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
             get_ST(st_dst),
             get_ST(st_src) )
   );
   if (pop_after)
      fp_pop();
}
   5436 
   5437 /* ST(dst) = ST(src) `op` ST(dst).
   5438    Check dst and src tags when reading but not on write.
   5439 */
static
void fp_do_oprev_ST_ST ( const HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
                         Bool pop_after )
{
   /* ST(dst) = ST(src) `op` ST(dst) -- operands reversed relative to
      fp_do_op_ST_ST -- optionally popping the stack afterwards.
      Tags are checked on the reads but not on the write. */
   DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"", st_src, st_dst );
   put_ST_UNCHECKED(
      st_dst,
      triop( op,
             get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
             get_ST(st_src),
             get_ST(st_dst) )
   );
   if (pop_after)
      fp_pop();
}
   5455 
   5456 /* %rflags(Z,P,C) = UCOMI( st(0), st(i) ) */
static void fp_do_ucomi_ST0_STi ( UInt i, Bool pop_after )
{
   DIP("fucomi%s %%st(0),%%st(%u)\n", pop_after ? "p" : "", i);
   /* This is a bit of a hack (and isn't really right).  It sets
      Z,P,C,O correctly, but forces A and S to zero, whereas the Intel
      documentation implies A and S are unchanged.
   */
   /* It's also fishy in that it is used both for COMIP and
      UCOMIP, and they aren't the same (although similar). */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   /* CC_DEP1 = CmpF64(ST(0), ST(i)) masked with 0x45, i.e. keeping
      only the C, P and Z bit positions. */
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop( Iop_And64,
                   unop( Iop_32Uto64,
                         binop(Iop_CmpF64, get_ST(0), get_ST(i))),
                   mkU64(0x45)
        )));
   if (pop_after)
      fp_pop();
}
   5478 
   5479 
   5480 /* returns
   5481    32to16( if e32 <s -32768 || e32 >s 32767 then -32768 else e32 )
   5482 */
   5483 static IRExpr* x87ishly_qnarrow_32_to_16 ( IRExpr* e32 )
   5484 {
   5485    IRTemp t32 = newTemp(Ity_I32);
   5486    assign( t32, e32 );
   5487    return
   5488       IRExpr_ITE(
   5489          binop(Iop_CmpLT64U,
   5490                unop(Iop_32Uto64,
   5491                     binop(Iop_Add32, mkexpr(t32), mkU32(32768))),
   5492                mkU64(65536)),
   5493          unop(Iop_32to16, mkexpr(t32)),
   5494          mkU16( 0x8000 ) );
   5495 }
   5496 
   5497 
   5498 static
   5499 ULong dis_FPU ( /*OUT*/Bool* decode_ok,
   5500                 const VexAbiInfo* vbi, Prefix pfx, Long delta )
   5501 {
   5502    Int    len;
   5503    UInt   r_src, r_dst;
   5504    HChar  dis_buf[50];
   5505    IRTemp t1, t2;
   5506 
   5507    /* On entry, delta points at the second byte of the insn (the modrm
   5508       byte).*/
   5509    UChar first_opcode = getUChar(delta-1);
   5510    UChar modrm        = getUChar(delta+0);
   5511 
   5512    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD8 opcodes +-+-+-+-+-+-+-+ */
   5513 
   5514    if (first_opcode == 0xD8) {
   5515       if (modrm < 0xC0) {
   5516 
   5517          /* bits 5,4,3 are an opcode extension, and the modRM also
   5518            specifies an address. */
   5519          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   5520          delta += len;
   5521 
   5522          switch (gregLO3ofRM(modrm)) {
   5523 
   5524             case 0: /* FADD single-real */
   5525                fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, False );
   5526                break;
   5527 
   5528             case 1: /* FMUL single-real */
   5529                fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, False );
   5530                break;
   5531 
   5532             case 2: /* FCOM single-real */
   5533                DIP("fcoms %s\n", dis_buf);
   5534                /* This forces C1 to zero, which isn't right. */
   5535                /* The AMD documentation suggests that forcing C1 to
   5536                   zero is correct (Eliot Moss) */
   5537                put_C3210(
   5538                    unop( Iop_32Uto64,
   5539                        binop( Iop_And32,
   5540                               binop(Iop_Shl32,
   5541                                     binop(Iop_CmpF64,
   5542                                           get_ST(0),
   5543                                           unop(Iop_F32toF64,
   5544                                                loadLE(Ity_F32,mkexpr(addr)))),
   5545                                     mkU8(8)),
   5546                               mkU32(0x4500)
   5547                    )));
   5548                break;
   5549 
   5550             case 3: /* FCOMP single-real */
   5551                /* The AMD documentation suggests that forcing C1 to
   5552                   zero is correct (Eliot Moss) */
   5553                DIP("fcomps %s\n", dis_buf);
   5554                /* This forces C1 to zero, which isn't right. */
   5555                put_C3210(
   5556                    unop( Iop_32Uto64,
   5557                        binop( Iop_And32,
   5558                               binop(Iop_Shl32,
   5559                                     binop(Iop_CmpF64,
   5560                                           get_ST(0),
   5561                                           unop(Iop_F32toF64,
   5562                                                loadLE(Ity_F32,mkexpr(addr)))),
   5563                                     mkU8(8)),
   5564                               mkU32(0x4500)
   5565                    )));
   5566                fp_pop();
   5567                break;
   5568 
   5569             case 4: /* FSUB single-real */
   5570                fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, False );
   5571                break;
   5572 
   5573             case 5: /* FSUBR single-real */
   5574                fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, False );
   5575                break;
   5576 
   5577             case 6: /* FDIV single-real */
   5578                fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, False );
   5579                break;
   5580 
   5581             case 7: /* FDIVR single-real */
   5582                fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, False );
   5583                break;
   5584 
   5585             default:
   5586                vex_printf("unhandled opc_aux = 0x%2x\n",
   5587                           (UInt)gregLO3ofRM(modrm));
   5588                vex_printf("first_opcode == 0xD8\n");
   5589                goto decode_fail;
   5590          }
   5591       } else {
   5592          delta++;
   5593          switch (modrm) {
   5594 
   5595             case 0xC0 ... 0xC7: /* FADD %st(?),%st(0) */
   5596                fp_do_op_ST_ST ( "add", Iop_AddF64, modrm - 0xC0, 0, False );
   5597                break;
   5598 
   5599             case 0xC8 ... 0xCF: /* FMUL %st(?),%st(0) */
   5600                fp_do_op_ST_ST ( "mul", Iop_MulF64, modrm - 0xC8, 0, False );
   5601                break;
   5602 
   5603             /* Dunno if this is right */
   5604             case 0xD0 ... 0xD7: /* FCOM %st(?),%st(0) */
   5605                r_dst = (UInt)modrm - 0xD0;
   5606                DIP("fcom %%st(0),%%st(%u)\n", r_dst);
   5607                /* This forces C1 to zero, which isn't right. */
   5608                put_C3210(
   5609                    unop(Iop_32Uto64,
   5610                    binop( Iop_And32,
   5611                           binop(Iop_Shl32,
   5612                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   5613                                 mkU8(8)),
   5614                           mkU32(0x4500)
   5615                    )));
   5616                break;
   5617 
   5618             /* Dunno if this is right */
   5619             case 0xD8 ... 0xDF: /* FCOMP %st(?),%st(0) */
   5620                r_dst = (UInt)modrm - 0xD8;
   5621                DIP("fcomp %%st(0),%%st(%u)\n", r_dst);
   5622                /* This forces C1 to zero, which isn't right. */
   5623                put_C3210(
   5624                    unop(Iop_32Uto64,
   5625                    binop( Iop_And32,
   5626                           binop(Iop_Shl32,
   5627                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   5628                                 mkU8(8)),
   5629                           mkU32(0x4500)
   5630                    )));
   5631                fp_pop();
   5632                break;
   5633 
   5634             case 0xE0 ... 0xE7: /* FSUB %st(?),%st(0) */
   5635                fp_do_op_ST_ST ( "sub", Iop_SubF64, modrm - 0xE0, 0, False );
   5636                break;
   5637 
   5638             case 0xE8 ... 0xEF: /* FSUBR %st(?),%st(0) */
   5639                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, modrm - 0xE8, 0, False );
   5640                break;
   5641 
   5642             case 0xF0 ... 0xF7: /* FDIV %st(?),%st(0) */
   5643                fp_do_op_ST_ST ( "div", Iop_DivF64, modrm - 0xF0, 0, False );
   5644                break;
   5645 
   5646             case 0xF8 ... 0xFF: /* FDIVR %st(?),%st(0) */
   5647                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, modrm - 0xF8, 0, False );
   5648                break;
   5649 
   5650             default:
   5651                goto decode_fail;
   5652          }
   5653       }
   5654    }
   5655 
   5656    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD9 opcodes +-+-+-+-+-+-+-+ */
   5657    else
   5658    if (first_opcode == 0xD9) {
   5659       if (modrm < 0xC0) {
   5660 
   5661          /* bits 5,4,3 are an opcode extension, and the modRM also
   5662             specifies an address. */
   5663          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   5664          delta += len;
   5665 
   5666          switch (gregLO3ofRM(modrm)) {
   5667 
   5668             case 0: /* FLD single-real */
   5669                DIP("flds %s\n", dis_buf);
   5670                fp_push();
   5671                put_ST(0, unop(Iop_F32toF64,
   5672                               loadLE(Ity_F32, mkexpr(addr))));
   5673                break;
   5674 
   5675             case 2: /* FST single-real */
   5676                DIP("fsts %s\n", dis_buf);
   5677                storeLE(mkexpr(addr),
   5678                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
   5679                break;
   5680 
   5681             case 3: /* FSTP single-real */
   5682                DIP("fstps %s\n", dis_buf);
   5683                storeLE(mkexpr(addr),
   5684                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
   5685                fp_pop();
   5686                break;
   5687 
   5688             case 4: { /* FLDENV m28 */
   5689                /* Uses dirty helper:
   5690                      VexEmNote amd64g_do_FLDENV ( VexGuestX86State*, HWord ) */
   5691                IRTemp    ew = newTemp(Ity_I32);
   5692                IRTemp   w64 = newTemp(Ity_I64);
   5693                IRDirty*   d = unsafeIRDirty_0_N (
   5694                                  0/*regparms*/,
   5695                                  "amd64g_dirtyhelper_FLDENV",
   5696                                  &amd64g_dirtyhelper_FLDENV,
   5697                                  mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
   5698                               );
   5699                d->tmp       = w64;
   5700                /* declare we're reading memory */
   5701                d->mFx   = Ifx_Read;
   5702                d->mAddr = mkexpr(addr);
   5703                d->mSize = 28;
   5704 
   5705                /* declare we're writing guest state */
   5706                d->nFxState = 4;
   5707                vex_bzero(&d->fxState, sizeof(d->fxState));
   5708 
   5709                d->fxState[0].fx     = Ifx_Write;
   5710                d->fxState[0].offset = OFFB_FTOP;
   5711                d->fxState[0].size   = sizeof(UInt);
   5712 
   5713                d->fxState[1].fx     = Ifx_Write;
   5714                d->fxState[1].offset = OFFB_FPTAGS;
   5715                d->fxState[1].size   = 8 * sizeof(UChar);
   5716 
   5717                d->fxState[2].fx     = Ifx_Write;
   5718                d->fxState[2].offset = OFFB_FPROUND;
   5719                d->fxState[2].size   = sizeof(ULong);
   5720 
   5721                d->fxState[3].fx     = Ifx_Write;
   5722                d->fxState[3].offset = OFFB_FC3210;
   5723                d->fxState[3].size   = sizeof(ULong);
   5724 
   5725                stmt( IRStmt_Dirty(d) );
   5726 
   5727                /* ew contains any emulation warning we may need to
   5728                   issue.  If needed, side-exit to the next insn,
   5729                   reporting the warning, so that Valgrind's dispatcher
   5730                   sees the warning. */
   5731                assign(ew, unop(Iop_64to32,mkexpr(w64)) );
   5732                put_emwarn( mkexpr(ew) );
   5733                stmt(
   5734                   IRStmt_Exit(
   5735                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   5736                      Ijk_EmWarn,
   5737                      IRConst_U64( guest_RIP_bbstart+delta ),
   5738                      OFFB_RIP
   5739                   )
   5740                );
   5741 
   5742                DIP("fldenv %s\n", dis_buf);
   5743                break;
   5744             }
   5745 
   5746             case 5: {/* FLDCW */
   5747                /* The only thing we observe in the control word is the
   5748                   rounding mode.  Therefore, pass the 16-bit value
   5749                   (x87 native-format control word) to a clean helper,
   5750                   getting back a 64-bit value, the lower half of which
   5751                   is the FPROUND value to store, and the upper half of
   5752                   which is the emulation-warning token which may be
   5753                   generated.
   5754                */
   5755                /* ULong amd64h_check_fldcw ( ULong ); */
   5756                IRTemp t64 = newTemp(Ity_I64);
   5757                IRTemp ew = newTemp(Ity_I32);
   5758                DIP("fldcw %s\n", dis_buf);
   5759                assign( t64, mkIRExprCCall(
   5760                                Ity_I64, 0/*regparms*/,
   5761                                "amd64g_check_fldcw",
   5762                                &amd64g_check_fldcw,
   5763                                mkIRExprVec_1(
   5764                                   unop( Iop_16Uto64,
   5765                                         loadLE(Ity_I16, mkexpr(addr)))
   5766                                )
   5767                             )
   5768                      );
   5769 
   5770                put_fpround( unop(Iop_64to32, mkexpr(t64)) );
   5771                assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
   5772                put_emwarn( mkexpr(ew) );
   5773                /* Finally, if an emulation warning was reported,
   5774                   side-exit to the next insn, reporting the warning,
   5775                   so that Valgrind's dispatcher sees the warning. */
   5776                stmt(
   5777                   IRStmt_Exit(
   5778                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   5779                      Ijk_EmWarn,
   5780                      IRConst_U64( guest_RIP_bbstart+delta ),
   5781                      OFFB_RIP
   5782                   )
   5783                );
   5784                break;
   5785             }
   5786 
   5787             case 6: { /* FNSTENV m28 */
   5788                /* Uses dirty helper:
   5789                      void amd64g_do_FSTENV ( VexGuestAMD64State*, HWord ) */
   5790                IRDirty* d = unsafeIRDirty_0_N (
   5791                                0/*regparms*/,
   5792                                "amd64g_dirtyhelper_FSTENV",
   5793                                &amd64g_dirtyhelper_FSTENV,
   5794                                mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
   5795                             );
   5796                /* declare we're writing memory */
   5797                d->mFx   = Ifx_Write;
   5798                d->mAddr = mkexpr(addr);
   5799                d->mSize = 28;
   5800 
   5801                /* declare we're reading guest state */
   5802                d->nFxState = 4;
   5803                vex_bzero(&d->fxState, sizeof(d->fxState));
   5804 
   5805                d->fxState[0].fx     = Ifx_Read;
   5806                d->fxState[0].offset = OFFB_FTOP;
   5807                d->fxState[0].size   = sizeof(UInt);
   5808 
   5809                d->fxState[1].fx     = Ifx_Read;
   5810                d->fxState[1].offset = OFFB_FPTAGS;
   5811                d->fxState[1].size   = 8 * sizeof(UChar);
   5812 
   5813                d->fxState[2].fx     = Ifx_Read;
   5814                d->fxState[2].offset = OFFB_FPROUND;
   5815                d->fxState[2].size   = sizeof(ULong);
   5816 
   5817                d->fxState[3].fx     = Ifx_Read;
   5818                d->fxState[3].offset = OFFB_FC3210;
   5819                d->fxState[3].size   = sizeof(ULong);
   5820 
   5821                stmt( IRStmt_Dirty(d) );
   5822 
   5823                DIP("fnstenv %s\n", dis_buf);
   5824                break;
   5825             }
   5826 
   5827             case 7: /* FNSTCW */
   5828                /* Fake up a native x87 FPU control word.  The only
   5829                   thing it depends on is FPROUND[1:0], so call a clean
   5830                   helper to cook it up. */
   5831                /* ULong amd64g_create_fpucw ( ULong fpround ) */
   5832                DIP("fnstcw %s\n", dis_buf);
   5833                storeLE(
   5834                   mkexpr(addr),
   5835                   unop( Iop_64to16,
   5836                         mkIRExprCCall(
   5837                            Ity_I64, 0/*regp*/,
   5838                            "amd64g_create_fpucw", &amd64g_create_fpucw,
   5839                            mkIRExprVec_1( unop(Iop_32Uto64, get_fpround()) )
   5840                         )
   5841                   )
   5842                );
   5843                break;
   5844 
   5845             default:
   5846                vex_printf("unhandled opc_aux = 0x%2x\n",
   5847                           (UInt)gregLO3ofRM(modrm));
   5848                vex_printf("first_opcode == 0xD9\n");
   5849                goto decode_fail;
   5850          }
   5851 
   5852       } else {
   5853          delta++;
   5854          switch (modrm) {
   5855 
   5856             case 0xC0 ... 0xC7: /* FLD %st(?) */
   5857                r_src = (UInt)modrm - 0xC0;
   5858                DIP("fld %%st(%u)\n", r_src);
   5859                t1 = newTemp(Ity_F64);
   5860                assign(t1, get_ST(r_src));
   5861                fp_push();
   5862                put_ST(0, mkexpr(t1));
   5863                break;
   5864 
   5865             case 0xC8 ... 0xCF: /* FXCH %st(?) */
   5866                r_src = (UInt)modrm - 0xC8;
   5867                DIP("fxch %%st(%u)\n", r_src);
   5868                t1 = newTemp(Ity_F64);
   5869                t2 = newTemp(Ity_F64);
   5870                assign(t1, get_ST(0));
   5871                assign(t2, get_ST(r_src));
   5872                put_ST_UNCHECKED(0, mkexpr(t2));
   5873                put_ST_UNCHECKED(r_src, mkexpr(t1));
   5874                break;
   5875 
   5876             case 0xE0: /* FCHS */
   5877                DIP("fchs\n");
   5878                put_ST_UNCHECKED(0, unop(Iop_NegF64, get_ST(0)));
   5879                break;
   5880 
   5881             case 0xE1: /* FABS */
   5882                DIP("fabs\n");
   5883                put_ST_UNCHECKED(0, unop(Iop_AbsF64, get_ST(0)));
   5884                break;
   5885 
   5886             case 0xE5: { /* FXAM */
   5887                /* This is an interesting one.  It examines %st(0),
   5888                   regardless of whether the tag says it's empty or not.
   5889                   Here, just pass both the tag (in our format) and the
   5890                   value (as a double, actually a ULong) to a helper
   5891                   function. */
   5892                IRExpr** args
   5893                   = mkIRExprVec_2( unop(Iop_8Uto64, get_ST_TAG(0)),
   5894                                    unop(Iop_ReinterpF64asI64,
   5895                                         get_ST_UNCHECKED(0)) );
   5896                put_C3210(mkIRExprCCall(
   5897                             Ity_I64,
   5898                             0/*regparm*/,
   5899                             "amd64g_calculate_FXAM", &amd64g_calculate_FXAM,
   5900                             args
   5901                         ));
   5902                DIP("fxam\n");
   5903                break;
   5904             }
   5905 
   5906             case 0xE8: /* FLD1 */
   5907                DIP("fld1\n");
   5908                fp_push();
   5909                /* put_ST(0, IRExpr_Const(IRConst_F64(1.0))); */
   5910                put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff0000000000000ULL)));
   5911                break;
   5912 
   5913             case 0xE9: /* FLDL2T */
   5914                DIP("fldl2t\n");
   5915                fp_push();
   5916                /* put_ST(0, IRExpr_Const(IRConst_F64(3.32192809488736234781))); */
   5917                put_ST(0, IRExpr_Const(IRConst_F64i(0x400a934f0979a371ULL)));
   5918                break;
   5919 
   5920             case 0xEA: /* FLDL2E */
   5921                DIP("fldl2e\n");
   5922                fp_push();
   5923                /* put_ST(0, IRExpr_Const(IRConst_F64(1.44269504088896340739))); */
   5924                put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff71547652b82feULL)));
   5925                break;
   5926 
   5927             case 0xEB: /* FLDPI */
   5928                DIP("fldpi\n");
   5929                fp_push();
   5930                /* put_ST(0, IRExpr_Const(IRConst_F64(3.14159265358979323851))); */
   5931                put_ST(0, IRExpr_Const(IRConst_F64i(0x400921fb54442d18ULL)));
   5932                break;
   5933 
   5934             case 0xEC: /* FLDLG2 */
   5935                DIP("fldlg2\n");
   5936                fp_push();
   5937                /* put_ST(0, IRExpr_Const(IRConst_F64(0.301029995663981143))); */
   5938                put_ST(0, IRExpr_Const(IRConst_F64i(0x3fd34413509f79ffULL)));
   5939                break;
   5940 
   5941             case 0xED: /* FLDLN2 */
   5942                DIP("fldln2\n");
   5943                fp_push();
   5944                /* put_ST(0, IRExpr_Const(IRConst_F64(0.69314718055994530942))); */
   5945                put_ST(0, IRExpr_Const(IRConst_F64i(0x3fe62e42fefa39efULL)));
   5946                break;
   5947 
   5948             case 0xEE: /* FLDZ */
   5949                DIP("fldz\n");
   5950                fp_push();
   5951                /* put_ST(0, IRExpr_Const(IRConst_F64(0.0))); */
   5952                put_ST(0, IRExpr_Const(IRConst_F64i(0x0000000000000000ULL)));
   5953                break;
   5954 
   5955             case 0xF0: /* F2XM1 */
   5956                DIP("f2xm1\n");
   5957                put_ST_UNCHECKED(0,
   5958                   binop(Iop_2xm1F64,
   5959                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5960                         get_ST(0)));
   5961                break;
   5962 
   5963             case 0xF1: /* FYL2X */
   5964                DIP("fyl2x\n");
   5965                put_ST_UNCHECKED(1,
   5966                   triop(Iop_Yl2xF64,
   5967                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5968                         get_ST(1),
   5969                         get_ST(0)));
   5970                fp_pop();
   5971                break;
   5972 
   5973             case 0xF2: { /* FPTAN */
   5974                DIP("fptan\n");
   5975                IRTemp argD = newTemp(Ity_F64);
   5976                assign(argD, get_ST(0));
   5977                IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
   5978                IRTemp resD = newTemp(Ity_F64);
   5979                assign(resD,
   5980                   IRExpr_ITE(
   5981                      mkexpr(argOK),
   5982                      binop(Iop_TanF64,
   5983                            get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5984                            mkexpr(argD)),
   5985                      mkexpr(argD))
   5986                );
   5987                put_ST_UNCHECKED(0, mkexpr(resD));
   5988                /* Conditionally push 1.0 on the stack, if the arg is
   5989                   in range */
   5990                maybe_fp_push(argOK);
   5991                maybe_put_ST(argOK, 0,
   5992                             IRExpr_Const(IRConst_F64(1.0)));
   5993                set_C2( binop(Iop_Xor64,
   5994                              unop(Iop_1Uto64, mkexpr(argOK)),
   5995                              mkU64(1)) );
   5996                break;
   5997             }
   5998 
   5999             case 0xF3: /* FPATAN */
   6000                DIP("fpatan\n");
   6001                put_ST_UNCHECKED(1,
   6002                   triop(Iop_AtanF64,
   6003                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6004                         get_ST(1),
   6005                         get_ST(0)));
   6006                fp_pop();
   6007                break;
   6008 
   6009             case 0xF4: { /* FXTRACT */
   6010                IRTemp argF = newTemp(Ity_F64);
   6011                IRTemp sigF = newTemp(Ity_F64);
   6012                IRTemp expF = newTemp(Ity_F64);
   6013                IRTemp argI = newTemp(Ity_I64);
   6014                IRTemp sigI = newTemp(Ity_I64);
   6015                IRTemp expI = newTemp(Ity_I64);
   6016                DIP("fxtract\n");
   6017                assign( argF, get_ST(0) );
   6018                assign( argI, unop(Iop_ReinterpF64asI64, mkexpr(argF)));
   6019                assign( sigI,
   6020                        mkIRExprCCall(
   6021                           Ity_I64, 0/*regparms*/,
   6022                           "x86amd64g_calculate_FXTRACT",
   6023                           &x86amd64g_calculate_FXTRACT,
   6024                           mkIRExprVec_2( mkexpr(argI),
   6025                                          mkIRExpr_HWord(0)/*sig*/ ))
   6026                );
   6027                assign( expI,
   6028                        mkIRExprCCall(
   6029                           Ity_I64, 0/*regparms*/,
   6030                           "x86amd64g_calculate_FXTRACT",
   6031                           &x86amd64g_calculate_FXTRACT,
   6032                           mkIRExprVec_2( mkexpr(argI),
   6033                                          mkIRExpr_HWord(1)/*exp*/ ))
   6034                );
   6035                assign( sigF, unop(Iop_ReinterpI64asF64, mkexpr(sigI)) );
   6036                assign( expF, unop(Iop_ReinterpI64asF64, mkexpr(expI)) );
   6037                /* exponent */
   6038                put_ST_UNCHECKED(0, mkexpr(expF) );
   6039                fp_push();
   6040                /* significand */
   6041                put_ST(0, mkexpr(sigF) );
   6042                break;
   6043             }
   6044 
   6045             case 0xF5: { /* FPREM1 -- IEEE compliant */
   6046                IRTemp a1 = newTemp(Ity_F64);
   6047                IRTemp a2 = newTemp(Ity_F64);
   6048                DIP("fprem1\n");
   6049                /* Do FPREM1 twice, once to get the remainder, and once
   6050                   to get the C3210 flag values. */
   6051                assign( a1, get_ST(0) );
   6052                assign( a2, get_ST(1) );
   6053                put_ST_UNCHECKED(0,
   6054                   triop(Iop_PRem1F64,
   6055                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6056                         mkexpr(a1),
   6057                         mkexpr(a2)));
   6058                put_C3210(
   6059                   unop(Iop_32Uto64,
   6060                   triop(Iop_PRem1C3210F64,
   6061                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6062                         mkexpr(a1),
   6063                         mkexpr(a2)) ));
   6064                break;
   6065             }
   6066 
   6067             case 0xF7: /* FINCSTP */
   6068                DIP("fincstp\n");
   6069                put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
   6070                break;
   6071 
   6072             case 0xF8: { /* FPREM -- not IEEE compliant */
   6073                IRTemp a1 = newTemp(Ity_F64);
   6074                IRTemp a2 = newTemp(Ity_F64);
   6075                DIP("fprem\n");
   6076                /* Do FPREM twice, once to get the remainder, and once
   6077                   to get the C3210 flag values. */
   6078                assign( a1, get_ST(0) );
   6079                assign( a2, get_ST(1) );
   6080                put_ST_UNCHECKED(0,
   6081                   triop(Iop_PRemF64,
   6082                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6083                         mkexpr(a1),
   6084                         mkexpr(a2)));
   6085                put_C3210(
   6086                   unop(Iop_32Uto64,
   6087                   triop(Iop_PRemC3210F64,
   6088                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6089                         mkexpr(a1),
   6090                         mkexpr(a2)) ));
   6091                break;
   6092             }
   6093 
   6094             case 0xF9: /* FYL2XP1 */
   6095                DIP("fyl2xp1\n");
   6096                put_ST_UNCHECKED(1,
   6097                   triop(Iop_Yl2xp1F64,
   6098                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6099                         get_ST(1),
   6100                         get_ST(0)));
   6101                fp_pop();
   6102                break;
   6103 
   6104             case 0xFA: /* FSQRT */
   6105                DIP("fsqrt\n");
   6106                put_ST_UNCHECKED(0,
   6107                   binop(Iop_SqrtF64,
   6108                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6109                         get_ST(0)));
   6110                break;
   6111 
   6112             case 0xFB: { /* FSINCOS */
   6113                DIP("fsincos\n");
   6114                IRTemp argD = newTemp(Ity_F64);
   6115                assign(argD, get_ST(0));
   6116                IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
   6117                IRTemp resD = newTemp(Ity_F64);
   6118                assign(resD,
   6119                   IRExpr_ITE(
   6120                      mkexpr(argOK),
   6121                      binop(Iop_SinF64,
   6122                            get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6123                            mkexpr(argD)),
   6124                      mkexpr(argD))
   6125                );
   6126                put_ST_UNCHECKED(0, mkexpr(resD));
   6127                /* Conditionally push the cos value on the stack, if
   6128                   the arg is in range */
   6129                maybe_fp_push(argOK);
   6130                maybe_put_ST(argOK, 0,
   6131                   binop(Iop_CosF64,
   6132                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6133                         mkexpr(argD)));
   6134                set_C2( binop(Iop_Xor64,
   6135                              unop(Iop_1Uto64, mkexpr(argOK)),
   6136                              mkU64(1)) );
   6137                break;
   6138             }
   6139 
   6140             case 0xFC: /* FRNDINT */
   6141                DIP("frndint\n");
   6142                put_ST_UNCHECKED(0,
   6143                   binop(Iop_RoundF64toInt, get_roundingmode(), get_ST(0)) );
   6144                break;
   6145 
   6146             case 0xFD: /* FSCALE */
   6147                DIP("fscale\n");
   6148                put_ST_UNCHECKED(0,
   6149                   triop(Iop_ScaleF64,
   6150                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6151                         get_ST(0),
   6152                         get_ST(1)));
   6153                break;
   6154 
   6155             case 0xFE:   /* FSIN */
   6156             case 0xFF: { /* FCOS */
   6157                Bool isSIN = modrm == 0xFE;
   6158                DIP("%s\n", isSIN ? "fsin" : "fcos");
   6159                IRTemp argD = newTemp(Ity_F64);
   6160                assign(argD, get_ST(0));
   6161                IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
   6162                IRTemp resD = newTemp(Ity_F64);
   6163                assign(resD,
   6164                   IRExpr_ITE(
   6165                      mkexpr(argOK),
   6166                      binop(isSIN ? Iop_SinF64 : Iop_CosF64,
   6167                            get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6168                            mkexpr(argD)),
   6169                      mkexpr(argD))
   6170                );
   6171                put_ST_UNCHECKED(0, mkexpr(resD));
   6172                set_C2( binop(Iop_Xor64,
   6173                              unop(Iop_1Uto64, mkexpr(argOK)),
   6174                              mkU64(1)) );
   6175                break;
   6176             }
   6177 
   6178             default:
   6179                goto decode_fail;
   6180          }
   6181       }
   6182    }
   6183 
   6184    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDA opcodes +-+-+-+-+-+-+-+ */
   6185    else
   6186    if (first_opcode == 0xDA) {
   6187 
   6188       if (modrm < 0xC0) {
   6189 
   6190          /* bits 5,4,3 are an opcode extension, and the modRM also
   6191             specifies an address. */
   6192          IROp   fop;
   6193          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6194          delta += len;
   6195          switch (gregLO3ofRM(modrm)) {
   6196 
   6197             case 0: /* FIADD m32int */ /* ST(0) += m32int */
   6198                DIP("fiaddl %s\n", dis_buf);
   6199                fop = Iop_AddF64;
   6200                goto do_fop_m32;
   6201 
   6202             case 1: /* FIMUL m32int */ /* ST(0) *= m32int */
   6203                DIP("fimull %s\n", dis_buf);
   6204                fop = Iop_MulF64;
   6205                goto do_fop_m32;
   6206 
   6207             case 4: /* FISUB m32int */ /* ST(0) -= m32int */
   6208                DIP("fisubl %s\n", dis_buf);
   6209                fop = Iop_SubF64;
   6210                goto do_fop_m32;
   6211 
   6212             case 5: /* FISUBR m32int */ /* ST(0) = m32int - ST(0) */
   6213                DIP("fisubrl %s\n", dis_buf);
   6214                fop = Iop_SubF64;
   6215                goto do_foprev_m32;
   6216 
   6217             case 6: /* FIDIV m32int */ /* ST(0) /= m32int */
   6218                DIP("fisubl %s\n", dis_buf);
   6219                fop = Iop_DivF64;
   6220                goto do_fop_m32;
   6221 
   6222             case 7: /* FIDIVR m32int */ /* ST(0) = m32int / ST(0) */
   6223                DIP("fidivrl %s\n", dis_buf);
   6224                fop = Iop_DivF64;
   6225                goto do_foprev_m32;
   6226 
   6227             do_fop_m32:
   6228                put_ST_UNCHECKED(0,
   6229                   triop(fop,
   6230                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6231                         get_ST(0),
   6232                         unop(Iop_I32StoF64,
   6233                              loadLE(Ity_I32, mkexpr(addr)))));
   6234                break;
   6235 
   6236             do_foprev_m32:
   6237                put_ST_UNCHECKED(0,
   6238                   triop(fop,
   6239                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6240                         unop(Iop_I32StoF64,
   6241                              loadLE(Ity_I32, mkexpr(addr))),
   6242                         get_ST(0)));
   6243                break;
   6244 
   6245             default:
   6246                vex_printf("unhandled opc_aux = 0x%2x\n",
   6247                           (UInt)gregLO3ofRM(modrm));
   6248                vex_printf("first_opcode == 0xDA\n");
   6249                goto decode_fail;
   6250          }
   6251 
   6252       } else {
   6253 
   6254          delta++;
   6255          switch (modrm) {
   6256 
   6257             case 0xC0 ... 0xC7: /* FCMOVB ST(i), ST(0) */
   6258                r_src = (UInt)modrm - 0xC0;
   6259                DIP("fcmovb %%st(%u), %%st(0)\n", r_src);
   6260                put_ST_UNCHECKED(0,
   6261                                 IRExpr_ITE(
   6262                                     mk_amd64g_calculate_condition(AMD64CondB),
   6263                                     get_ST(r_src), get_ST(0)) );
   6264                break;
   6265 
   6266             case 0xC8 ... 0xCF: /* FCMOVE(Z) ST(i), ST(0) */
   6267                r_src = (UInt)modrm - 0xC8;
   6268                DIP("fcmovz %%st(%u), %%st(0)\n", r_src);
   6269                put_ST_UNCHECKED(0,
   6270                                 IRExpr_ITE(
   6271                                     mk_amd64g_calculate_condition(AMD64CondZ),
   6272                                     get_ST(r_src), get_ST(0)) );
   6273                break;
   6274 
   6275             case 0xD0 ... 0xD7: /* FCMOVBE ST(i), ST(0) */
   6276                r_src = (UInt)modrm - 0xD0;
   6277                DIP("fcmovbe %%st(%u), %%st(0)\n", r_src);
   6278                put_ST_UNCHECKED(0,
   6279                                 IRExpr_ITE(
   6280                                     mk_amd64g_calculate_condition(AMD64CondBE),
   6281                                     get_ST(r_src), get_ST(0)) );
   6282                break;
   6283 
   6284             case 0xD8 ... 0xDF: /* FCMOVU ST(i), ST(0) */
   6285                r_src = (UInt)modrm - 0xD8;
   6286                DIP("fcmovu %%st(%u), %%st(0)\n", r_src);
   6287                put_ST_UNCHECKED(0,
   6288                                 IRExpr_ITE(
   6289                                     mk_amd64g_calculate_condition(AMD64CondP),
   6290                                     get_ST(r_src), get_ST(0)) );
   6291                break;
   6292 
   6293             case 0xE9: /* FUCOMPP %st(0),%st(1) */
   6294                DIP("fucompp %%st(0),%%st(1)\n");
   6295                /* This forces C1 to zero, which isn't right. */
   6296                put_C3210(
   6297                    unop(Iop_32Uto64,
   6298                    binop( Iop_And32,
   6299                           binop(Iop_Shl32,
   6300                                 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
   6301                                 mkU8(8)),
   6302                           mkU32(0x4500)
   6303                    )));
   6304                fp_pop();
   6305                fp_pop();
   6306                break;
   6307 
   6308             default:
   6309                goto decode_fail;
   6310          }
   6311 
   6312       }
   6313    }
   6314 
   6315    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDB opcodes +-+-+-+-+-+-+-+ */
   6316    else
   6317    if (first_opcode == 0xDB) {
   6318       if (modrm < 0xC0) {
   6319 
   6320          /* bits 5,4,3 are an opcode extension, and the modRM also
   6321             specifies an address. */
   6322          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6323          delta += len;
   6324 
   6325          switch (gregLO3ofRM(modrm)) {
   6326 
   6327             case 0: /* FILD m32int */
   6328                DIP("fildl %s\n", dis_buf);
   6329                fp_push();
   6330                put_ST(0, unop(Iop_I32StoF64,
   6331                               loadLE(Ity_I32, mkexpr(addr))));
   6332                break;
   6333 
   6334             case 1: /* FISTTPL m32 (SSE3) */
   6335                DIP("fisttpl %s\n", dis_buf);
   6336                storeLE( mkexpr(addr),
   6337                         binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) );
   6338                fp_pop();
   6339                break;
   6340 
   6341             case 2: /* FIST m32 */
   6342                DIP("fistl %s\n", dis_buf);
   6343                storeLE( mkexpr(addr),
   6344                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
   6345                break;
   6346 
   6347             case 3: /* FISTP m32 */
   6348                DIP("fistpl %s\n", dis_buf);
   6349                storeLE( mkexpr(addr),
   6350                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
   6351                fp_pop();
   6352                break;
   6353 
   6354             case 5: { /* FLD extended-real */
   6355                /* Uses dirty helper:
   6356                      ULong amd64g_loadF80le ( ULong )
   6357                   addr holds the address.  First, do a dirty call to
   6358                   get hold of the data. */
   6359                IRTemp   val  = newTemp(Ity_I64);
   6360                IRExpr** args = mkIRExprVec_1 ( mkexpr(addr) );
   6361 
   6362                IRDirty* d = unsafeIRDirty_1_N (
   6363                                val,
   6364                                0/*regparms*/,
   6365                                "amd64g_dirtyhelper_loadF80le",
   6366                                &amd64g_dirtyhelper_loadF80le,
   6367                                args
   6368                             );
   6369                /* declare that we're reading memory */
   6370                d->mFx   = Ifx_Read;
   6371                d->mAddr = mkexpr(addr);
   6372                d->mSize = 10;
   6373 
   6374                /* execute the dirty call, dumping the result in val. */
   6375                stmt( IRStmt_Dirty(d) );
   6376                fp_push();
   6377                put_ST(0, unop(Iop_ReinterpI64asF64, mkexpr(val)));
   6378 
   6379                DIP("fldt %s\n", dis_buf);
   6380                break;
   6381             }
   6382 
   6383             case 7: { /* FSTP extended-real */
   6384                /* Uses dirty helper:
   6385                      void amd64g_storeF80le ( ULong addr, ULong data )
   6386                */
   6387                IRExpr** args
   6388                   = mkIRExprVec_2( mkexpr(addr),
   6389                                    unop(Iop_ReinterpF64asI64, get_ST(0)) );
   6390 
   6391                IRDirty* d = unsafeIRDirty_0_N (
   6392                                0/*regparms*/,
   6393                                "amd64g_dirtyhelper_storeF80le",
   6394                                &amd64g_dirtyhelper_storeF80le,
   6395                                args
   6396                             );
   6397                /* declare we're writing memory */
   6398                d->mFx   = Ifx_Write;
   6399                d->mAddr = mkexpr(addr);
   6400                d->mSize = 10;
   6401 
   6402                /* execute the dirty call. */
   6403                stmt( IRStmt_Dirty(d) );
   6404                fp_pop();
   6405 
   6406                DIP("fstpt\n %s", dis_buf);
   6407                break;
   6408             }
   6409 
   6410             default:
   6411                vex_printf("unhandled opc_aux = 0x%2x\n",
   6412                           (UInt)gregLO3ofRM(modrm));
   6413                vex_printf("first_opcode == 0xDB\n");
   6414                goto decode_fail;
   6415          }
   6416 
   6417       } else {
   6418 
   6419          delta++;
   6420          switch (modrm) {
   6421 
   6422             case 0xC0 ... 0xC7: /* FCMOVNB ST(i), ST(0) */
   6423                r_src = (UInt)modrm - 0xC0;
   6424                DIP("fcmovnb %%st(%u), %%st(0)\n", r_src);
   6425                put_ST_UNCHECKED(0,
   6426                                 IRExpr_ITE(
   6427                                     mk_amd64g_calculate_condition(AMD64CondNB),
   6428                                     get_ST(r_src), get_ST(0)) );
   6429                break;
   6430 
   6431             case 0xC8 ... 0xCF: /* FCMOVNE(NZ) ST(i), ST(0) */
   6432                r_src = (UInt)modrm - 0xC8;
   6433                DIP("fcmovnz %%st(%u), %%st(0)\n", r_src);
   6434                put_ST_UNCHECKED(
   6435                   0,
   6436                   IRExpr_ITE(
   6437                      mk_amd64g_calculate_condition(AMD64CondNZ),
   6438                      get_ST(r_src),
   6439                      get_ST(0)
   6440                   )
   6441                );
   6442                break;
   6443 
   6444             case 0xD0 ... 0xD7: /* FCMOVNBE ST(i), ST(0) */
   6445                r_src = (UInt)modrm - 0xD0;
   6446                DIP("fcmovnbe %%st(%u), %%st(0)\n", r_src);
   6447                put_ST_UNCHECKED(
   6448                   0,
   6449                   IRExpr_ITE(
   6450                      mk_amd64g_calculate_condition(AMD64CondNBE),
   6451                      get_ST(r_src),
   6452                      get_ST(0)
   6453                   )
   6454                );
   6455                break;
   6456 
   6457             case 0xD8 ... 0xDF: /* FCMOVNU ST(i), ST(0) */
   6458                r_src = (UInt)modrm - 0xD8;
   6459                DIP("fcmovnu %%st(%u), %%st(0)\n", r_src);
   6460                put_ST_UNCHECKED(
   6461                   0,
   6462                   IRExpr_ITE(
   6463                      mk_amd64g_calculate_condition(AMD64CondNP),
   6464                      get_ST(r_src),
   6465                      get_ST(0)
   6466                   )
   6467                );
   6468                break;
   6469 
   6470             case 0xE2:
   6471                DIP("fnclex\n");
   6472                break;
   6473 
   6474             case 0xE3: {
   6475                gen_FINIT_SEQUENCE(NULL/*no guarding condition*/);
   6476                DIP("fninit\n");
   6477                break;
   6478             }
   6479 
   6480             case 0xE8 ... 0xEF: /* FUCOMI %st(0),%st(?) */
   6481                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, False );
   6482                break;
   6483 
   6484             case 0xF0 ... 0xF7: /* FCOMI %st(0),%st(?) */
   6485                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, False );
   6486                break;
   6487 
   6488             default:
   6489                goto decode_fail;
   6490          }
   6491       }
   6492    }
   6493 
   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDC opcodes +-+-+-+-+-+-+-+ */
   /* 0xDC: x87 arithmetic with a 64-bit ("double-real") memory
      operand, or (modrm >= 0xC0) register-register forms operating on
      ST(0) and ST(i).  Memory forms dispatch through
      fp_do_op_mem_ST_0 / fp_do_oprev_mem_ST_0 with dbl=True
      (64-bit load); FCOM/FCOMP are handled inline. */
   else
   if (first_opcode == 0xDC) {
      if (modrm < 0xC0) {

         /* bits 5,4,3 are an opcode extension, and the modRM also
            specifies an address. */
         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
         delta += len;

         switch (gregLO3ofRM(modrm)) {

            case 0: /* FADD double-real */
               fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, True );
               break;

            case 1: /* FMUL double-real */
               fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, True );
               break;

            case 2: /* FCOM double-real */
               DIP("fcoml %s\n", dis_buf);
               /* This forces C1 to zero, which isn't right. */
               /* Shift the Iop_CmpF64 result left 8 and mask with
                  0x4500 to place it in the C3/C2/C0 condition-code
                  bits of the FPU status word. */
               put_C3210(
                   unop(Iop_32Uto64,
                   binop( Iop_And32,
                          binop(Iop_Shl32,
                                binop(Iop_CmpF64,
                                      get_ST(0),
                                      loadLE(Ity_F64,mkexpr(addr))),
                                mkU8(8)),
                          mkU32(0x4500)
                   )));
               break;

            case 3: /* FCOMP double-real */
               DIP("fcompl %s\n", dis_buf);
               /* This forces C1 to zero, which isn't right. */
               put_C3210(
                   unop(Iop_32Uto64,
                   binop( Iop_And32,
                          binop(Iop_Shl32,
                                binop(Iop_CmpF64,
                                      get_ST(0),
                                      loadLE(Ity_F64,mkexpr(addr))),
                                mkU8(8)),
                          mkU32(0x4500)
                   )));
               fp_pop();
               break;

            case 4: /* FSUB double-real */
               fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, True );
               break;

            case 5: /* FSUBR double-real */
               fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, True );
               break;

            case 6: /* FDIV double-real */
               fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, True );
               break;

            case 7: /* FDIVR double-real */
               fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, True );
               break;

            default:
               vex_printf("unhandled opc_aux = 0x%2x\n",
                          (UInt)gregLO3ofRM(modrm));
               vex_printf("first_opcode == 0xDC\n");
               goto decode_fail;
         }

      } else {

         /* Register forms: note the destination/source index pairs
            passed to the helpers (0, modrm-base); the trailing Bool
            is False here -- presumably the "pop" flag, cf. the 0xDE
            (FADDP etc.) cases which pass True.  TODO confirm. */
         delta++;
         switch (modrm) {

            case 0xC0 ... 0xC7: /* FADD %st(0),%st(?) */
               fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, False );
               break;

            case 0xC8 ... 0xCF: /* FMUL %st(0),%st(?) */
               fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, False );
               break;

            case 0xE0 ... 0xE7: /* FSUBR %st(0),%st(?) */
               fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0, modrm - 0xE0, False );
               break;

            case 0xE8 ... 0xEF: /* FSUB %st(0),%st(?) */
               fp_do_op_ST_ST ( "sub", Iop_SubF64, 0, modrm - 0xE8, False );
               break;

            case 0xF0 ... 0xF7: /* FDIVR %st(0),%st(?) */
               fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, False );
               break;

            case 0xF8 ... 0xFF: /* FDIV %st(0),%st(?) */
               fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, False );
               break;

            default:
               goto decode_fail;
         }

      }
   }
   6603 
   6604    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDD opcodes +-+-+-+-+-+-+-+ */
   6605    else
   6606    if (first_opcode == 0xDD) {
   6607 
   6608       if (modrm < 0xC0) {
   6609 
   6610          /* bits 5,4,3 are an opcode extension, and the modRM also
   6611             specifies an address. */
   6612          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6613          delta += len;
   6614 
   6615          switch (gregLO3ofRM(modrm)) {
   6616 
   6617             case 0: /* FLD double-real */
   6618                DIP("fldl %s\n", dis_buf);
   6619                fp_push();
   6620                put_ST(0, loadLE(Ity_F64, mkexpr(addr)));
   6621                break;
   6622 
   6623             case 1: /* FISTTPQ m64 (SSE3) */
   6624                DIP("fistppll %s\n", dis_buf);
   6625                storeLE( mkexpr(addr),
   6626                         binop(Iop_F64toI64S, mkU32(Irrm_ZERO), get_ST(0)) );
   6627                fp_pop();
   6628                break;
   6629 
   6630             case 2: /* FST double-real */
   6631                DIP("fstl %s\n", dis_buf);
   6632                storeLE(mkexpr(addr), get_ST(0));
   6633                break;
   6634 
   6635             case 3: /* FSTP double-real */
   6636                DIP("fstpl %s\n", dis_buf);
   6637                storeLE(mkexpr(addr), get_ST(0));
   6638                fp_pop();
   6639                break;
   6640 
   6641             case 4: { /* FRSTOR m94/m108 */
   6642                IRTemp   ew = newTemp(Ity_I32);
   6643                IRTemp  w64 = newTemp(Ity_I64);
   6644                IRDirty*  d;
   6645                if ( have66(pfx) ) {
   6646                   /* Uses dirty helper:
   6647                      VexEmNote amd64g_dirtyhelper_FRSTORS
   6648                                   ( VexGuestAMD64State*, HWord ) */
   6649                   d = unsafeIRDirty_0_N (
   6650                          0/*regparms*/,
   6651                          "amd64g_dirtyhelper_FRSTORS",
   6652                          &amd64g_dirtyhelper_FRSTORS,
   6653                          mkIRExprVec_1( mkexpr(addr) )
   6654                       );
   6655                   d->mSize = 94;
   6656                } else {
   6657                   /* Uses dirty helper:
   6658                      VexEmNote amd64g_dirtyhelper_FRSTOR
   6659                                   ( VexGuestAMD64State*, HWord ) */
   6660                   d = unsafeIRDirty_0_N (
   6661                          0/*regparms*/,
   6662                          "amd64g_dirtyhelper_FRSTOR",
   6663                          &amd64g_dirtyhelper_FRSTOR,
   6664                          mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
   6665                       );
   6666                   d->mSize = 108;
   6667                }
   6668 
   6669                d->tmp    = w64;
   6670                /* declare we're reading memory */
   6671                d->mFx   = Ifx_Read;
   6672                d->mAddr = mkexpr(addr);
   6673                /* d->mSize set above */
   6674 
   6675                /* declare we're writing guest state */
   6676                d->nFxState = 5;
   6677                vex_bzero(&d->fxState, sizeof(d->fxState));
   6678 
   6679                d->fxState[0].fx     = Ifx_Write;
   6680                d->fxState[0].offset = OFFB_FTOP;
   6681                d->fxState[0].size   = sizeof(UInt);
   6682 
   6683                d->fxState[1].fx     = Ifx_Write;
   6684                d->fxState[1].offset = OFFB_FPREGS;
   6685                d->fxState[1].size   = 8 * sizeof(ULong);
   6686 
   6687                d->fxState[2].fx     = Ifx_Write;
   6688                d->fxState[2].offset = OFFB_FPTAGS;
   6689                d->fxState[2].size   = 8 * sizeof(UChar);
   6690 
   6691                d->fxState[3].fx     = Ifx_Write;
   6692                d->fxState[3].offset = OFFB_FPROUND;
   6693                d->fxState[3].size   = sizeof(ULong);
   6694 
   6695                d->fxState[4].fx     = Ifx_Write;
   6696                d->fxState[4].offset = OFFB_FC3210;
   6697                d->fxState[4].size   = sizeof(ULong);
   6698 
   6699                stmt( IRStmt_Dirty(d) );
   6700 
   6701                /* ew contains any emulation warning we may need to
   6702                   issue.  If needed, side-exit to the next insn,
   6703                   reporting the warning, so that Valgrind's dispatcher
   6704                   sees the warning. */
   6705                assign(ew, unop(Iop_64to32,mkexpr(w64)) );
   6706                put_emwarn( mkexpr(ew) );
   6707                stmt(
   6708                   IRStmt_Exit(
   6709                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   6710                      Ijk_EmWarn,
   6711                      IRConst_U64( guest_RIP_bbstart+delta ),
   6712                      OFFB_RIP
   6713                   )
   6714                );
   6715 
   6716                if ( have66(pfx) ) {
   6717                   DIP("frstors %s\n", dis_buf);
   6718                } else {
   6719                   DIP("frstor %s\n", dis_buf);
   6720                }
   6721                break;
   6722             }
   6723 
   6724             case 6: { /* FNSAVE m94/m108 */
   6725                IRDirty *d;
   6726                if ( have66(pfx) ) {
   6727                  /* Uses dirty helper:
   6728                     void amd64g_dirtyhelper_FNSAVES ( VexGuestAMD64State*,
   6729                                                       HWord ) */
   6730                   d = unsafeIRDirty_0_N (
   6731                          0/*regparms*/,
   6732                          "amd64g_dirtyhelper_FNSAVES",
   6733                          &amd64g_dirtyhelper_FNSAVES,
   6734                          mkIRExprVec_1( mkexpr(addr) )
   6735                          );
   6736                   d->mSize = 94;
   6737                } else {
   6738                  /* Uses dirty helper:
   6739                     void amd64g_dirtyhelper_FNSAVE ( VexGuestAMD64State*,
   6740                                                      HWord ) */
   6741                   d = unsafeIRDirty_0_N (
   6742                          0/*regparms*/,
   6743                          "amd64g_dirtyhelper_FNSAVE",
   6744                          &amd64g_dirtyhelper_FNSAVE,
   6745                          mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
   6746                       );
   6747                   d->mSize = 108;
   6748                }
   6749 
   6750                /* declare we're writing memory */
   6751                d->mFx   = Ifx_Write;
   6752                d->mAddr = mkexpr(addr);
   6753                /* d->mSize set above */
   6754 
   6755                /* declare we're reading guest state */
   6756                d->nFxState = 5;
   6757                vex_bzero(&d->fxState, sizeof(d->fxState));
   6758 
   6759                d->fxState[0].fx     = Ifx_Read;
   6760                d->fxState[0].offset = OFFB_FTOP;
   6761                d->fxState[0].size   = sizeof(UInt);
   6762 
   6763                d->fxState[1].fx     = Ifx_Read;
   6764                d->fxState[1].offset = OFFB_FPREGS;
   6765                d->fxState[1].size   = 8 * sizeof(ULong);
   6766 
   6767                d->fxState[2].fx     = Ifx_Read;
   6768                d->fxState[2].offset = OFFB_FPTAGS;
   6769                d->fxState[2].size   = 8 * sizeof(UChar);
   6770 
   6771                d->fxState[3].fx     = Ifx_Read;
   6772                d->fxState[3].offset = OFFB_FPROUND;
   6773                d->fxState[3].size   = sizeof(ULong);
   6774 
   6775                d->fxState[4].fx     = Ifx_Read;
   6776                d->fxState[4].offset = OFFB_FC3210;
   6777                d->fxState[4].size   = sizeof(ULong);
   6778 
   6779                stmt( IRStmt_Dirty(d) );
   6780 
   6781                if ( have66(pfx) ) {
   6782                  DIP("fnsaves %s\n", dis_buf);
   6783                } else {
   6784                  DIP("fnsave %s\n", dis_buf);
   6785                }
   6786                break;
   6787             }
   6788 
   6789             case 7: { /* FNSTSW m16 */
   6790                IRExpr* sw = get_FPU_sw();
   6791                vassert(typeOfIRExpr(irsb->tyenv, sw) == Ity_I16);
   6792                storeLE( mkexpr(addr), sw );
   6793                DIP("fnstsw %s\n", dis_buf);
   6794                break;
   6795             }
   6796 
   6797             default:
   6798                vex_printf("unhandled opc_aux = 0x%2x\n",
   6799                           (UInt)gregLO3ofRM(modrm));
   6800                vex_printf("first_opcode == 0xDD\n");
   6801                goto decode_fail;
   6802          }
   6803       } else {
   6804          delta++;
   6805          switch (modrm) {
   6806 
   6807             case 0xC0 ... 0xC7: /* FFREE %st(?) */
   6808                r_dst = (UInt)modrm - 0xC0;
   6809                DIP("ffree %%st(%u)\n", r_dst);
   6810                put_ST_TAG ( r_dst, mkU8(0) );
   6811                break;
   6812 
   6813             case 0xD0 ... 0xD7: /* FST %st(0),%st(?) */
   6814                r_dst = (UInt)modrm - 0xD0;
   6815                DIP("fst %%st(0),%%st(%u)\n", r_dst);
   6816                /* P4 manual says: "If the destination operand is a
   6817                   non-empty register, the invalid-operation exception
   6818                   is not generated.  Hence put_ST_UNCHECKED. */
   6819                put_ST_UNCHECKED(r_dst, get_ST(0));
   6820                break;
   6821 
   6822             case 0xD8 ... 0xDF: /* FSTP %st(0),%st(?) */
   6823                r_dst = (UInt)modrm - 0xD8;
   6824                DIP("fstp %%st(0),%%st(%u)\n", r_dst);
   6825                /* P4 manual says: "If the destination operand is a
   6826                   non-empty register, the invalid-operation exception
   6827                   is not generated.  Hence put_ST_UNCHECKED. */
   6828                put_ST_UNCHECKED(r_dst, get_ST(0));
   6829                fp_pop();
   6830                break;
   6831 
   6832             case 0xE0 ... 0xE7: /* FUCOM %st(0),%st(?) */
   6833                r_dst = (UInt)modrm - 0xE0;
   6834                DIP("fucom %%st(0),%%st(%u)\n", r_dst);
   6835                /* This forces C1 to zero, which isn't right. */
   6836                put_C3210(
   6837                    unop(Iop_32Uto64,
   6838                    binop( Iop_And32,
   6839                           binop(Iop_Shl32,
   6840                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   6841                                 mkU8(8)),
   6842                           mkU32(0x4500)
   6843                    )));
   6844                break;
   6845 
   6846             case 0xE8 ... 0xEF: /* FUCOMP %st(0),%st(?) */
   6847                r_dst = (UInt)modrm - 0xE8;
   6848                DIP("fucomp %%st(0),%%st(%u)\n", r_dst);
   6849                /* This forces C1 to zero, which isn't right. */
   6850                put_C3210(
   6851                    unop(Iop_32Uto64,
   6852                    binop( Iop_And32,
   6853                           binop(Iop_Shl32,
   6854                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   6855                                 mkU8(8)),
   6856                           mkU32(0x4500)
   6857                    )));
   6858                fp_pop();
   6859                break;
   6860 
   6861             default:
   6862                goto decode_fail;
   6863          }
   6864       }
   6865    }
   6866 
   6867    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDE opcodes +-+-+-+-+-+-+-+ */
   6868    else
   6869    if (first_opcode == 0xDE) {
   6870 
   6871       if (modrm < 0xC0) {
   6872 
   6873          /* bits 5,4,3 are an opcode extension, and the modRM also
   6874             specifies an address. */
   6875          IROp   fop;
   6876          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6877          delta += len;
   6878 
   6879          switch (gregLO3ofRM(modrm)) {
   6880 
   6881             case 0: /* FIADD m16int */ /* ST(0) += m16int */
   6882                DIP("fiaddw %s\n", dis_buf);
   6883                fop = Iop_AddF64;
   6884                goto do_fop_m16;
   6885 
   6886             case 1: /* FIMUL m16int */ /* ST(0) *= m16int */
   6887                DIP("fimulw %s\n", dis_buf);
   6888                fop = Iop_MulF64;
   6889                goto do_fop_m16;
   6890 
   6891             case 4: /* FISUB m16int */ /* ST(0) -= m16int */
   6892                DIP("fisubw %s\n", dis_buf);
   6893                fop = Iop_SubF64;
   6894                goto do_fop_m16;
   6895 
   6896             case 5: /* FISUBR m16int */ /* ST(0) = m16int - ST(0) */
   6897                DIP("fisubrw %s\n", dis_buf);
   6898                fop = Iop_SubF64;
   6899                goto do_foprev_m16;
   6900 
   6901             case 6: /* FIDIV m16int */ /* ST(0) /= m16int */
   6902                DIP("fisubw %s\n", dis_buf);
   6903                fop = Iop_DivF64;
   6904                goto do_fop_m16;
   6905 
   6906             case 7: /* FIDIVR m16int */ /* ST(0) = m16int / ST(0) */
   6907                DIP("fidivrw %s\n", dis_buf);
   6908                fop = Iop_DivF64;
   6909                goto do_foprev_m16;
   6910 
   6911             do_fop_m16:
   6912                put_ST_UNCHECKED(0,
   6913                   triop(fop,
   6914                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6915                         get_ST(0),
   6916                         unop(Iop_I32StoF64,
   6917                              unop(Iop_16Sto32,
   6918                                   loadLE(Ity_I16, mkexpr(addr))))));
   6919                break;
   6920 
   6921             do_foprev_m16:
   6922                put_ST_UNCHECKED(0,
   6923                   triop(fop,
   6924                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6925                         unop(Iop_I32StoF64,
   6926                              unop(Iop_16Sto32,
   6927                                   loadLE(Ity_I16, mkexpr(addr)))),
   6928                         get_ST(0)));
   6929                break;
   6930 
   6931             default:
   6932                vex_printf("unhandled opc_aux = 0x%2x\n",
   6933                           (UInt)gregLO3ofRM(modrm));
   6934                vex_printf("first_opcode == 0xDE\n");
   6935                goto decode_fail;
   6936          }
   6937 
   6938       } else {
   6939 
   6940          delta++;
   6941          switch (modrm) {
   6942 
   6943             case 0xC0 ... 0xC7: /* FADDP %st(0),%st(?) */
   6944                fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, True );
   6945                break;
   6946 
   6947             case 0xC8 ... 0xCF: /* FMULP %st(0),%st(?) */
   6948                fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, True );
   6949                break;
   6950 
   6951             case 0xD9: /* FCOMPP %st(0),%st(1) */
   6952                DIP("fcompp %%st(0),%%st(1)\n");
   6953                /* This forces C1 to zero, which isn't right. */
   6954                put_C3210(
   6955                    unop(Iop_32Uto64,
   6956                    binop( Iop_And32,
   6957                           binop(Iop_Shl32,
   6958                                 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
   6959                                 mkU8(8)),
   6960                           mkU32(0x4500)
   6961                    )));
   6962                fp_pop();
   6963                fp_pop();
   6964                break;
   6965 
   6966             case 0xE0 ... 0xE7: /* FSUBRP %st(0),%st(?) */
   6967                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0,  modrm - 0xE0, True );
   6968                break;
   6969 
   6970             case 0xE8 ... 0xEF: /* FSUBP %st(0),%st(?) */
   6971                fp_do_op_ST_ST ( "sub", Iop_SubF64, 0,  modrm - 0xE8, True );
   6972                break;
   6973 
   6974             case 0xF0 ... 0xF7: /* FDIVRP %st(0),%st(?) */
   6975                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, True );
   6976                break;
   6977 
   6978             case 0xF8 ... 0xFF: /* FDIVP %st(0),%st(?) */
   6979                fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, True );
   6980                break;
   6981 
   6982             default:
   6983                goto decode_fail;
   6984          }
   6985 
   6986       }
   6987    }
   6988 
   6989    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDF opcodes +-+-+-+-+-+-+-+ */
   6990    else
   6991    if (first_opcode == 0xDF) {
   6992 
   6993       if (modrm < 0xC0) {
   6994 
   6995          /* bits 5,4,3 are an opcode extension, and the modRM also
   6996             specifies an address. */
   6997          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6998          delta += len;
   6999 
   7000          switch (gregLO3ofRM(modrm)) {
   7001 
   7002             case 0: /* FILD m16int */
   7003                DIP("fildw %s\n", dis_buf);
   7004                fp_push();
   7005                put_ST(0, unop(Iop_I32StoF64,
   7006                               unop(Iop_16Sto32,
   7007                                    loadLE(Ity_I16, mkexpr(addr)))));
   7008                break;
   7009 
   7010             case 1: /* FISTTPS m16 (SSE3) */
   7011                DIP("fisttps %s\n", dis_buf);
   7012                storeLE( mkexpr(addr),
   7013                         x87ishly_qnarrow_32_to_16(
   7014                         binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) ));
   7015                fp_pop();
   7016                break;
   7017 
   7018             case 2: /* FIST m16 */
   7019                DIP("fists %s\n", dis_buf);
   7020                storeLE( mkexpr(addr),
   7021                         x87ishly_qnarrow_32_to_16(
   7022                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) ));
   7023                break;
   7024 
   7025             case 3: /* FISTP m16 */
   7026                DIP("fistps %s\n", dis_buf);
   7027                storeLE( mkexpr(addr),
   7028                         x87ishly_qnarrow_32_to_16(
   7029                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) ));
   7030                fp_pop();
   7031                break;
   7032 
   7033             case 5: /* FILD m64 */
   7034                DIP("fildll %s\n", dis_buf);
   7035                fp_push();
   7036                put_ST(0, binop(Iop_I64StoF64,
   7037                                get_roundingmode(),
   7038                                loadLE(Ity_I64, mkexpr(addr))));
   7039                break;
   7040 
   7041             case 7: /* FISTP m64 */
   7042                DIP("fistpll %s\n", dis_buf);
   7043                storeLE( mkexpr(addr),
   7044                         binop(Iop_F64toI64S, get_roundingmode(), get_ST(0)) );
   7045                fp_pop();
   7046                break;
   7047 
   7048             default:
   7049                vex_printf("unhandled opc_aux = 0x%2x\n",
   7050                           (UInt)gregLO3ofRM(modrm));
   7051                vex_printf("first_opcode == 0xDF\n");
   7052                goto decode_fail;
   7053          }
   7054 
   7055       } else {
   7056 
   7057          delta++;
   7058          switch (modrm) {
   7059 
   7060             case 0xC0: /* FFREEP %st(0) */
   7061                DIP("ffreep %%st(%d)\n", 0);
   7062                put_ST_TAG ( 0, mkU8(0) );
   7063                fp_pop();
   7064                break;
   7065 
   7066             case 0xE0: /* FNSTSW %ax */
   7067                DIP("fnstsw %%ax\n");
   7068                /* Invent a plausible-looking FPU status word value and
   7069                   dump it in %AX:
   7070                      ((ftop & 7) << 11) | (c3210 & 0x4700)
   7071                */
   7072                putIRegRAX(
   7073                   2,
   7074                   unop(Iop_32to16,
   7075                        binop(Iop_Or32,
   7076                              binop(Iop_Shl32,
   7077                                    binop(Iop_And32, get_ftop(), mkU32(7)),
   7078                                    mkU8(11)),
   7079                              binop(Iop_And32,
   7080                                    unop(Iop_64to32, get_C3210()),
   7081                                    mkU32(0x4700))
   7082                )));
   7083                break;
   7084 
   7085             case 0xE8 ... 0xEF: /* FUCOMIP %st(0),%st(?) */
   7086                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, True );
   7087                break;
   7088 
   7089             case 0xF0 ... 0xF7: /* FCOMIP %st(0),%st(?) */
   7090                /* not really right since COMIP != UCOMIP */
   7091                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, True );
   7092                break;
   7093 
   7094             default:
   7095                goto decode_fail;
   7096          }
   7097       }
   7098 
   7099    }
   7100 
   7101    else
   7102       goto decode_fail;
   7103 
   7104    *decode_ok = True;
   7105    return delta;
   7106 
   7107   decode_fail:
   7108    *decode_ok = False;
   7109    return delta;
   7110 }
   7111 
   7112 
   7113 /*------------------------------------------------------------*/
   7114 /*---                                                      ---*/
   7115 /*--- MMX INSTRUCTIONS                                     ---*/
   7116 /*---                                                      ---*/
   7117 /*------------------------------------------------------------*/
   7118 
   7119 /* Effect of MMX insns on x87 FPU state (table 11-2 of
   7120    IA32 arch manual, volume 3):
   7121 
   7122    Read from, or write to MMX register (viz, any insn except EMMS):
   7123    * All tags set to Valid (non-empty) -- FPTAGS[i] := nonzero
   7124    * FP stack pointer set to zero
   7125 
   7126    EMMS:
   7127    * All tags set to Invalid (empty) -- FPTAGS[i] := zero
   7128    * FP stack pointer set to zero
   7129 */
   7130 
   7131 static void do_MMX_preamble ( void )
   7132 {
   7133    Int         i;
   7134    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   7135    IRExpr*     zero  = mkU32(0);
   7136    IRExpr*     tag1  = mkU8(1);
   7137    put_ftop(zero);
   7138    for (i = 0; i < 8; i++)
   7139       stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag1) ) );
   7140 }
   7141 
   7142 static void do_EMMS_preamble ( void )
   7143 {
   7144    Int         i;
   7145    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   7146    IRExpr*     zero  = mkU32(0);
   7147    IRExpr*     tag0  = mkU8(0);
   7148    put_ftop(zero);
   7149    for (i = 0; i < 8; i++)
   7150       stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag0) ) );
   7151 }
   7152 
   7153 
   7154 static IRExpr* getMMXReg ( UInt archreg )
   7155 {
   7156    vassert(archreg < 8);
   7157    return IRExpr_Get( OFFB_FPREGS + 8 * archreg, Ity_I64 );
   7158 }
   7159 
   7160 
   7161 static void putMMXReg ( UInt archreg, IRExpr* e )
   7162 {
   7163    vassert(archreg < 8);
   7164    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   7165    stmt( IRStmt_Put( OFFB_FPREGS + 8 * archreg, e ) );
   7166 }
   7167 
   7168 
   7169 /* Helper for non-shift MMX insns.  Note this is incomplete in the
   7170    sense that it does not first call do_MMX_preamble() -- that is the
   7171    responsibility of its caller. */
   7172 
/* Common decoder for two-operand MMX insns of the form
      op (src)mmxreg-or-mem (E), (dst)mmxreg (G)
   'opc' selects the operation: most opcodes map directly onto a
   64-bit vector IROp, but PMADDWD (0xF5) and PSADBW (0xF6) are
   instead routed to clean helper functions via an IR call.  Returns
   the updated instruction offset ('delta' advanced past the modRM
   and any address bytes). */
static
ULong dis_MMXop_regmem_to_reg ( const VexAbiInfo* vbi,
                                Prefix      pfx,
                                Long        delta,
                                UChar       opc,
                                const HChar* name,
                                Bool        show_granularity )
{
   HChar   dis_buf[50];
   UChar   modrm = getUChar(delta);
   Bool    isReg = epartIsReg(modrm);
   IRExpr* argL  = NULL;
   IRExpr* argR  = NULL;
   IRExpr* argG  = NULL;
   IRExpr* argE  = NULL;
   IRTemp  res   = newTemp(Ity_I64);

   Bool    invG  = False;        /* if True, complement G before use (PANDN) */
   IROp    op    = Iop_INVALID;  /* vector IROp, when one suffices */
   void*   hAddr = NULL;         /* else: address of a clean helper fn ... */
   const HChar*  hName = NULL;   /* ... and its name, for the IR call */
   Bool    eLeft = False;        /* if True, E is the left operand */

   /* Route this opcode to clean helper _name instead of an IROp. */
#  define XXX(_name) do { hAddr = &_name; hName = #_name; } while (0)

   switch (opc) {
      /* Original MMX ones */
      case 0xFC: op = Iop_Add8x8; break;
      case 0xFD: op = Iop_Add16x4; break;
      case 0xFE: op = Iop_Add32x2; break;

      case 0xEC: op = Iop_QAdd8Sx8; break;
      case 0xED: op = Iop_QAdd16Sx4; break;

      case 0xDC: op = Iop_QAdd8Ux8; break;
      case 0xDD: op = Iop_QAdd16Ux4; break;

      case 0xF8: op = Iop_Sub8x8;  break;
      case 0xF9: op = Iop_Sub16x4; break;
      case 0xFA: op = Iop_Sub32x2; break;

      case 0xE8: op = Iop_QSub8Sx8; break;
      case 0xE9: op = Iop_QSub16Sx4; break;

      case 0xD8: op = Iop_QSub8Ux8; break;
      case 0xD9: op = Iop_QSub16Ux4; break;

      case 0xE5: op = Iop_MulHi16Sx4; break;
      case 0xD5: op = Iop_Mul16x4; break;
      case 0xF5: XXX(amd64g_calculate_mmx_pmaddwd); break;

      case 0x74: op = Iop_CmpEQ8x8; break;
      case 0x75: op = Iop_CmpEQ16x4; break;
      case 0x76: op = Iop_CmpEQ32x2; break;

      case 0x64: op = Iop_CmpGT8Sx8; break;
      case 0x65: op = Iop_CmpGT16Sx4; break;
      case 0x66: op = Iop_CmpGT32Sx2; break;

      /* Pack/unpack insns take E as the left operand. */
      case 0x6B: op = Iop_QNarrowBin32Sto16Sx4; eLeft = True; break;
      case 0x63: op = Iop_QNarrowBin16Sto8Sx8;  eLeft = True; break;
      case 0x67: op = Iop_QNarrowBin16Sto8Ux8;  eLeft = True; break;

      case 0x68: op = Iop_InterleaveHI8x8;  eLeft = True; break;
      case 0x69: op = Iop_InterleaveHI16x4; eLeft = True; break;
      case 0x6A: op = Iop_InterleaveHI32x2; eLeft = True; break;

      case 0x60: op = Iop_InterleaveLO8x8;  eLeft = True; break;
      case 0x61: op = Iop_InterleaveLO16x4; eLeft = True; break;
      case 0x62: op = Iop_InterleaveLO32x2; eLeft = True; break;

      case 0xDB: op = Iop_And64; break;
      case 0xDF: op = Iop_And64; invG = True; break;
      case 0xEB: op = Iop_Or64; break;
      case 0xEF: /* Possibly do better here if argL and argR are the
                    same reg */
                 op = Iop_Xor64; break;

      /* Introduced in SSE1 */
      case 0xE0: op = Iop_Avg8Ux8;    break;
      case 0xE3: op = Iop_Avg16Ux4;   break;
      case 0xEE: op = Iop_Max16Sx4;   break;
      case 0xDE: op = Iop_Max8Ux8;    break;
      case 0xEA: op = Iop_Min16Sx4;   break;
      case 0xDA: op = Iop_Min8Ux8;    break;
      case 0xE4: op = Iop_MulHi16Ux4; break;
      case 0xF6: XXX(amd64g_calculate_mmx_psadbw); break;

      /* Introduced in SSE2 */
      case 0xD4: op = Iop_Add64; break;
      case 0xFB: op = Iop_Sub64; break;

      default:
         vex_printf("\n0x%x\n", (UInt)opc);
         vpanic("dis_MMXop_regmem_to_reg");
   }

#  undef XXX

   /* Fetch G (the destination register), complementing it if this is
      PANDN. */
   argG = getMMXReg(gregLO3ofRM(modrm));
   if (invG)
      argG = unop(Iop_Not64, argG);

   /* Fetch E: either a register or a 64-bit load. */
   if (isReg) {
      delta++;
      argE = getMMXReg(eregLO3ofRM(modrm));
   } else {
      Int    len;
      IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
      delta += len;
      argE = loadLE(Ity_I64, mkexpr(addr));
   }

   /* Put the operands in the order the insn semantics require. */
   if (eLeft) {
      argL = argE;
      argR = argG;
   } else {
      argL = argG;
      argR = argE;
   }

   /* Either apply the IROp, or call the clean helper -- exactly one
      of the two routes was set up by the switch above. */
   if (op != Iop_INVALID) {
      vassert(hName == NULL);
      vassert(hAddr == NULL);
      assign(res, binop(op, argL, argR));
   } else {
      vassert(hName != NULL);
      vassert(hAddr != NULL);
      assign( res,
              mkIRExprCCall(
                 Ity_I64,
                 0/*regparms*/, hName, hAddr,
                 mkIRExprVec_2( argL, argR )
              )
            );
   }

   putMMXReg( gregLO3ofRM(modrm), mkexpr(res) );

   DIP("%s%s %s, %s\n",
       name, show_granularity ? nameMMXGran(opc & 3) : "",
       ( isReg ? nameMMXReg(eregLO3ofRM(modrm)) : dis_buf ),
       nameMMXReg(gregLO3ofRM(modrm)) );

   return delta;
}
   7319 
   7320 
   7321 /* Vector by scalar shift of G by the amount specified at the bottom
   7322    of E.  This is a straight copy of dis_SSE_shiftG_byE. */
   7323 
   7324 static ULong dis_MMX_shiftG_byE ( const VexAbiInfo* vbi,
   7325                                   Prefix pfx, Long delta,
   7326                                   const HChar* opname, IROp op )
   7327 {
   7328    HChar   dis_buf[50];
   7329    Int     alen, size;
   7330    IRTemp  addr;
   7331    Bool    shl, shr, sar;
   7332    UChar   rm   = getUChar(delta);
   7333    IRTemp  g0   = newTemp(Ity_I64);
   7334    IRTemp  g1   = newTemp(Ity_I64);
   7335    IRTemp  amt  = newTemp(Ity_I64);
   7336    IRTemp  amt8 = newTemp(Ity_I8);
   7337 
   7338    if (epartIsReg(rm)) {
   7339       assign( amt, getMMXReg(eregLO3ofRM(rm)) );
   7340       DIP("%s %s,%s\n", opname,
   7341                         nameMMXReg(eregLO3ofRM(rm)),
   7342                         nameMMXReg(gregLO3ofRM(rm)) );
   7343       delta++;
   7344    } else {
   7345       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   7346       assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
   7347       DIP("%s %s,%s\n", opname,
   7348                         dis_buf,
   7349                         nameMMXReg(gregLO3ofRM(rm)) );
   7350       delta += alen;
   7351    }
   7352    assign( g0,   getMMXReg(gregLO3ofRM(rm)) );
   7353    assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
   7354 
   7355    shl = shr = sar = False;
   7356    size = 0;
   7357    switch (op) {
   7358       case Iop_ShlN16x4: shl = True; size = 32; break;
   7359       case Iop_ShlN32x2: shl = True; size = 32; break;
   7360       case Iop_Shl64:    shl = True; size = 64; break;
   7361       case Iop_ShrN16x4: shr = True; size = 16; break;
   7362       case Iop_ShrN32x2: shr = True; size = 32; break;
   7363       case Iop_Shr64:    shr = True; size = 64; break;
   7364       case Iop_SarN16x4: sar = True; size = 16; break;
   7365       case Iop_SarN32x2: sar = True; size = 32; break;
   7366       default: vassert(0);
   7367    }
   7368 
   7369    if (shl || shr) {
   7370      assign(
   7371         g1,
   7372         IRExpr_ITE(
   7373            binop(Iop_CmpLT64U,mkexpr(amt),mkU64(size)),
   7374            binop(op, mkexpr(g0), mkexpr(amt8)),
   7375            mkU64(0)
   7376         )
   7377      );
   7378    } else
   7379    if (sar) {
   7380      assign(
   7381         g1,
   7382         IRExpr_ITE(
   7383            binop(Iop_CmpLT64U,mkexpr(amt),mkU64(size)),
   7384            binop(op, mkexpr(g0), mkexpr(amt8)),
   7385            binop(op, mkexpr(g0), mkU8(size-1))
   7386         )
   7387      );
   7388    } else {
   7389       vassert(0);
   7390    }
   7391 
   7392    putMMXReg( gregLO3ofRM(rm), mkexpr(g1) );
   7393    return delta;
   7394 }
   7395 
   7396 
   7397 /* Vector by scalar shift of E by an immediate byte.  This is a
   7398    straight copy of dis_SSE_shiftE_imm. */
   7399 
   7400 static
   7401 ULong dis_MMX_shiftE_imm ( Long delta, const HChar* opname, IROp op )
   7402 {
   7403    Bool    shl, shr, sar;
   7404    UChar   rm   = getUChar(delta);
   7405    IRTemp  e0   = newTemp(Ity_I64);
   7406    IRTemp  e1   = newTemp(Ity_I64);
   7407    UChar   amt, size;
   7408    vassert(epartIsReg(rm));
   7409    vassert(gregLO3ofRM(rm) == 2
   7410            || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
   7411    amt = getUChar(delta+1);
   7412    delta += 2;
   7413    DIP("%s $%d,%s\n", opname,
   7414                       (Int)amt,
   7415                       nameMMXReg(eregLO3ofRM(rm)) );
   7416 
   7417    assign( e0, getMMXReg(eregLO3ofRM(rm)) );
   7418 
   7419    shl = shr = sar = False;
   7420    size = 0;
   7421    switch (op) {
   7422       case Iop_ShlN16x4: shl = True; size = 16; break;
   7423       case Iop_ShlN32x2: shl = True; size = 32; break;
   7424       case Iop_Shl64:    shl = True; size = 64; break;
   7425       case Iop_SarN16x4: sar = True; size = 16; break;
   7426       case Iop_SarN32x2: sar = True; size = 32; break;
   7427       case Iop_ShrN16x4: shr = True; size = 16; break;
   7428       case Iop_ShrN32x2: shr = True; size = 32; break;
   7429       case Iop_Shr64:    shr = True; size = 64; break;
   7430       default: vassert(0);
   7431    }
   7432 
   7433    if (shl || shr) {
   7434      assign( e1, amt >= size
   7435                     ? mkU64(0)
   7436                     : binop(op, mkexpr(e0), mkU8(amt))
   7437      );
   7438    } else
   7439    if (sar) {
   7440      assign( e1, amt >= size
   7441                     ? binop(op, mkexpr(e0), mkU8(size-1))
   7442                     : binop(op, mkexpr(e0), mkU8(amt))
   7443      );
   7444    } else {
   7445       vassert(0);
   7446    }
   7447 
   7448    putMMXReg( eregLO3ofRM(rm), mkexpr(e1) );
   7449    return delta;
   7450 }
   7451 
   7452 
   7453 /* Completely handle all MMX instructions except emms. */
   7454 
   7455 static
   7456 ULong dis_MMX ( Bool* decode_ok,
   7457                 const VexAbiInfo* vbi, Prefix pfx, Int sz, Long delta )
   7458 {
   7459    Int   len;
   7460    UChar modrm;
   7461    HChar dis_buf[50];
   7462    UChar opc = getUChar(delta);
   7463    delta++;
   7464 
   7465    /* dis_MMX handles all insns except emms. */
   7466    do_MMX_preamble();
   7467 
   7468    switch (opc) {
   7469 
   7470       case 0x6E:
   7471          if (sz == 4) {
   7472             /* MOVD (src)ireg32-or-mem32 (E), (dst)mmxreg (G)*/
   7473             modrm = getUChar(delta);
   7474             if (epartIsReg(modrm)) {
   7475                delta++;
   7476                putMMXReg(
   7477                   gregLO3ofRM(modrm),
   7478                   binop( Iop_32HLto64,
   7479                          mkU32(0),
   7480                          getIReg32(eregOfRexRM(pfx,modrm)) ) );
   7481                DIP("movd %s, %s\n",
   7482                    nameIReg32(eregOfRexRM(pfx,modrm)),
   7483                    nameMMXReg(gregLO3ofRM(modrm)));
   7484             } else {
   7485                IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   7486                delta += len;
   7487                putMMXReg(
   7488                   gregLO3ofRM(modrm),
   7489                   binop( Iop_32HLto64,
   7490                          mkU32(0),
   7491                          loadLE(Ity_I32, mkexpr(addr)) ) );
   7492                DIP("movd %s, %s\n", dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
   7493             }
   7494          }
   7495          else
   7496          if (sz == 8) {
   7497             /* MOVD (src)ireg64-or-mem64 (E), (dst)mmxreg (G)*/
   7498             modrm = getUChar(delta);
   7499             if (epartIsReg(modrm)) {
   7500                delta++;
   7501                putMMXReg( gregLO3ofRM(modrm),
   7502                           getIReg64(eregOfRexRM(pfx,modrm)) );
   7503                DIP("movd %s, %s\n",
   7504                    nameIReg64(eregOfRexRM(pfx,modrm)),
   7505                    nameMMXReg(gregLO3ofRM(modrm)));
   7506             } else {
   7507                IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   7508                delta += len;
   7509                putMMXReg( gregLO3ofRM(modrm),
   7510                           loadLE(Ity_I64, mkexpr(addr)) );
   7511                DIP("movd{64} %s, %s\n", dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
   7512             }
   7513          }
   7514          else {
   7515             goto mmx_decode_failure;
   7516          }
   7517          break;
   7518 
   7519       case 0x7E:
   7520          if (sz == 4) {
   7521             /* MOVD (src)mmxreg (G), (dst)ireg32-or-mem32 (E) */
   7522             modrm = getUChar(delta);
   7523             if (epartIsReg(modrm)) {
   7524                delta++;
   7525                putIReg32( eregOfRexRM(pfx,modrm),
   7526                           unop(Iop_64to32, getMMXReg(gregLO3ofRM(modrm)) ) );
   7527                DIP("movd %s, %s\n",
   7528                    nameMMXReg(gregLO3ofRM(modrm)),
   7529                    nameIReg32(eregOfRexRM(pfx,modrm)));
   7530             } else {
   7531                IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   7532                delta += len;
   7533                storeLE( mkexpr(addr),
   7534                         unop(Iop_64to32, getMMXReg(gregLO3ofRM(modrm)) ) );
   7535                DIP("movd %s, %s\n", nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
   7536             }
   7537          }
   7538          else
   7539          if (sz == 8) {
   7540             /* MOVD (src)mmxreg (G), (dst)ireg64-or-mem64 (E) */
   7541             modrm = getUChar(delta);
   7542             if (epartIsReg(modrm)) {
   7543                delta++;
   7544                putIReg64( eregOfRexRM(pfx,modrm),
   7545                           getMMXReg(gregLO3ofRM(modrm)) );
   7546                DIP("movd %s, %s\n",
   7547                    nameMMXReg(gregLO3ofRM(modrm)),
   7548                    nameIReg64(eregOfRexRM(pfx,modrm)));
   7549             } else {
   7550                IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   7551                delta += len;
   7552                storeLE( mkexpr(addr),
   7553                        getMMXReg(gregLO3ofRM(modrm)) );
   7554                DIP("movd{64} %s, %s\n", nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
   7555             }
   7556          } else {
   7557             goto mmx_decode_failure;
   7558          }
   7559          break;
   7560 
   7561       case 0x6F:
   7562          /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
   7563          if (sz != 4
   7564              && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
   7565             goto mmx_decode_failure;
   7566          modrm = getUChar(delta);
   7567          if (epartIsReg(modrm)) {
   7568             delta++;
   7569             putMMXReg( gregLO3ofRM(modrm), getMMXReg(eregLO3ofRM(modrm)) );
   7570             DIP("movq %s, %s\n",
   7571                 nameMMXReg(eregLO3ofRM(modrm)),
   7572                 nameMMXReg(gregLO3ofRM(modrm)));
   7573          } else {
   7574             IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   7575             delta += len;
   7576             putMMXReg( gregLO3ofRM(modrm), loadLE(Ity_I64, mkexpr(addr)) );
   7577             DIP("movq %s, %s\n",
   7578                 dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
   7579          }
   7580          break;
   7581 
   7582       case 0x7F:
   7583          /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
   7584          if (sz != 4
   7585              && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
   7586             goto mmx_decode_failure;
   7587          modrm = getUChar(delta);
   7588          if (epartIsReg(modrm)) {
   7589             delta++;
   7590             putMMXReg( eregLO3ofRM(modrm), getMMXReg(gregLO3ofRM(modrm)) );
   7591             DIP("movq %s, %s\n",
   7592                 nameMMXReg(gregLO3ofRM(modrm)),
   7593                 nameMMXReg(eregLO3ofRM(modrm)));
   7594          } else {
   7595             IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   7596             delta += len;
   7597             storeLE( mkexpr(addr), getMMXReg(gregLO3ofRM(modrm)) );
   7598             DIP("mov(nt)q %s, %s\n",
   7599                 nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
   7600          }
   7601          break;
   7602 
   7603       case 0xFC:
   7604       case 0xFD:
   7605       case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
   7606          if (sz != 4)
   7607             goto mmx_decode_failure;
   7608          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "padd", True );
   7609          break;
   7610 
   7611       case 0xEC:
   7612       case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
   7613          if (sz != 4
   7614              && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
   7615             goto mmx_decode_failure;
   7616          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "padds", True );
   7617          break;
   7618 
   7619       case 0xDC:
   7620       case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   7621          if (sz != 4)
   7622             goto mmx_decode_failure;
   7623          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "paddus", True );
   7624          break;
   7625 
   7626       case 0xF8:
   7627       case 0xF9:
   7628       case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
   7629          if (sz != 4)
   7630             goto mmx_decode_failure;
   7631          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psub", True );
   7632          break;
   7633 
   7634       case 0xE8:
   7635       case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
   7636          if (sz != 4)
   7637             goto mmx_decode_failure;
   7638          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psubs", True );
   7639          break;
   7640 
   7641       case 0xD8:
   7642       case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   7643          if (sz != 4)
   7644             goto mmx_decode_failure;
   7645          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psubus", True );
   7646          break;
   7647 
   7648       case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
   7649          if (sz != 4)
   7650             goto mmx_decode_failure;
   7651          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmulhw", False );
   7652          break;
   7653 
   7654       case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
   7655          if (sz != 4)
   7656             goto mmx_decode_failure;
   7657          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmullw", False );
   7658          break;
   7659 
   7660       case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
   7661          vassert(sz == 4);
   7662          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmaddwd", False );
   7663          break;
   7664 
   7665       case 0x74:
   7666       case 0x75:
   7667       case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
   7668          if (sz != 4)
   7669             goto mmx_decode_failure;
   7670          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pcmpeq", True );
   7671          break;
   7672 
   7673       case 0x64:
   7674       case 0x65:
   7675       case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
   7676          if (sz != 4)
   7677             goto mmx_decode_failure;
   7678          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pcmpgt", True );
   7679          break;
   7680 
   7681       case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
   7682          if (sz != 4)
   7683             goto mmx_decode_failure;
   7684          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packssdw", False );
   7685          break;
   7686 
   7687       case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
   7688          if (sz != 4)
   7689             goto mmx_decode_failure;
   7690          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packsswb", False );
   7691          break;
   7692 
   7693       case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
   7694          if (sz != 4)
   7695             goto mmx_decode_failure;
   7696          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packuswb", False );
   7697          break;
   7698 
   7699       case 0x68:
   7700       case 0x69:
   7701       case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
   7702          if (sz != 4
   7703              && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
   7704             goto mmx_decode_failure;
   7705          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "punpckh", True );
   7706          break;
   7707 
   7708       case 0x60:
   7709       case 0x61:
   7710       case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
   7711          if (sz != 4
   7712              && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
   7713             goto mmx_decode_failure;
   7714          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "punpckl", True );
   7715          break;
   7716 
   7717       case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
   7718          if (sz != 4)
   7719             goto mmx_decode_failure;
   7720          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pand", False );
   7721          break;
   7722 
   7723       case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
   7724          if (sz != 4)
   7725             goto mmx_decode_failure;
   7726          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pandn", False );
   7727          break;
   7728 
   7729       case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
   7730          if (sz != 4)
   7731             goto mmx_decode_failure;
   7732          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "por", False );
   7733          break;
   7734 
   7735       case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
   7736          if (sz != 4)
   7737             goto mmx_decode_failure;
   7738          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pxor", False );
   7739          break;
   7740 
   7741 #     define SHIFT_BY_REG(_name,_op)                                     \
   7742                 delta = dis_MMX_shiftG_byE(vbi, pfx, delta, _name, _op); \
   7743                 break;
   7744 
   7745       /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
   7746       case 0xF1: SHIFT_BY_REG("psllw", Iop_ShlN16x4);
   7747       case 0xF2: SHIFT_BY_REG("pslld", Iop_ShlN32x2);
   7748       case 0xF3: SHIFT_BY_REG("psllq", Iop_Shl64);
   7749 
   7750       /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
   7751       case 0xD1: SHIFT_BY_REG("psrlw", Iop_ShrN16x4);
   7752       case 0xD2: SHIFT_BY_REG("psrld", Iop_ShrN32x2);
   7753       case 0xD3: SHIFT_BY_REG("psrlq", Iop_Shr64);
   7754 
   7755       /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
   7756       case 0xE1: SHIFT_BY_REG("psraw", Iop_SarN16x4);
   7757       case 0xE2: SHIFT_BY_REG("psrad", Iop_SarN32x2);
   7758 
   7759 #     undef SHIFT_BY_REG
   7760 
   7761       case 0x71:
   7762       case 0x72:
   7763       case 0x73: {
   7764          /* (sz==4): PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
   7765          UChar byte2, subopc;
   7766          if (sz != 4)
   7767             goto mmx_decode_failure;
   7768          byte2  = getUChar(delta);      /* amode / sub-opcode */
   7769          subopc = toUChar( (byte2 >> 3) & 7 );
   7770 
   7771 #        define SHIFT_BY_IMM(_name,_op)                        \
   7772             do { delta = dis_MMX_shiftE_imm(delta,_name,_op);  \
   7773             } while (0)
   7774 
   7775               if (subopc == 2 /*SRL*/ && opc == 0x71)
   7776                   SHIFT_BY_IMM("psrlw", Iop_ShrN16x4);
   7777          else if (subopc == 2 /*SRL*/ && opc == 0x72)
   7778                  SHIFT_BY_IMM("psrld", Iop_ShrN32x2);
   7779          else if (subopc == 2 /*SRL*/ && opc == 0x73)
   7780                  SHIFT_BY_IMM("psrlq", Iop_Shr64);
   7781 
   7782          else if (subopc == 4 /*SAR*/ && opc == 0x71)
   7783                  SHIFT_BY_IMM("psraw", Iop_SarN16x4);
   7784          else if (subopc == 4 /*SAR*/ && opc == 0x72)
   7785                  SHIFT_BY_IMM("psrad", Iop_SarN32x2);
   7786 
   7787          else if (subopc == 6 /*SHL*/ && opc == 0x71)
   7788                  SHIFT_BY_IMM("psllw", Iop_ShlN16x4);
   7789          else if (subopc == 6 /*SHL*/ && opc == 0x72)
   7790                   SHIFT_BY_IMM("pslld", Iop_ShlN32x2);
   7791          else if (subopc == 6 /*SHL*/ && opc == 0x73)
   7792                  SHIFT_BY_IMM("psllq", Iop_Shl64);
   7793 
   7794          else goto mmx_decode_failure;
   7795 
   7796 #        undef SHIFT_BY_IMM
   7797          break;
   7798       }
   7799 
   7800       case 0xF7: {
   7801          IRTemp addr    = newTemp(Ity_I64);
   7802          IRTemp regD    = newTemp(Ity_I64);
   7803          IRTemp regM    = newTemp(Ity_I64);
   7804          IRTemp mask    = newTemp(Ity_I64);
   7805          IRTemp olddata = newTemp(Ity_I64);
   7806          IRTemp newdata = newTemp(Ity_I64);
   7807 
   7808          modrm = getUChar(delta);
   7809          if (sz != 4 || (!epartIsReg(modrm)))
   7810             goto mmx_decode_failure;
   7811          delta++;
   7812 
   7813          assign( addr, handleAddrOverrides( vbi, pfx, getIReg64(R_RDI) ));
   7814          assign( regM, getMMXReg( eregLO3ofRM(modrm) ));
   7815          assign( regD, getMMXReg( gregLO3ofRM(modrm) ));
   7816          assign( mask, binop(Iop_SarN8x8, mkexpr(regM), mkU8(7)) );
   7817          assign( olddata, loadLE( Ity_I64, mkexpr(addr) ));
   7818          assign( newdata,
   7819                  binop(Iop_Or64,
   7820                        binop(Iop_And64,
   7821                              mkexpr(regD),
   7822                              mkexpr(mask) ),
   7823                        binop(Iop_And64,
   7824                              mkexpr(olddata),
   7825                              unop(Iop_Not64, mkexpr(mask)))) );
   7826          storeLE( mkexpr(addr), mkexpr(newdata) );
   7827          DIP("maskmovq %s,%s\n", nameMMXReg( eregLO3ofRM(modrm) ),
   7828                                  nameMMXReg( gregLO3ofRM(modrm) ) );
   7829          break;
   7830       }
   7831 
   7832       /* --- MMX decode failure --- */
   7833       default:
   7834       mmx_decode_failure:
   7835          *decode_ok = False;
   7836          return delta; /* ignored */
   7837 
   7838    }
   7839 
   7840    *decode_ok = True;
   7841    return delta;
   7842 }
   7843 
   7844 
   7845 /*------------------------------------------------------------*/
   7846 /*--- More misc arithmetic and other obscure insns.        ---*/
   7847 /*------------------------------------------------------------*/
   7848 
   7849 /* Generate base << amt with vacated places filled with stuff
   7850    from xtra.  amt guaranteed in 0 .. 63. */
   7851 static
   7852 IRExpr* shiftL64_with_extras ( IRTemp base, IRTemp xtra, IRTemp amt )
   7853 {
   7854    /* if   amt == 0
   7855       then base
   7856       else (base << amt) | (xtra >>u (64-amt))
   7857    */
   7858    return
   7859       IRExpr_ITE(
   7860          binop(Iop_CmpNE8, mkexpr(amt), mkU8(0)),
   7861          binop(Iop_Or64,
   7862                binop(Iop_Shl64, mkexpr(base), mkexpr(amt)),
   7863                binop(Iop_Shr64, mkexpr(xtra),
   7864                                 binop(Iop_Sub8, mkU8(64), mkexpr(amt)))
   7865                ),
   7866          mkexpr(base)
   7867       );
   7868 }
   7869 
   7870 /* Generate base >>u amt with vacated places filled with stuff
   7871    from xtra.  amt guaranteed in 0 .. 63. */
   7872 static
   7873 IRExpr* shiftR64_with_extras ( IRTemp xtra, IRTemp base, IRTemp amt )
   7874 {
   7875    /* if   amt == 0
   7876       then base
   7877       else (base >>u amt) | (xtra << (64-amt))
   7878    */
   7879    return
   7880       IRExpr_ITE(
   7881          binop(Iop_CmpNE8, mkexpr(amt), mkU8(0)),
   7882          binop(Iop_Or64,
   7883                binop(Iop_Shr64, mkexpr(base), mkexpr(amt)),
   7884                binop(Iop_Shl64, mkexpr(xtra),
   7885                                 binop(Iop_Sub8, mkU8(64), mkexpr(amt)))
   7886                ),
   7887          mkexpr(base)
   7888       );
   7889 }
   7890 
/* Double length left and right shifts (SHLD/SHRD).  Apparently only
   required in v-size (no b- variant).

   vbi            -- ABI info, needed for address-mode decoding
   pfx            -- summary of the instruction's prefixes
   delta          -- code offset; on entry points at the modrm byte
   modrm          -- the modrm byte (already fetched by the caller)
   sz             -- operand size in bytes: 2, 4 or 8
   shift_amt      -- :: Ity_I8, the amount to shift by
   amt_is_literal -- True if the shift amount is an imm8 following the
                     amode, in which case delta is advanced past it at
                     the end
   shift_amt_txt  -- printable form of the shift amount, for DIP
   left_shift     -- True for SHLD, False for SHRD

   Returns the updated delta. */
static
ULong dis_SHLRD_Gv_Ev ( const VexAbiInfo* vbi,
                        Prefix pfx,
                        Long delta, UChar modrm,
                        Int sz,
                        IRExpr* shift_amt,
                        Bool amt_is_literal,
                        const HChar* shift_amt_txt,
                        Bool left_shift )
{
   /* shift_amt :: Ity_I8 is the amount to shift.  shift_amt_txt is used
      for printing it.   And eip on entry points at the modrm byte. */
   Int len;
   HChar dis_buf[50];

   IRType ty     = szToITy(sz);
   IRTemp gsrc   = newTemp(ty);         /* G operand: fill bits, unchanged */
   IRTemp esrc   = newTemp(ty);         /* E operand: the destination */
   IRTemp addr   = IRTemp_INVALID;      /* valid only for the memory form */
   IRTemp tmpSH  = newTemp(Ity_I8);     /* masked shift amount */
   IRTemp tmpSS  = newTemp(Ity_I8);     /* masked (shift amount - 1) */
   IRTemp tmp64  = IRTemp_INVALID;
   IRTemp res64  = IRTemp_INVALID;
   IRTemp rss64  = IRTemp_INVALID;
   IRTemp resTy  = IRTemp_INVALID;
   IRTemp rssTy  = IRTemp_INVALID;
   /* amd64 masks the shift count to 6 bits for 64-bit operands and
      5 bits otherwise. */
   Int    mask   = sz==8 ? 63 : 31;

   vassert(sz == 2 || sz == 4 || sz == 8);

   /* The E-part is the destination; this is shifted.  The G-part
      supplies bits to be shifted into the E-part, but is not
      changed.

      If shifting left, form a double-length word with E at the top
      and G at the bottom, and shift this left.  The result is then in
      the high part.

      If shifting right, form a double-length word with G at the top
      and E at the bottom, and shift this right.  The result is then
      at the bottom.  */

   /* Fetch the operands. */

   assign( gsrc, getIRegG(sz, pfx, modrm) );

   if (epartIsReg(modrm)) {
      delta++;
      assign( esrc, getIRegE(sz, pfx, modrm) );
      DIP("sh%cd%c %s, %s, %s\n",
          ( left_shift ? 'l' : 'r' ), nameISize(sz),
          shift_amt_txt,
          nameIRegG(sz, pfx, modrm), nameIRegE(sz, pfx, modrm));
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf,
                        /* # bytes following amode */
                        amt_is_literal ? 1 : 0 );
      delta += len;
      assign( esrc, loadLE(ty, mkexpr(addr)) );
      DIP("sh%cd%c %s, %s, %s\n",
          ( left_shift ? 'l' : 'r' ), nameISize(sz),
          shift_amt_txt,
          nameIRegG(sz, pfx, modrm), dis_buf);
   }

   /* Calculate the masked shift amount (tmpSH), the masked subshift
      amount (tmpSS), the shifted value (res64) and the subshifted
      value (rss64).  The "subshifted" value -- the result of shifting
      by one place less -- is needed by the flags thunk so that the
      carry-out bit can be recovered later. */

   assign( tmpSH, binop(Iop_And8, shift_amt, mkU8(mask)) );
   assign( tmpSS, binop(Iop_And8,
                        binop(Iop_Sub8, mkexpr(tmpSH), mkU8(1) ),
                        mkU8(mask)));

   tmp64 = newTemp(Ity_I64);
   res64 = newTemp(Ity_I64);
   rss64 = newTemp(Ity_I64);

   if (sz == 2 || sz == 4) {

      /* G is xtra; E is data */
      /* what a freaking nightmare: */
      if (sz == 4 && left_shift) {
         /* Shift [esrc'gsrc] left; result ends up in the top half,
            so shift back down by 32 afterwards. */
         assign( tmp64, binop(Iop_32HLto64, mkexpr(esrc), mkexpr(gsrc)) );
         assign( res64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)),
                       mkU8(32)) );
         assign( rss64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSS)),
                       mkU8(32)) );
      }
      else
      if (sz == 4 && !left_shift) {
         /* Shift [gsrc'esrc] right; result is already at the bottom. */
         assign( tmp64, binop(Iop_32HLto64, mkexpr(gsrc), mkexpr(esrc)) );
         assign( res64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSH)) );
         assign( rss64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSS)) );
      }
      else
      if (sz == 2 && left_shift) {
         /* NOTE(review): for sz==2 the count is masked to 31, so it can
            exceed 16; hardware behaviour is undefined in that case, and
            the replicated-gsrc packing below is one plausible choice. */
         assign( tmp64,
                 binop(Iop_32HLto64,
                       binop(Iop_16HLto32, mkexpr(esrc), mkexpr(gsrc)),
                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc))
         ));
         /* result formed by shifting [esrc'gsrc'gsrc'gsrc] */
         assign( res64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)),
                       mkU8(48)) );
         /* subshift formed by shifting [esrc'0000'0000'0000] */
         assign( rss64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64,
                             binop(Iop_Shl64, unop(Iop_16Uto64, mkexpr(esrc)),
                                              mkU8(48)),
                             mkexpr(tmpSS)),
                       mkU8(48)) );
      }
      else
      if (sz == 2 && !left_shift) {
         assign( tmp64,
                 binop(Iop_32HLto64,
                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc)),
                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(esrc))
         ));
         /* result formed by shifting [gsrc'gsrc'gsrc'esrc] */
         assign( res64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSH)) );
         /* subshift formed by shifting [0000'0000'0000'esrc] */
         assign( rss64, binop(Iop_Shr64,
                              unop(Iop_16Uto64, mkexpr(esrc)),
                              mkexpr(tmpSS)) );
      }

   } else {

      /* 64-bit case: no double-length word fits in an I64, so use the
         helpers which OR in the fill bits explicitly. */
      vassert(sz == 8);
      if (left_shift) {
         assign( res64, shiftL64_with_extras( esrc, gsrc, tmpSH ));
         assign( rss64, shiftL64_with_extras( esrc, gsrc, tmpSS ));
      } else {
         assign( res64, shiftR64_with_extras( gsrc, esrc, tmpSH ));
         assign( rss64, shiftR64_with_extras( gsrc, esrc, tmpSS ));
      }

   }

   resTy = newTemp(ty);
   rssTy = newTemp(ty);
   assign( resTy, narrowTo(ty, mkexpr(res64)) );
   assign( rssTy, narrowTo(ty, mkexpr(rss64)) );

   /* Put result back and write the flags thunk. */
   setFlags_DEP1_DEP2_shift ( left_shift ? Iop_Shl64 : Iop_Sar64,
                              resTy, rssTy, ty, tmpSH );

   if (epartIsReg(modrm)) {
      putIRegE(sz, pfx, modrm, mkexpr(resTy));
   } else {
      storeLE( mkexpr(addr), mkexpr(resTy) );
   }

   /* Step over the trailing imm8 shift amount, if there was one. */
   if (amt_is_literal) delta++;
   return delta;
}
   8059 
   8060 
   8061 /* Handle BT/BTS/BTR/BTC Gv, Ev.  Apparently b-size is not
   8062    required. */
   8063 
   8064 typedef enum { BtOpNone, BtOpSet, BtOpReset, BtOpComp } BtOp;
   8065 
   8066 static const HChar* nameBtOp ( BtOp op )
   8067 {
   8068    switch (op) {
   8069       case BtOpNone:  return "";
   8070       case BtOpSet:   return "s";
   8071       case BtOpReset: return "r";
   8072       case BtOpComp:  return "c";
   8073       default: vpanic("nameBtOp(amd64)");
   8074    }
   8075 }
   8076 
   8077 
/* Disassemble BT/BTS/BTR/BTC Gv, Ev.  The G operand supplies the bit
   number; the E operand (register or memory) holds the value whose bit
   is tested/modified.  On success sets *decode_OK to True and returns
   the updated delta; on a decode failure sets it to False (the
   returned delta is then meaningless).  The selected bit is copied
   into the carry flag; the other flags are left unchanged (see the
   comment near the flag computation below). */
static
ULong dis_bt_G_E ( const VexAbiInfo* vbi,
                   Prefix pfx, Int sz, Long delta, BtOp op,
                   /*OUT*/Bool* decode_OK )
{
   HChar  dis_buf[50];
   UChar  modrm;
   Int    len;
   IRTemp t_fetched, t_bitno0, t_bitno1, t_bitno2, t_addr0,
          t_addr1, t_rsp, t_mask, t_new;

   vassert(sz == 2 || sz == 4 || sz == 8);

   t_fetched = t_bitno0 = t_bitno1 = t_bitno2
             = t_addr0 = t_addr1 = t_rsp
             = t_mask = t_new = IRTemp_INVALID;

   t_fetched = newTemp(Ity_I8);   /* the byte containing the bit */
   t_new     = newTemp(Ity_I8);   /* that byte after modification */
   t_bitno0  = newTemp(Ity_I64);  /* raw bit number from G */
   t_bitno1  = newTemp(Ity_I64);  /* bit number, masked for the reg case */
   t_bitno2  = newTemp(Ity_I8);   /* bit offset within the byte (0..7) */
   t_addr1   = newTemp(Ity_I64);  /* address of the byte to operate on */
   modrm     = getUChar(delta);

   *decode_OK = True;
   if (epartIsReg(modrm)) {
      /* F2 and F3 are never acceptable. */
      if (haveF2orF3(pfx)) {
         *decode_OK = False;
         return delta;
      }
   } else {
      /* F2 or F3 (but not both) are allowed, provided LOCK is also
         present, and only for the BTC/BTS/BTR cases (not BT). */
      if (haveF2orF3(pfx)) {
         if (haveF2andF3(pfx) || !haveLOCK(pfx) || op == BtOpNone) {
            *decode_OK = False;
            return delta;
         }
      }
   }

   /* Bit number comes from G, widened to 64 bits.  NOTE(review):
      widenSto64 looks like a signed widening -- presumably because for
      the memory form the bit index is a signed displacement from the
      base address; confirm against its definition. */
   assign( t_bitno0, widenSto64(getIRegG(sz, pfx, modrm)) );

   if (epartIsReg(modrm)) {
      delta++;
      /* Get it onto the client's stack.  Oh, this is a horrible
         kludge.  See https://bugs.kde.org/show_bug.cgi?id=245925.
         Because of the ELF ABI stack redzone, there may be live data
         up to 128 bytes below %RSP.  So we can't just push it on the
         stack, else we may wind up trashing live data, and causing
         impossible-to-find simulation errors.  (Yes, this did
         happen.)  So we need to drop RSP by at least 128 before
         pushing it.  That unfortunately means hitting Memcheck's
         fast-case painting code.  Ideally we should drop more than
         128, to reduce the chances of breaking buggy programs that
         have live data below -128(%RSP).  Memcheck fast-cases moves
         of 288 bytes due to the need to handle ppc64-linux quickly,
         so let's use 288.  Of course the real fix is to get rid of
         this kludge entirely.  */
      t_rsp = newTemp(Ity_I64);
      t_addr0 = newTemp(Ity_I64);

      vassert(vbi->guest_stack_redzone_size == 128);
      assign( t_rsp, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(288)) );
      putIReg64(R_RSP, mkexpr(t_rsp));

      storeLE( mkexpr(t_rsp), getIRegE(sz, pfx, modrm) );

      /* Make t_addr0 point at it. */
      assign( t_addr0, mkexpr(t_rsp) );

      /* Mask out upper bits of the shift amount, since we're doing a
         reg. */
      assign( t_bitno1, binop(Iop_And64,
                              mkexpr(t_bitno0),
                              mkU64(sz == 8 ? 63 : sz == 4 ? 31 : 15)) );

   } else {
      t_addr0 = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
      delta += len;
      /* Memory form: the bit number is NOT masked; it selects a byte
         possibly far away from the base address. */
      assign( t_bitno1, mkexpr(t_bitno0) );
   }

   /* At this point: t_addr0 is the address being operated on.  If it
      was a reg, we will have pushed it onto the client's stack.
      t_bitno1 is the bit number, suitably masked in the case of a
      reg.  */

   /* Now the main sequence. */
   /* Arithmetic (signed) shift right by 3 converts the bit number to a
      byte offset, correctly handling negative bit numbers. */
   assign( t_addr1,
           binop(Iop_Add64,
                 mkexpr(t_addr0),
                 binop(Iop_Sar64, mkexpr(t_bitno1), mkU8(3))) );

   /* t_addr1 now holds effective address */

   assign( t_bitno2,
           unop(Iop_64to8,
                binop(Iop_And64, mkexpr(t_bitno1), mkU64(7))) );

   /* t_bitno2 contains offset of bit within byte */

   if (op != BtOpNone) {
      t_mask = newTemp(Ity_I8);
      assign( t_mask, binop(Iop_Shl8, mkU8(1), mkexpr(t_bitno2)) );
   }

   /* t_mask is now a suitable byte mask */

   assign( t_fetched, loadLE(Ity_I8, mkexpr(t_addr1)) );

   if (op != BtOpNone) {
      switch (op) {
         case BtOpSet:
            assign( t_new,
                    binop(Iop_Or8, mkexpr(t_fetched), mkexpr(t_mask)) );
            break;
         case BtOpComp:
            assign( t_new,
                    binop(Iop_Xor8, mkexpr(t_fetched), mkexpr(t_mask)) );
            break;
         case BtOpReset:
            assign( t_new,
                    binop(Iop_And8, mkexpr(t_fetched),
                                    unop(Iop_Not8, mkexpr(t_mask))) );
            break;
         default:
            vpanic("dis_bt_G_E(amd64)");
      }
      /* Locked memory form: use compare-and-swap so the
         fetch/modify/write is atomic. */
      if ((haveLOCK(pfx)) && !epartIsReg(modrm)) {
         casLE( mkexpr(t_addr1), mkexpr(t_fetched)/*expd*/,
                                 mkexpr(t_new)/*new*/,
                                 guest_RIP_curr_instr );
      } else {
         storeLE( mkexpr(t_addr1), mkexpr(t_new) );
      }
   }

   /* Side effect done; now get selected bit into Carry flag.  The Intel docs
      (as of 2015, at least) say that C holds the result, Z is unchanged, and
      O,S,A and P are undefined.  However, on Skylake it appears that O,S,A,P
      are also unchanged, so let's do that. */
   const ULong maskC     = AMD64G_CC_MASK_C;
   const ULong maskOSZAP = AMD64G_CC_MASK_O | AMD64G_CC_MASK_S
                           | AMD64G_CC_MASK_Z | AMD64G_CC_MASK_A
                           | AMD64G_CC_MASK_P;

   IRTemp old_rflags = newTemp(Ity_I64);
   assign(old_rflags, mk_amd64g_calculate_rflags_all());

   /* new rflags = (old & OSZAP) | (selected bit, moved to the C position) */
   IRTemp new_rflags = newTemp(Ity_I64);
   assign(new_rflags,
          binop(Iop_Or64,
                binop(Iop_And64, mkexpr(old_rflags), mkU64(maskOSZAP)),
                binop(Iop_And64,
                      binop(Iop_Shr64,
                            unop(Iop_8Uto64, mkexpr(t_fetched)),
                            mkexpr(t_bitno2)),
                      mkU64(maskC))));

   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(new_rflags) ));
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

   /* Move reg operand from stack back to reg */
   if (epartIsReg(modrm)) {
      /* t_rsp still points at it. */
      /* only write the reg if actually modifying it; doing otherwise
         zeroes the top half erroneously when doing btl due to
         standard zero-extend rule */
      if (op != BtOpNone)
         putIRegE(sz, pfx, modrm, loadLE(szToITy(sz), mkexpr(t_rsp)) );
      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t_rsp), mkU64(288)) );
   }

   DIP("bt%s%c %s, %s\n",
       nameBtOp(op), nameISize(sz), nameIRegG(sz, pfx, modrm),
       ( epartIsReg(modrm) ? nameIRegE(sz, pfx, modrm) : dis_buf ) );

   return delta;
}
   8264 
   8265 
   8266 
/* Handle BSF/BSR.  Only v-size seems necessary.
   fwds==True gives BSF (scan forward, count trailing zeroes);
   fwds==False gives BSR (scan reverse, 63 - leading zeroes).
   The bit index is written to the G register; if the source is zero
   the destination is left unchanged and Z is set.  Returns the
   updated delta. */
static
ULong dis_bs_E_G ( const VexAbiInfo* vbi,
                   Prefix pfx, Int sz, Long delta, Bool fwds )
{
   Bool   isReg;
   UChar  modrm;
   HChar  dis_buf[50];

   IRType ty    = szToITy(sz);
   IRTemp src   = newTemp(ty);       /* the E operand, at native width */
   IRTemp dst   = newTemp(ty);       /* final result, at native width */
   IRTemp src64 = newTemp(Ity_I64);  /* source zero-widened to 64 bits */
   IRTemp dst64 = newTemp(Ity_I64);  /* result computed at 64 bits */
   IRTemp srcB  = newTemp(Ity_I1);   /* True iff source is nonzero */

   vassert(sz == 8 || sz == 4 || sz == 2);

   modrm = getUChar(delta);
   isReg = epartIsReg(modrm);
   if (isReg) {
      delta++;
      assign( src, getIRegE(sz, pfx, modrm) );
   } else {
      Int    len;
      IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
      delta += len;
      assign( src, loadLE(ty, mkexpr(addr)) );
   }

   DIP("bs%c%c %s, %s\n",
       fwds ? 'f' : 'r', nameISize(sz),
       ( isReg ? nameIRegE(sz, pfx, modrm) : dis_buf ),
       nameIRegG(sz, pfx, modrm));

   /* First, widen src to 64 bits if it is not already. */
   assign( src64, widenUto64(mkexpr(src)) );

   /* Generate a bool expression which is zero iff the original is
      zero, and nonzero otherwise.  Ask for a CmpNE version which, if
      instrumented by Memcheck, is instrumented expensively, since
      this may be used on the output of a preceding movmskb insn,
      which has been known to be partially defined, and in need of
      careful handling. */
   assign( srcB, binop(Iop_ExpCmpNE64, mkexpr(src64), mkU64(0)) );

   /* Flags: Z is 1 iff source value is zero.  All others
      are undefined -- we force them to zero. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            IRExpr_ITE( mkexpr(srcB),
                        /* src!=0 */
                        mkU64(0),
                        /* src==0 */
                        mkU64(AMD64G_CC_MASK_Z)
                        )
       ));
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

   /* Result: iff source value is zero, we can't use
      Iop_Clz64/Iop_Ctz64 as they have no defined result in that case.
      But anyway, amd64 semantics say the result is undefined in
      such situations.  Hence handle the zero case specially. */

   /* Bleh.  What we compute:

          bsf64:  if src == 0 then {dst is unchanged}
                              else Ctz64(src)

          bsr64:  if src == 0 then {dst is unchanged}
                              else 63 - Clz64(src)

          bsf32:  if src == 0 then {dst is unchanged}
                              else Ctz64(32Uto64(src))

          bsr32:  if src == 0 then {dst is unchanged}
                              else 63 - Clz64(32Uto64(src))

          bsf16:  if src == 0 then {dst is unchanged}
                              else Ctz64(32Uto64(16Uto32(src)))

          bsr16:  if src == 0 then {dst is unchanged}
                              else 63 - Clz64(32Uto64(16Uto32(src)))
   */

   /* The main computation, guarding against zero.  Note the BSR
      formula works at 64 bits even for 16/32-bit operands because the
      upper bits of src64 are zero, so 63 - Clz64 equals the width-
      specific bit index. */
   assign( dst64,
           IRExpr_ITE(
              mkexpr(srcB),
              /* src != 0 */
              fwds ? unop(Iop_Ctz64, mkexpr(src64))
                   : binop(Iop_Sub64,
                           mkU64(63),
                           unop(Iop_Clz64, mkexpr(src64))),
              /* src == 0 -- leave dst unchanged */
              widenUto64( getIRegG( sz, pfx, modrm ) )
           )
         );

   if (sz == 2)
      assign( dst, unop(Iop_64to16, mkexpr(dst64)) );
   else
   if (sz == 4)
      assign( dst, unop(Iop_64to32, mkexpr(dst64)) );
   else
      assign( dst, mkexpr(dst64) );

   /* dump result back */
   putIRegG( sz, pfx, modrm, mkexpr(dst) );

   return delta;
}
   8383 
   8384 
   8385 /* swap rAX with the reg specified by reg and REX.B */
   8386 static
   8387 void codegen_xchg_rAX_Reg ( Prefix pfx, Int sz, UInt regLo3 )
   8388 {
   8389    IRType ty = szToITy(sz);
   8390    IRTemp t1 = newTemp(ty);
   8391    IRTemp t2 = newTemp(ty);
   8392    vassert(sz == 2 || sz == 4 || sz == 8);
   8393    vassert(regLo3 < 8);
   8394    if (sz == 8) {
   8395       assign( t1, getIReg64(R_RAX) );
   8396       assign( t2, getIRegRexB(8, pfx, regLo3) );
   8397       putIReg64( R_RAX, mkexpr(t2) );
   8398       putIRegRexB(8, pfx, regLo3, mkexpr(t1) );
   8399    } else if (sz == 4) {
   8400       assign( t1, getIReg32(R_RAX) );
   8401       assign( t2, getIRegRexB(4, pfx, regLo3) );
   8402       putIReg32( R_RAX, mkexpr(t2) );
   8403       putIRegRexB(4, pfx, regLo3, mkexpr(t1) );
   8404    } else {
   8405       assign( t1, getIReg16(R_RAX) );
   8406       assign( t2, getIRegRexB(2, pfx, regLo3) );
   8407       putIReg16( R_RAX, mkexpr(t2) );
   8408       putIRegRexB(2, pfx, regLo3, mkexpr(t1) );
   8409    }
   8410    DIP("xchg%c %s, %s\n",
   8411        nameISize(sz), nameIRegRAX(sz),
   8412                       nameIRegRexB(sz,pfx, regLo3));
   8413 }
   8414 
   8415 
   8416 static
   8417 void codegen_SAHF ( void )
   8418 {
   8419    /* Set the flags to:
   8420       (amd64g_calculate_flags_all() & AMD64G_CC_MASK_O)
   8421                                     -- retain the old O flag
   8422       | (%AH & (AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
   8423                 |AMD64G_CC_MASK_P|AMD64G_CC_MASK_C)
   8424    */
   8425    ULong  mask_SZACP = AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
   8426                        |AMD64G_CC_MASK_C|AMD64G_CC_MASK_P;
   8427    IRTemp oldflags   = newTemp(Ity_I64);
   8428    assign( oldflags, mk_amd64g_calculate_rflags_all() );
   8429    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   8430    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   8431    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   8432    stmt( IRStmt_Put( OFFB_CC_DEP1,
   8433          binop(Iop_Or64,
   8434                binop(Iop_And64, mkexpr(oldflags), mkU64(AMD64G_CC_MASK_O)),
   8435                binop(Iop_And64,
   8436                      binop(Iop_Shr64, getIReg64(R_RAX), mkU8(8)),
   8437                      mkU64(mask_SZACP))
   8438               )
   8439    ));
   8440 }
   8441 
   8442 
   8443 static
   8444 void codegen_LAHF ( void  )
   8445 {
   8446    /* AH <- EFLAGS(SF:ZF:0:AF:0:PF:1:CF) */
   8447    IRExpr* rax_with_hole;
   8448    IRExpr* new_byte;
   8449    IRExpr* new_rax;
   8450    ULong   mask_SZACP = AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
   8451                         |AMD64G_CC_MASK_C|AMD64G_CC_MASK_P;
   8452 
   8453    IRTemp  flags = newTemp(Ity_I64);
   8454    assign( flags, mk_amd64g_calculate_rflags_all() );
   8455 
   8456    rax_with_hole
   8457       = binop(Iop_And64, getIReg64(R_RAX), mkU64(~0xFF00ULL));
   8458    new_byte
   8459       = binop(Iop_Or64, binop(Iop_And64, mkexpr(flags), mkU64(mask_SZACP)),
   8460                         mkU64(1<<1));
   8461    new_rax
   8462       = binop(Iop_Or64, rax_with_hole,
   8463                         binop(Iop_Shl64, new_byte, mkU8(8)));
   8464    putIReg64(R_RAX, new_rax);
   8465 }
   8466 
   8467 
/* Disassemble CMPXCHG G,E: compare the accumulator (AL/AX/EAX/RAX per
   'size') with E; if equal, E <- G, otherwise the accumulator <- E.
   Flags are set as for SUB acc,dest in all cases.  Sets *ok to False
   (and leaves delta0 unchanged) on unacceptable F2/F3 prefix
   combinations; otherwise sets *ok to True and returns delta0
   advanced past the modRM/amode bytes. */
static
ULong dis_cmpxchg_G_E ( /*OUT*/Bool* ok,
                        const VexAbiInfo*  vbi,
                        Prefix       pfx,
                        Int          size,
                        Long         delta0 )
{
   HChar dis_buf[50];
   Int   len;

   IRType ty    = szToITy(size);
   IRTemp acc   = newTemp(ty);        /* accumulator (RAX-part) value */
   IRTemp src   = newTemp(ty);        /* G: the proposed new value */
   IRTemp dest  = newTemp(ty);        /* E: the old value at the dest */
   IRTemp dest2 = newTemp(ty);        /* value written back to E */
   IRTemp acc2  = newTemp(ty);        /* value written back to RAX-part */
   IRTemp cond  = newTemp(Ity_I1);    /* ZF after the compare */
   IRTemp addr  = IRTemp_INVALID;
   UChar  rm    = getUChar(delta0);

   /* There are 3 cases to consider:

      reg-reg: ignore any lock prefix, generate sequence based
               on ITE

      reg-mem, not locked: ignore any lock prefix, generate sequence
                           based on ITE

      reg-mem, locked: use IRCAS
   */

   /* Decide whether F2 or F3 are acceptable.  Never for register
      case, but for the memory case, one or the other is OK provided
      LOCK is also present. */
   if (epartIsReg(rm)) {
      if (haveF2orF3(pfx)) {
         *ok = False;
         return delta0;
      }
   } else {
      if (haveF2orF3(pfx)) {
         if (haveF2andF3(pfx) || !haveLOCK(pfx)) {
            *ok = False;
            return delta0;
         }
      }
   }

   if (epartIsReg(rm)) {
      /* case 1 */
      assign( dest, getIRegE(size, pfx, rm) );
      delta0++;
      assign( src, getIRegG(size, pfx, rm) );
      assign( acc, getIRegRAX(size) );
      /* Flags from the implied SUB acc,dest comparison. */
      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
      assign( cond, mk_amd64g_calculate_condition(AMD64CondZ) );
      assign( dest2, IRExpr_ITE(mkexpr(cond), mkexpr(src), mkexpr(dest)) );
      assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
      putIRegRAX(size, mkexpr(acc2));
      putIRegE(size, pfx, rm, mkexpr(dest2));
      DIP("cmpxchg%c %s,%s\n", nameISize(size),
                               nameIRegG(size,pfx,rm),
                               nameIRegE(size,pfx,rm) );
   }
   else if (!epartIsReg(rm) && !haveLOCK(pfx)) {
      /* case 2 */
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign( dest, loadLE(ty, mkexpr(addr)) );
      delta0 += len;
      assign( src, getIRegG(size, pfx, rm) );
      assign( acc, getIRegRAX(size) );
      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
      assign( cond, mk_amd64g_calculate_condition(AMD64CondZ) );
      assign( dest2, IRExpr_ITE(mkexpr(cond), mkexpr(src), mkexpr(dest)) );
      assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
      putIRegRAX(size, mkexpr(acc2));
      /* Non-atomic store is fine here: no LOCK prefix was present. */
      storeLE( mkexpr(addr), mkexpr(dest2) );
      DIP("cmpxchg%c %s,%s\n", nameISize(size),
                               nameIRegG(size,pfx,rm), dis_buf);
   }
   else if (!epartIsReg(rm) && haveLOCK(pfx)) {
      /* case 3 */
      /* src is new value.  acc is expected value.  dest is old value.
         Compute success from the output of the IRCAS, and steer the
         new value for RAX accordingly: in case of success, RAX is
         unchanged. */
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      delta0 += len;
      assign( src, getIRegG(size, pfx, rm) );
      assign( acc, getIRegRAX(size) );
      /* Single-element (non-double) little-endian CAS; 'dest'
         receives the old memory value regardless of success. */
      stmt( IRStmt_CAS(
         mkIRCAS( IRTemp_INVALID, dest, Iend_LE, mkexpr(addr),
                  NULL, mkexpr(acc), NULL, mkexpr(src) )
      ));
      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
      assign( cond, mk_amd64g_calculate_condition(AMD64CondZ) );
      assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
      putIRegRAX(size, mkexpr(acc2));
      DIP("cmpxchg%c %s,%s\n", nameISize(size),
                               nameIRegG(size,pfx,rm), dis_buf);
   }
   else vassert(0);

   *ok = True;
   return delta0;
}
   8574 
   8575 
   8576 /* Handle conditional move instructions of the form
   8577       cmovcc E(reg-or-mem), G(reg)
   8578 
   8579    E(src) is reg-or-mem
   8580    G(dst) is reg.
   8581 
   8582    If E is reg, -->    GET %E, tmps
   8583                        GET %G, tmpd
   8584                        CMOVcc tmps, tmpd
   8585                        PUT tmpd, %G
   8586 
   8587    If E is mem  -->    (getAddr E) -> tmpa
   8588                        LD (tmpa), tmps
   8589                        GET %G, tmpd
   8590                        CMOVcc tmps, tmpd
   8591                        PUT tmpd, %G
   8592 */
   8593 static
   8594 ULong dis_cmov_E_G ( const VexAbiInfo* vbi,
   8595                      Prefix        pfx,
   8596                      Int           sz,
   8597                      AMD64Condcode cond,
   8598                      Long          delta0 )
   8599 {
   8600    UChar rm  = getUChar(delta0);
   8601    HChar dis_buf[50];
   8602    Int   len;
   8603 
   8604    IRType ty   = szToITy(sz);
   8605    IRTemp tmps = newTemp(ty);
   8606    IRTemp tmpd = newTemp(ty);
   8607 
   8608    if (epartIsReg(rm)) {
   8609       assign( tmps, getIRegE(sz, pfx, rm) );
   8610       assign( tmpd, getIRegG(sz, pfx, rm) );
   8611 
   8612       putIRegG( sz, pfx, rm,
   8613                 IRExpr_ITE( mk_amd64g_calculate_condition(cond),
   8614                             mkexpr(tmps),
   8615                             mkexpr(tmpd) )
   8616               );
   8617       DIP("cmov%s %s,%s\n", name_AMD64Condcode(cond),
   8618                             nameIRegE(sz,pfx,rm),
   8619                             nameIRegG(sz,pfx,rm));
   8620       return 1+delta0;
   8621    }
   8622 
   8623    /* E refers to memory */
   8624    {
   8625       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   8626       assign( tmps, loadLE(ty, mkexpr(addr)) );
   8627       assign( tmpd, getIRegG(sz, pfx, rm) );
   8628 
   8629       putIRegG( sz, pfx, rm,
   8630                 IRExpr_ITE( mk_amd64g_calculate_condition(cond),
   8631                             mkexpr(tmps),
   8632                             mkexpr(tmpd) )
   8633               );
   8634 
   8635       DIP("cmov%s %s,%s\n", name_AMD64Condcode(cond),
   8636                             dis_buf,
   8637                             nameIRegG(sz,pfx,rm));
   8638       return len+delta0;
   8639    }
   8640 }
   8641 
   8642 
/* Disassemble XADD G,E: tmp = E + G; G <- old E; E <- tmp, with
   flags set as for the ADD.  *decode_ok is set True on every path
   below.  Returns delta0 advanced past the modRM/amode bytes. */
static
ULong dis_xadd_G_E ( /*OUT*/Bool* decode_ok,
                     const VexAbiInfo* vbi,
                     Prefix pfx, Int sz, Long delta0 )
{
   Int   len;
   UChar rm = getUChar(delta0);
   HChar dis_buf[50];

   IRType ty    = szToITy(sz);
   IRTemp tmpd  = newTemp(ty);   /* old E value */
   IRTemp tmpt0 = newTemp(ty);   /* G value */
   IRTemp tmpt1 = newTemp(ty);   /* the sum, written back to E */

   /* There are 3 cases to consider:

      reg-reg: ignore any lock prefix,
               generate 'naive' (non-atomic) sequence

      reg-mem, not locked: ignore any lock prefix, generate 'naive'
                           (non-atomic) sequence

      reg-mem, locked: use IRCAS
   */

   if (epartIsReg(rm)) {
      /* case 1 */
      assign( tmpd, getIRegE(sz, pfx, rm) );
      assign( tmpt0, getIRegG(sz, pfx, rm) );
      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
                           mkexpr(tmpd), mkexpr(tmpt0)) );
      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
      putIRegG(sz, pfx, rm, mkexpr(tmpd));
      putIRegE(sz, pfx, rm, mkexpr(tmpt1));
      DIP("xadd%c %s, %s\n",
          nameISize(sz), nameIRegG(sz,pfx,rm), nameIRegE(sz,pfx,rm));
      *decode_ok = True;
      return 1+delta0;
   }
   else if (!epartIsReg(rm) && !haveLOCK(pfx)) {
      /* case 2 */
      IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign( tmpd,  loadLE(ty, mkexpr(addr)) );
      assign( tmpt0, getIRegG(sz, pfx, rm) );
      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
                           mkexpr(tmpd), mkexpr(tmpt0)) );
      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
      /* Plain (non-atomic) store: no LOCK prefix present. */
      storeLE( mkexpr(addr), mkexpr(tmpt1) );
      putIRegG(sz, pfx, rm, mkexpr(tmpd));
      DIP("xadd%c %s, %s\n",
          nameISize(sz), nameIRegG(sz,pfx,rm), dis_buf);
      *decode_ok = True;
      return len+delta0;
   }
   else if (!epartIsReg(rm) && haveLOCK(pfx)) {
      /* case 3 */
      IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign( tmpd,  loadLE(ty, mkexpr(addr)) );
      assign( tmpt0, getIRegG(sz, pfx, rm) );
      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
                           mkexpr(tmpd), mkexpr(tmpt0)) );
      /* casLE ensures the update is atomic w.r.t. the loaded value. */
      casLE( mkexpr(addr), mkexpr(tmpd)/*expVal*/,
                           mkexpr(tmpt1)/*newVal*/, guest_RIP_curr_instr );
      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
      putIRegG(sz, pfx, rm, mkexpr(tmpd));
      DIP("xadd%c %s, %s\n",
          nameISize(sz), nameIRegG(sz,pfx,rm), dis_buf);
      *decode_ok = True;
      return len+delta0;
   }
   /*UNREACHED*/
   vassert(0);
}
   8716 
   8717 //.. /* Move 16 bits from Ew (ireg or mem) to G (a segment register). */
   8718 //..
   8719 //.. static
   8720 //.. UInt dis_mov_Ew_Sw ( UChar sorb, Long delta0 )
   8721 //.. {
   8722 //..    Int    len;
   8723 //..    IRTemp addr;
   8724 //..    UChar  rm  = getUChar(delta0);
   8725 //..    HChar  dis_buf[50];
   8726 //..
   8727 //..    if (epartIsReg(rm)) {
   8728 //..       putSReg( gregOfRM(rm), getIReg(2, eregOfRM(rm)) );
   8729 //..       DIP("movw %s,%s\n", nameIReg(2,eregOfRM(rm)), nameSReg(gregOfRM(rm)));
   8730 //..       return 1+delta0;
   8731 //..    } else {
   8732 //..       addr = disAMode ( &len, sorb, delta0, dis_buf );
   8733 //..       putSReg( gregOfRM(rm), loadLE(Ity_I16, mkexpr(addr)) );
   8734 //..       DIP("movw %s,%s\n", dis_buf, nameSReg(gregOfRM(rm)));
   8735 //..       return len+delta0;
   8736 //..    }
   8737 //.. }
   8738 //..
   8739 //.. /* Move 16 bits from G (a segment register) to Ew (ireg or mem).  If
   8740 //..    dst is ireg and sz==4, zero out top half of it.  */
   8741 //..
   8742 //.. static
   8743 //.. UInt dis_mov_Sw_Ew ( UChar sorb,
   8744 //..                      Int   sz,
   8745 //..                      UInt  delta0 )
   8746 //.. {
   8747 //..    Int    len;
   8748 //..    IRTemp addr;
   8749 //..    UChar  rm  = getUChar(delta0);
   8750 //..    HChar  dis_buf[50];
   8751 //..
   8752 //..    vassert(sz == 2 || sz == 4);
   8753 //..
   8754 //..    if (epartIsReg(rm)) {
   8755 //..       if (sz == 4)
   8756 //..          putIReg(4, eregOfRM(rm), unop(Iop_16Uto32, getSReg(gregOfRM(rm))));
   8757 //..       else
   8758 //..          putIReg(2, eregOfRM(rm), getSReg(gregOfRM(rm)));
   8759 //..
   8760 //..       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), nameIReg(sz,eregOfRM(rm)));
   8761 //..       return 1+delta0;
   8762 //..    } else {
   8763 //..       addr = disAMode ( &len, sorb, delta0, dis_buf );
   8764 //..       storeLE( mkexpr(addr), getSReg(gregOfRM(rm)) );
   8765 //..       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), dis_buf);
   8766 //..       return len+delta0;
   8767 //..    }
   8768 //.. }
   8769 
   8770 /* Handle move instructions of the form
   8771       mov S, E  meaning
   8772       mov sreg, reg-or-mem
   8773    Is passed the a ptr to the modRM byte, and the data size.  Returns
   8774    the address advanced completely over this instruction.
   8775 
   8776    VEX does not currently simulate segment registers on AMD64 which means that
   8777    instead of moving a value of a segment register, zero is moved to the
   8778    destination.  The zero value represents a null (unused) selector.  This is
   8779    not correct (especially for the %cs, %fs and %gs registers) but it seems to
   8780    provide a sufficient simulation for currently seen programs that use this
   8781    instruction.  If some program actually decides to use the obtained segment
   8782    selector for something meaningful then the zero value should be a clear
   8783    indicator that there is some problem.
   8784 
   8785    S(src) is sreg.
   8786    E(dst) is reg-or-mem
   8787 
   8788    If E is reg, -->    PUT $0, %E
   8789 
   8790    If E is mem, -->    (getAddr E) -> tmpa
   8791                        ST $0, (tmpa)
   8792 */
   8793 static
   8794 ULong dis_mov_S_E ( const VexAbiInfo* vbi,
   8795                     Prefix      pfx,
   8796                     Int         size,
   8797                     Long        delta0 )
   8798 {
   8799    Int   len;
   8800    UChar rm = getUChar(delta0);
   8801    HChar dis_buf[50];
   8802 
   8803    if (epartIsReg(rm)) {
   8804       putIRegE(size, pfx, rm, mkU(szToITy(size), 0));
   8805       DIP("mov %s,%s\n", nameSReg(gregOfRexRM(pfx, rm)),
   8806                          nameIRegE(size, pfx, rm));
   8807       return 1+delta0;
   8808    }
   8809 
   8810    /* E refers to memory */
   8811    {
   8812       IRTemp addr = disAMode(&len, vbi, pfx, delta0, dis_buf, 0);
   8813       storeLE(mkexpr(addr), mkU16(0));
   8814       DIP("mov %s,%s\n", nameSReg(gregOfRexRM(pfx, rm)),
   8815                          dis_buf);
   8816       return len+delta0;
   8817    }
   8818 }
   8819 
   8820 //.. static
   8821 //.. void dis_push_segreg ( UInt sreg, Int sz )
   8822 //.. {
   8823 //..     IRTemp t1 = newTemp(Ity_I16);
   8824 //..     IRTemp ta = newTemp(Ity_I32);
   8825 //..     vassert(sz == 2 || sz == 4);
   8826 //..
   8827 //..     assign( t1, getSReg(sreg) );
   8828 //..     assign( ta, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)) );
   8829 //..     putIReg(4, R_ESP, mkexpr(ta));
   8830 //..     storeLE( mkexpr(ta), mkexpr(t1) );
   8831 //..
   8832 //..     DIP("pushw %s\n", nameSReg(sreg));
   8833 //.. }
   8834 //..
   8835 //.. static
   8836 //.. void dis_pop_segreg ( UInt sreg, Int sz )
   8837 //.. {
   8838 //..     IRTemp t1 = newTemp(Ity_I16);
   8839 //..     IRTemp ta = newTemp(Ity_I32);
   8840 //..     vassert(sz == 2 || sz == 4);
   8841 //..
   8842 //..     assign( ta, getIReg(4, R_ESP) );
   8843 //..     assign( t1, loadLE(Ity_I16, mkexpr(ta)) );
   8844 //..
   8845 //..     putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(ta), mkU32(sz)) );
   8846 //..     putSReg( sreg, mkexpr(t1) );
   8847 //..     DIP("pop %s\n", nameSReg(sreg));
   8848 //.. }
   8849 
/* Generate IR for RET / RET $imm: load the return address from
   [RSP], advance RSP by 8 plus d64 (the optional immediate, 0 for a
   plain RET), tell the tool about the vacated redzone, and jump to
   the loaded address.  Always ends the block (Dis_StopHere). */
static
void dis_ret ( /*MOD*/DisResult* dres, const VexAbiInfo* vbi, ULong d64 )
{
   IRTemp t1 = newTemp(Ity_I64);   /* old RSP */
   IRTemp t2 = newTemp(Ity_I64);   /* return address popped from stack */
   IRTemp t3 = newTemp(Ity_I64);   /* new RSP */
   assign(t1, getIReg64(R_RSP));
   assign(t2, loadLE(Ity_I64,mkexpr(t1)));
   assign(t3, binop(Iop_Add64, mkexpr(t1), mkU64(8+d64)));
   putIReg64(R_RSP, mkexpr(t3));
   make_redzone_AbiHint(vbi, t3, t2/*nia*/, "ret");
   jmp_treg(dres, Ijk_Ret, t2);
   vassert(dres->whatNext == Dis_StopHere);
}
   8864 
   8865 
   8866 /*------------------------------------------------------------*/
   8867 /*--- SSE/SSE2/SSE3 helpers                                ---*/
   8868 /*------------------------------------------------------------*/
   8869 
   8870 /* Indicates whether the op requires a rounding-mode argument.  Note
   8871    that this covers only vector floating point arithmetic ops, and
   8872    omits the scalar ones that need rounding modes.  Note also that
   8873    inconsistencies here will get picked up later by the IR sanity
   8874    checker, so this isn't correctness-critical. */
   8875 static Bool requiresRMode ( IROp op )
   8876 {
   8877    switch (op) {
   8878       /* 128 bit ops */
   8879       case Iop_Add32Fx4: case Iop_Sub32Fx4:
   8880       case Iop_Mul32Fx4: case Iop_Div32Fx4:
   8881       case Iop_Add64Fx2: case Iop_Sub64Fx2:
   8882       case Iop_Mul64Fx2: case Iop_Div64Fx2:
   8883       /* 256 bit ops */
   8884       case Iop_Add32Fx8: case Iop_Sub32Fx8:
   8885       case Iop_Mul32Fx8: case Iop_Div32Fx8:
   8886       case Iop_Add64Fx4: case Iop_Sub64Fx4:
   8887       case Iop_Mul64Fx4: case Iop_Div64Fx4:
   8888          return True;
   8889       default:
   8890          break;
   8891    }
   8892    return False;
   8893 }
   8894 
   8895 
   8896 /* Worker function; do not call directly.
   8897    Handles full width G = G `op` E   and   G = (not G) `op` E.
   8898 */
   8899 
static ULong dis_SSE_E_to_G_all_wrk (
                const VexAbiInfo* vbi,
                Prefix pfx, Long delta,
                const HChar* opname, IROp op,
                Bool   invertG
             )
{
   HChar   dis_buf[50];
   Int     alen;
   IRTemp  addr;
   UChar   rm = getUChar(delta);
   /* Ops with a rounding mode are built with triop, the rest with
      binop (see requiresRMode). */
   Bool    needsRMode = requiresRMode(op);
   /* The G operand, optionally bitwise-inverted (for the ANDN-style
      variants). */
   IRExpr* gpart
      = invertG ? unop(Iop_NotV128, getXMMReg(gregOfRexRM(pfx,rm)))
                : getXMMReg(gregOfRexRM(pfx,rm));
   if (epartIsReg(rm)) {
      putXMMReg(
         gregOfRexRM(pfx,rm),
         needsRMode
            ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        gpart,
                        getXMMReg(eregOfRexRM(pfx,rm)))
            : binop(op, gpart,
                        getXMMReg(eregOfRexRM(pfx,rm)))
      );
      DIP("%s %s,%s\n", opname,
                        nameXMMReg(eregOfRexRM(pfx,rm)),
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+1;
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      putXMMReg(
         gregOfRexRM(pfx,rm),
         needsRMode
            ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        gpart,
                        loadLE(Ity_V128, mkexpr(addr)))
            : binop(op, gpart,
                        loadLE(Ity_V128, mkexpr(addr)))
      );
      DIP("%s %s,%s\n", opname,
                        dis_buf,
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+alen;
   }
}
   8946 
   8947 
   8948 /* All lanes SSE binary operation, G = G `op` E. */
   8949 
static
ULong dis_SSE_E_to_G_all ( const VexAbiInfo* vbi,
                           Prefix pfx, Long delta,
                           const HChar* opname, IROp op )
{
   /* invertG=False: plain G = G `op` E. */
   return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, False );
}
   8957 
   8958 /* All lanes SSE binary operation, G = (not G) `op` E. */
   8959 
static
ULong dis_SSE_E_to_G_all_invG ( const VexAbiInfo* vbi,
                                Prefix pfx, Long delta,
                                const HChar* opname, IROp op )
{
   /* invertG=True: G = (not G) `op` E. */
   return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, True );
}
   8967 
   8968 
   8969 /* Lowest 32-bit lane only SSE binary operation, G = G `op` E. */
   8970 
static ULong dis_SSE_E_to_G_lo32 ( const VexAbiInfo* vbi,
                                   Prefix pfx, Long delta,
                                   const HChar* opname, IROp op )
{
   HChar   dis_buf[50];
   Int     alen;
   IRTemp  addr;
   UChar   rm = getUChar(delta);
   IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
   if (epartIsReg(rm)) {
      /* Register E: pass the whole register; 'op' is a low-lane-only
         operation. */
      putXMMReg( gregOfRexRM(pfx,rm),
                 binop(op, gpart,
                           getXMMReg(eregOfRexRM(pfx,rm))) );
      DIP("%s %s,%s\n", opname,
                        nameXMMReg(eregOfRexRM(pfx,rm)),
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+1;
   } else {
      /* We can only do a 32-bit memory read, so the upper 3/4 of the
         E operand needs to be made simply of zeroes. */
      IRTemp epart = newTemp(Ity_V128);
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( epart, unop( Iop_32UtoV128,
                           loadLE(Ity_I32, mkexpr(addr))) );
      putXMMReg( gregOfRexRM(pfx,rm),
                 binop(op, gpart, mkexpr(epart)) );
      DIP("%s %s,%s\n", opname,
                        dis_buf,
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+alen;
   }
}
   9003 
   9004 
   9005 /* Lower 64-bit lane only SSE binary operation, G = G `op` E. */
   9006 
static ULong dis_SSE_E_to_G_lo64 ( const VexAbiInfo* vbi,
                                   Prefix pfx, Long delta,
                                   const HChar* opname, IROp op )
{
   HChar   dis_buf[50];
   Int     alen;
   IRTemp  addr;
   UChar   rm = getUChar(delta);
   IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
   if (epartIsReg(rm)) {
      /* Register E: pass the whole register; 'op' is a low-lane-only
         operation. */
      putXMMReg( gregOfRexRM(pfx,rm),
                 binop(op, gpart,
                           getXMMReg(eregOfRexRM(pfx,rm))) );
      DIP("%s %s,%s\n", opname,
                        nameXMMReg(eregOfRexRM(pfx,rm)),
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+1;
   } else {
      /* We can only do a 64-bit memory read, so the upper half of the
         E operand needs to be made simply of zeroes. */
      IRTemp epart = newTemp(Ity_V128);
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( epart, unop( Iop_64UtoV128,
                           loadLE(Ity_I64, mkexpr(addr))) );
      putXMMReg( gregOfRexRM(pfx,rm),
                 binop(op, gpart, mkexpr(epart)) );
      DIP("%s %s,%s\n", opname,
                        dis_buf,
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+alen;
   }
}
   9039 
   9040 
   9041 /* All lanes unary SSE operation, G = op(E). */
   9042 
static ULong dis_SSE_E_to_G_unary_all (
                const VexAbiInfo* vbi,
                Prefix pfx, Long delta,
                const HChar* opname, IROp op
             )
{
   HChar   dis_buf[50];
   Int     alen;
   IRTemp  addr;
   UChar   rm = getUChar(delta);
   // Sqrt32Fx4 and Sqrt64Fx2 take a rounding mode, which is faked
   // up in the usual way.
   Bool needsIRRM = op == Iop_Sqrt32Fx4 || op == Iop_Sqrt64Fx2;
   if (epartIsReg(rm)) {
      /* Register source. */
      IRExpr* src = getXMMReg(eregOfRexRM(pfx,rm));
      /* XXXROUNDINGFIXME */
      IRExpr* res = needsIRRM ? binop(op, get_FAKE_roundingmode(), src)
                              : unop(op, src);
      putXMMReg( gregOfRexRM(pfx,rm), res );
      DIP("%s %s,%s\n", opname,
                        nameXMMReg(eregOfRexRM(pfx,rm)),
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+1;
   } else {
      /* Memory source: full 128-bit load. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      IRExpr* src = loadLE(Ity_V128, mkexpr(addr));
      /* XXXROUNDINGFIXME */
      IRExpr* res = needsIRRM ? binop(op, get_FAKE_roundingmode(), src)
                              : unop(op, src);
      putXMMReg( gregOfRexRM(pfx,rm), res );
      DIP("%s %s,%s\n", opname,
                        dis_buf,
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+alen;
   }
}
   9079 
   9080 
   9081 /* Lowest 32-bit lane only unary SSE operation, G = op(E). */
   9082 
static ULong dis_SSE_E_to_G_unary_lo32 (
                const VexAbiInfo* vbi,
                Prefix pfx, Long delta,
                const HChar* opname, IROp op
             )
{
   /* First we need to get the old G value and patch the low 32 bits
      of the E operand into it.  Then apply op and write back to G. */
   HChar   dis_buf[50];
   Int     alen;
   IRTemp  addr;
   UChar   rm = getUChar(delta);
   IRTemp  oldG0 = newTemp(Ity_V128);   /* current G */
   IRTemp  oldG1 = newTemp(Ity_V128);   /* G with E's low lane patched in */

   assign( oldG0, getXMMReg(gregOfRexRM(pfx,rm)) );

   if (epartIsReg(rm)) {
      assign( oldG1,
              binop( Iop_SetV128lo32,
                     mkexpr(oldG0),
                     getXMMRegLane32(eregOfRexRM(pfx,rm), 0)) );
      putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
      DIP("%s %s,%s\n", opname,
                        nameXMMReg(eregOfRexRM(pfx,rm)),
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+1;
   } else {
      /* Memory E: only 32 bits are read. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( oldG1,
              binop( Iop_SetV128lo32,
                     mkexpr(oldG0),
                     loadLE(Ity_I32, mkexpr(addr)) ));
      putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
      DIP("%s %s,%s\n", opname,
                        dis_buf,
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+alen;
   }
}
   9123 
   9124 
   9125 /* Lowest 64-bit lane only unary SSE operation, G = op(E). */
   9126 
static ULong dis_SSE_E_to_G_unary_lo64 (
                const VexAbiInfo* vbi,
                Prefix pfx, Long delta,
                const HChar* opname, IROp op
             )
{
   /* First we need to get the old G value and patch the low 64 bits
      of the E operand into it.  Then apply op and write back to G. */
   HChar   dis_buf[50];
   Int     alen;
   IRTemp  addr;
   UChar   rm = getUChar(delta);
   IRTemp  oldG0 = newTemp(Ity_V128);   /* current G */
   IRTemp  oldG1 = newTemp(Ity_V128);   /* G with E's low lane patched in */

   assign( oldG0, getXMMReg(gregOfRexRM(pfx,rm)) );

   if (epartIsReg(rm)) {
      assign( oldG1,
              binop( Iop_SetV128lo64,
                     mkexpr(oldG0),
                     getXMMRegLane64(eregOfRexRM(pfx,rm), 0)) );
      putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
      DIP("%s %s,%s\n", opname,
                        nameXMMReg(eregOfRexRM(pfx,rm)),
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+1;
   } else {
      /* Memory E: only 64 bits are read. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( oldG1,
              binop( Iop_SetV128lo64,
                     mkexpr(oldG0),
                     loadLE(Ity_I64, mkexpr(addr)) ));
      putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
      DIP("%s %s,%s\n", opname,
                        dis_buf,
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+alen;
   }
}
   9167 
   9168 
   9169 /* SSE integer binary operation:
   9170       G = G `op` E   (eLeft == False)
   9171       G = E `op` G   (eLeft == True)
   9172 */
static ULong dis_SSEint_E_to_G(
                const VexAbiInfo* vbi,
                Prefix pfx, Long delta,
                const HChar* opname, IROp op,
                Bool   eLeft
             )
{
   HChar   dis_buf[50];
   Int     alen;
   IRTemp  addr;
   UChar   rm = getUChar(delta);
   IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
   IRExpr* epart = NULL;
   if (epartIsReg(rm)) {
      epart = getXMMReg(eregOfRexRM(pfx,rm));
      DIP("%s %s,%s\n", opname,
                        nameXMMReg(eregOfRexRM(pfx,rm)),
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      delta += 1;
   } else {
      addr  = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      epart = loadLE(Ity_V128, mkexpr(addr));
      DIP("%s %s,%s\n", opname,
                        dis_buf,
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      delta += alen;
   }
   /* eLeft selects operand order: E `op` G vs G `op` E (matters for
      non-commutative ops such as subtracts and shifts). */
   putXMMReg( gregOfRexRM(pfx,rm),
              eLeft ? binop(op, epart, gpart)
                    : binop(op, gpart, epart) );
   return delta;
}
   9205 
   9206 
   9207 /* Helper for doing SSE FP comparisons.  False return ==> unhandled.
   9208    This is all a bit of a kludge in that it ignores the subtleties of
   9209    ordered-vs-unordered and signalling-vs-nonsignalling in the Intel
   9210    spec. */
static Bool findSSECmpOp ( /*OUT*/Bool* preSwapP,
                           /*OUT*/IROp* opP,
                           /*OUT*/Bool* postNotP,
                           UInt imm8, Bool all_lanes, Int sz )
{
   /* Maps an 8-bit comparison-predicate immediate to a
      (swap-operands-first, IR comparison op, invert-result-after)
      triple.  sz is the lane size in bytes (4 or 8); all_lanes
      selects packed vs lowest-lane-only ops.  Returns False for
      predicates this table cannot express. */

   /* Predicates 0..0x1F exist (AVX); anything larger is invalid. */
   if (imm8 >= 32) return False;

   /* First, compute a (preSwap, op, postNot) triple from
      the supplied imm8.  The table is written in terms of the
      32Fx4 ops; the lane width/laneage is fixed up afterwards. */
   Bool pre = False;
   IROp op  = Iop_INVALID;
   Bool not = False;

#  define XXX(_pre, _op, _not) { pre = _pre; op = _op; not = _not; }
   // If you add a case here, add a corresponding test for both VCMPSD_128
   // and VCMPSS_128 in avx-1.c.
   // Cases 0xA and above are
   //    "Enhanced Comparison Predicate[s] for VEX-Encoded [insns]"
   switch (imm8) {
      // "O" = ordered, "U" = unordered
      // "Q" = non-signalling (quiet), "S" = signalling
      //
      //             swap operands?
      //             |
      //             |      cmp op          invert after?
      //             |      |               |
      //             v      v               v
      case 0x0:  XXX(False, Iop_CmpEQ32Fx4, False); break; // EQ_OQ
      case 0x8:  XXX(False, Iop_CmpEQ32Fx4, False); break; // EQ_UQ
      case 0x10: XXX(False, Iop_CmpEQ32Fx4, False); break; // EQ_OS
      case 0x18: XXX(False, Iop_CmpEQ32Fx4, False); break; // EQ_US
      //
      case 0x1:  XXX(False, Iop_CmpLT32Fx4, False); break; // LT_OS
      case 0x11: XXX(False, Iop_CmpLT32Fx4, False); break; // LT_OQ
      //
      case 0x2:  XXX(False, Iop_CmpLE32Fx4, False); break; // LE_OS
      case 0x12: XXX(False, Iop_CmpLE32Fx4, False); break; // LE_OQ
      //
      case 0x3:  XXX(False, Iop_CmpUN32Fx4, False); break; // UNORD_Q
      case 0x13: XXX(False, Iop_CmpUN32Fx4, False); break; // UNORD_S
      //
      // 0xC: this isn't really right because it returns all-1s when
      // either operand is a NaN, and it should return all-0s.
      case 0x4:  XXX(False, Iop_CmpEQ32Fx4, True);  break; // NEQ_UQ
      case 0xC:  XXX(False, Iop_CmpEQ32Fx4, True);  break; // NEQ_OQ
      case 0x14: XXX(False, Iop_CmpEQ32Fx4, True);  break; // NEQ_US
      case 0x1C: XXX(False, Iop_CmpEQ32Fx4, True);  break; // NEQ_OS
      //
      case 0x5:  XXX(False, Iop_CmpLT32Fx4, True);  break; // NLT_US
      case 0x15: XXX(False, Iop_CmpLT32Fx4, True);  break; // NLT_UQ
      //
      case 0x6:  XXX(False, Iop_CmpLE32Fx4, True);  break; // NLE_US
      case 0x16: XXX(False, Iop_CmpLE32Fx4, True);  break; // NLE_UQ
      //
      case 0x7:  XXX(False, Iop_CmpUN32Fx4, True);  break; // ORD_Q
      case 0x17: XXX(False, Iop_CmpUN32Fx4, True);  break; // ORD_S
      //
      case 0x9:  XXX(True,  Iop_CmpLE32Fx4, True);  break; // NGE_US
      case 0x19: XXX(True,  Iop_CmpLE32Fx4, True);  break; // NGE_UQ
      //
      case 0xA:  XXX(True,  Iop_CmpLT32Fx4, True);  break; // NGT_US
      case 0x1A: XXX(True,  Iop_CmpLT32Fx4, True);  break; // NGT_UQ
      //
      case 0xD:  XXX(True,  Iop_CmpLE32Fx4, False); break; // GE_OS
      case 0x1D: XXX(True,  Iop_CmpLE32Fx4, False); break; // GE_OQ
      //
      case 0xE:  XXX(True,  Iop_CmpLT32Fx4, False); break; // GT_OS
      case 0x1E: XXX(True,  Iop_CmpLT32Fx4, False); break; // GT_OQ
      // Unhandled:
      // 0xB  FALSE_OQ
      // 0xF  TRUE_UQ
      // 0x1B  FALSE_OS
      // 0x1F  TRUE_US
      /* Don't forget to add test cases to VCMPSS_128_<imm8> in
         avx-1.c if new cases turn up. */
      default: break;
   }
#  undef XXX
   if (op == Iop_INVALID) return False;

   /* Now convert the op into one with the same arithmetic but that is
      correct for the width and laneage requirements. */

   /* sz==4, packed: the table is already in 32Fx4 form, so this is an
      identity mapping, kept for symmetry with the other three. */
   /**/ if (sz == 4 && all_lanes) {
      switch (op) {
         case Iop_CmpEQ32Fx4: op = Iop_CmpEQ32Fx4; break;
         case Iop_CmpLT32Fx4: op = Iop_CmpLT32Fx4; break;
         case Iop_CmpLE32Fx4: op = Iop_CmpLE32Fx4; break;
         case Iop_CmpUN32Fx4: op = Iop_CmpUN32Fx4; break;
         default: vassert(0);
      }
   }
   /* sz==4, lowest lane only (scalar SS forms). */
   else if (sz == 4 && !all_lanes) {
      switch (op) {
         case Iop_CmpEQ32Fx4: op = Iop_CmpEQ32F0x4; break;
         case Iop_CmpLT32Fx4: op = Iop_CmpLT32F0x4; break;
         case Iop_CmpLE32Fx4: op = Iop_CmpLE32F0x4; break;
         case Iop_CmpUN32Fx4: op = Iop_CmpUN32F0x4; break;
         default: vassert(0);
      }
   }
   /* sz==8, packed (PD forms). */
   else if (sz == 8 && all_lanes) {
      switch (op) {
         case Iop_CmpEQ32Fx4: op = Iop_CmpEQ64Fx2; break;
         case Iop_CmpLT32Fx4: op = Iop_CmpLT64Fx2; break;
         case Iop_CmpLE32Fx4: op = Iop_CmpLE64Fx2; break;
         case Iop_CmpUN32Fx4: op = Iop_CmpUN64Fx2; break;
         default: vassert(0);
      }
   }
   /* sz==8, lowest lane only (scalar SD forms). */
   else if (sz == 8 && !all_lanes) {
      switch (op) {
         case Iop_CmpEQ32Fx4: op = Iop_CmpEQ64F0x2; break;
         case Iop_CmpLT32Fx4: op = Iop_CmpLT64F0x2; break;
         case Iop_CmpLE32Fx4: op = Iop_CmpLE64F0x2; break;
         case Iop_CmpUN32Fx4: op = Iop_CmpUN64F0x2; break;
         default: vassert(0);
      }
   }
   else {
      vpanic("findSSECmpOp(amd64,guest)");
   }

   *preSwapP = pre; *opP = op; *postNotP = not;
   return True;
}
   9337 
   9338 
   9339 /* Handles SSE 32F/64F comparisons.  It can fail, in which case it
   9340    returns the original delta to indicate failure. */
   9341 
static Long dis_SSE_cmp_E_to_G ( const VexAbiInfo* vbi,
                                 Prefix pfx, Long delta,
                                 const HChar* opname, Bool all_lanes, Int sz )
{
   /* Decodes the non-VEX CMP{P,S}{S,D} $imm8,E,G forms.  sz is the
      lane size in bytes (4 or 8); all_lanes selects packed vs
      lowest-lane-only.  On success returns the advanced delta; on
      failure (undecodable imm8) returns the original delta. */
   Long    delta0 = delta;
   HChar   dis_buf[50];
   Int     alen;
   UInt    imm8;
   IRTemp  addr;
   Bool    preSwap = False;
   IROp    op      = Iop_INVALID;
   Bool    postNot = False;
   IRTemp  plain   = newTemp(Ity_V128);  /* result before any post-inversion */
   UChar   rm      = getUChar(delta);
   UShort  mask    = 0;
   vassert(sz == 4 || sz == 8);
   if (epartIsReg(rm)) {
      imm8 = getUChar(delta+1);
      /* Non-VEX encodings only have predicates 0..7. */
      if (imm8 >= 8) return delta0; /* FAIL */
      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
      if (!ok) return delta0; /* FAIL */
      vassert(!preSwap); /* never needed for imm8 < 8 */
      assign( plain, binop(op, getXMMReg(gregOfRexRM(pfx,rm)),
                               getXMMReg(eregOfRexRM(pfx,rm))) );
      delta += 2;
      DIP("%s $%u,%s,%s\n", opname,
                            imm8,
                            nameXMMReg(eregOfRexRM(pfx,rm)),
                            nameXMMReg(gregOfRexRM(pfx,rm)) );
   } else {
      /* Memory form: the scalar variants load only 4 or 8 bytes and
         zero-extend into a V128 so the same binop can be used. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
      imm8 = getUChar(delta+alen);
      if (imm8 >= 8) return delta0; /* FAIL */
      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
      if (!ok) return delta0; /* FAIL */
      vassert(!preSwap); /* never needed for imm8 < 8 */
      assign( plain,
              binop(
                 op,
                 getXMMReg(gregOfRexRM(pfx,rm)),
                   all_lanes
                      ? loadLE(Ity_V128, mkexpr(addr))
                   : sz == 8
                      ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
                   : /*sz==4*/
                      unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)))
              )
      );
      delta += alen+1;
      DIP("%s $%u,%s,%s\n", opname,
                            imm8,
                            dis_buf,
                            nameXMMReg(gregOfRexRM(pfx,rm)) );
   }

   /* Apply post-inversion, if required.  For scalar forms only the
      low lane may be inverted: mkV128 takes a 16-bit mask with one
      bit per byte, so 0x000F flips the low 4 bytes and 0x00FF the
      low 8. */
   if (postNot && all_lanes) {
      putXMMReg( gregOfRexRM(pfx,rm),
                 unop(Iop_NotV128, mkexpr(plain)) );
   }
   else
   if (postNot && !all_lanes) {
      mask = toUShort(sz==4 ? 0x000F : 0x00FF);
      putXMMReg( gregOfRexRM(pfx,rm),
                 binop(Iop_XorV128, mkexpr(plain), mkV128(mask)) );
   }
   else {
      putXMMReg( gregOfRexRM(pfx,rm), mkexpr(plain) );
   }

   return delta;
}
   9413 
   9414 
   9415 /* Vector by scalar shift of G by the amount specified at the bottom
   9416    of E. */
   9417 
   9418 static ULong dis_SSE_shiftG_byE ( const VexAbiInfo* vbi,
   9419                                   Prefix pfx, Long delta,
   9420                                   const HChar* opname, IROp op )
   9421 {
   9422    HChar   dis_buf[50];
   9423    Int     alen, size;
   9424    IRTemp  addr;
   9425    Bool    shl, shr, sar;
   9426    UChar   rm   = getUChar(delta);
   9427    IRTemp  g0   = newTemp(Ity_V128);
   9428    IRTemp  g1   = newTemp(Ity_V128);
   9429    IRTemp  amt  = newTemp(Ity_I64);
   9430    IRTemp  amt8 = newTemp(Ity_I8);
   9431    if (epartIsReg(rm)) {
   9432       assign( amt, getXMMRegLane64(eregOfRexRM(pfx,rm), 0) );
   9433       DIP("%s %s,%s\n", opname,
   9434                         nameXMMReg(eregOfRexRM(pfx,rm)),
   9435                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   9436       delta++;
   9437    } else {
   9438       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   9439       assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
   9440       DIP("%s %s,%s\n", opname,
   9441                         dis_buf,
   9442                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   9443       delta += alen;
   9444    }
   9445    assign( g0,   getXMMReg(gregOfRexRM(pfx,rm)) );
   9446    assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
   9447 
   9448    shl = shr = sar = False;
   9449    size = 0;
   9450    switch (op) {
   9451       case Iop_ShlN16x8: shl = True; size = 32; break;
   9452       case Iop_ShlN32x4: shl = True; size = 32; break;
   9453       case Iop_ShlN64x2: shl = True; size = 64; break;
   9454       case Iop_SarN16x8: sar = True; size = 16; break;
   9455       case Iop_SarN32x4: sar = True; size = 32; break;
   9456       case Iop_ShrN16x8: shr = True; size = 16; break;
   9457       case Iop_ShrN32x4: shr = True; size = 32; break;
   9458       case Iop_ShrN64x2: shr = True; size = 64; break;
   9459       default: vassert(0);
   9460    }
   9461 
   9462    if (shl || shr) {
   9463      assign(
   9464         g1,
   9465         IRExpr_ITE(
   9466            binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
   9467            binop(op, mkexpr(g0), mkexpr(amt8)),
   9468            mkV128(0x0000)
   9469         )
   9470      );
   9471    } else
   9472    if (sar) {
   9473      assign(
   9474         g1,
   9475         IRExpr_ITE(
   9476            binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
   9477            binop(op, mkexpr(g0), mkexpr(amt8)),
   9478            binop(op, mkexpr(g0), mkU8(size-1))
   9479         )
   9480      );
   9481    } else {
   9482       vassert(0);
   9483    }
   9484 
   9485    putXMMReg( gregOfRexRM(pfx,rm), mkexpr(g1) );
   9486    return delta;
   9487 }
   9488 
   9489 
   9490 /* Vector by scalar shift of E by an immediate byte. */
   9491 
   9492 static
   9493 ULong dis_SSE_shiftE_imm ( Prefix pfx,
   9494                            Long delta, const HChar* opname, IROp op )
   9495 {
   9496    Bool    shl, shr, sar;
   9497    UChar   rm   = getUChar(delta);
   9498    IRTemp  e0   = newTemp(Ity_V128);
   9499    IRTemp  e1   = newTemp(Ity_V128);
   9500    UChar   amt, size;
   9501    vassert(epartIsReg(rm));
   9502    vassert(gregLO3ofRM(rm) == 2
   9503            || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
   9504    amt = getUChar(delta+1);
   9505    delta += 2;
   9506    DIP("%s $%d,%s\n", opname,
   9507                       (Int)amt,
   9508                       nameXMMReg(eregOfRexRM(pfx,rm)) );
   9509    assign( e0, getXMMReg(eregOfRexRM(pfx,rm)) );
   9510 
   9511    shl = shr = sar = False;
   9512    size = 0;
   9513    switch (op) {
   9514       case Iop_ShlN16x8: shl = True; size = 16; break;
   9515       case Iop_ShlN32x4: shl = True; size = 32; break;
   9516       case Iop_ShlN64x2: shl = True; size = 64; break;
   9517       case Iop_SarN16x8: sar = True; size = 16; break;
   9518       case Iop_SarN32x4: sar = True; size = 32; break;
   9519       case Iop_ShrN16x8: shr = True; size = 16; break;
   9520       case Iop_ShrN32x4: shr = True; size = 32; break;
   9521       case Iop_ShrN64x2: shr = True; size = 64; break;
   9522       default: vassert(0);
   9523    }
   9524 
   9525    if (shl || shr) {
   9526      assign( e1, amt >= size
   9527                     ? mkV128(0x0000)
   9528                     : binop(op, mkexpr(e0), mkU8(amt))
   9529      );
   9530    } else
   9531    if (sar) {
   9532      assign( e1, amt >= size
   9533                     ? binop(op, mkexpr(e0), mkU8(size-1))
   9534                     : binop(op, mkexpr(e0), mkU8(amt))
   9535      );
   9536    } else {
   9537       vassert(0);
   9538    }
   9539 
   9540    putXMMReg( eregOfRexRM(pfx,rm), mkexpr(e1) );
   9541    return delta;
   9542 }
   9543 
   9544 
   9545 /* Get the current SSE rounding mode. */
   9546 
   9547 static IRExpr* /* :: Ity_I32 */ get_sse_roundingmode ( void )
   9548 {
   9549    return
   9550       unop( Iop_64to32,
   9551             binop( Iop_And64,
   9552                    IRExpr_Get( OFFB_SSEROUND, Ity_I64 ),
   9553                    mkU64(3) ));
   9554 }
   9555 
   9556 static void put_sse_roundingmode ( IRExpr* sseround )
   9557 {
   9558    vassert(typeOfIRExpr(irsb->tyenv, sseround) == Ity_I32);
   9559    stmt( IRStmt_Put( OFFB_SSEROUND,
   9560                      unop(Iop_32Uto64,sseround) ) );
   9561 }
   9562 
   9563 /* Break a V128-bit value up into four 32-bit ints. */
   9564 
   9565 static void breakupV128to32s ( IRTemp t128,
   9566                                /*OUTs*/
   9567                                IRTemp* t3, IRTemp* t2,
   9568                                IRTemp* t1, IRTemp* t0 )
   9569 {
   9570    IRTemp hi64 = newTemp(Ity_I64);
   9571    IRTemp lo64 = newTemp(Ity_I64);
   9572    assign( hi64, unop(Iop_V128HIto64, mkexpr(t128)) );
   9573    assign( lo64, unop(Iop_V128to64,   mkexpr(t128)) );
   9574 
   9575    vassert(t0 && *t0 == IRTemp_INVALID);
   9576    vassert(t1 && *t1 == IRTemp_INVALID);
   9577    vassert(t2 && *t2 == IRTemp_INVALID);
   9578    vassert(t3 && *t3 == IRTemp_INVALID);
   9579 
   9580    *t0 = newTemp(Ity_I32);
   9581    *t1 = newTemp(Ity_I32);
   9582    *t2 = newTemp(Ity_I32);
   9583    *t3 = newTemp(Ity_I32);
   9584    assign( *t0, unop(Iop_64to32,   mkexpr(lo64)) );
   9585    assign( *t1, unop(Iop_64HIto32, mkexpr(lo64)) );
   9586    assign( *t2, unop(Iop_64to32,   mkexpr(hi64)) );
   9587    assign( *t3, unop(Iop_64HIto32, mkexpr(hi64)) );
   9588 }
   9589 
   9590 /* Construct a V128-bit value from four 32-bit ints. */
   9591 
   9592 static IRExpr* mkV128from32s ( IRTemp t3, IRTemp t2,
   9593                                IRTemp t1, IRTemp t0 )
   9594 {
   9595    return
   9596       binop( Iop_64HLtoV128,
   9597              binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
   9598              binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0))
   9599    );
   9600 }
   9601 
   9602 /* Break a 64-bit value up into four 16-bit ints. */
   9603 
   9604 static void breakup64to16s ( IRTemp t64,
   9605                              /*OUTs*/
   9606                              IRTemp* t3, IRTemp* t2,
   9607                              IRTemp* t1, IRTemp* t0 )
   9608 {
   9609    IRTemp hi32 = newTemp(Ity_I32);
   9610    IRTemp lo32 = newTemp(Ity_I32);
   9611    assign( hi32, unop(Iop_64HIto32, mkexpr(t64)) );
   9612    assign( lo32, unop(Iop_64to32,   mkexpr(t64)) );
   9613 
   9614    vassert(t0 && *t0 == IRTemp_INVALID);
   9615    vassert(t1 && *t1 == IRTemp_INVALID);
   9616    vassert(t2 && *t2 == IRTemp_INVALID);
   9617    vassert(t3 && *t3 == IRTemp_INVALID);
   9618 
   9619    *t0 = newTemp(Ity_I16);
   9620    *t1 = newTemp(Ity_I16);
   9621    *t2 = newTemp(Ity_I16);
   9622    *t3 = newTemp(Ity_I16);
   9623    assign( *t0, unop(Iop_32to16,   mkexpr(lo32)) );
   9624    assign( *t1, unop(Iop_32HIto16, mkexpr(lo32)) );
   9625    assign( *t2, unop(Iop_32to16,   mkexpr(hi32)) );
   9626    assign( *t3, unop(Iop_32HIto16, mkexpr(hi32)) );
   9627 }
   9628 
   9629 /* Construct a 64-bit value from four 16-bit ints. */
   9630 
   9631 static IRExpr* mk64from16s ( IRTemp t3, IRTemp t2,
   9632                              IRTemp t1, IRTemp t0 )
   9633 {
   9634    return
   9635       binop( Iop_32HLto64,
   9636              binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)),
   9637              binop(Iop_16HLto32, mkexpr(t1), mkexpr(t0))
   9638    );
   9639 }
   9640 
   9641 /* Break a V256-bit value up into four 64-bit ints. */
   9642 
   9643 static void breakupV256to64s ( IRTemp t256,
   9644                                /*OUTs*/
   9645                                IRTemp* t3, IRTemp* t2,
   9646                                IRTemp* t1, IRTemp* t0 )
   9647 {
   9648    vassert(t0 && *t0 == IRTemp_INVALID);
   9649    vassert(t1 && *t1 == IRTemp_INVALID);
   9650    vassert(t2 && *t2 == IRTemp_INVALID);
   9651    vassert(t3 && *t3 == IRTemp_INVALID);
   9652    *t0 = newTemp(Ity_I64);
   9653    *t1 = newTemp(Ity_I64);
   9654    *t2 = newTemp(Ity_I64);
   9655    *t3 = newTemp(Ity_I64);
   9656    assign( *t0, unop(Iop_V256to64_0, mkexpr(t256)) );
   9657    assign( *t1, unop(Iop_V256to64_1, mkexpr(t256)) );
   9658    assign( *t2, unop(Iop_V256to64_2, mkexpr(t256)) );
   9659    assign( *t3, unop(Iop_V256to64_3, mkexpr(t256)) );
   9660 }
   9661 
   9662 /* Break a V256-bit value up into two V128s. */
   9663 
   9664 static void breakupV256toV128s ( IRTemp t256,
   9665                                  /*OUTs*/
   9666                                  IRTemp* t1, IRTemp* t0 )
   9667 {
   9668    vassert(t0 && *t0 == IRTemp_INVALID);
   9669    vassert(t1 && *t1 == IRTemp_INVALID);
   9670    *t0 = newTemp(Ity_V128);
   9671    *t1 = newTemp(Ity_V128);
   9672    assign(*t1, unop(Iop_V256toV128_1, mkexpr(t256)));
   9673    assign(*t0, unop(Iop_V256toV128_0, mkexpr(t256)));
   9674 }
   9675 
   9676 /* Break a V256-bit value up into eight 32-bit ints.  */
   9677 
   9678 static void breakupV256to32s ( IRTemp t256,
   9679                                /*OUTs*/
   9680                                IRTemp* t7, IRTemp* t6,
   9681                                IRTemp* t5, IRTemp* t4,
   9682                                IRTemp* t3, IRTemp* t2,
   9683                                IRTemp* t1, IRTemp* t0 )
   9684 {
   9685    IRTemp t128_1 = IRTemp_INVALID;
   9686    IRTemp t128_0 = IRTemp_INVALID;
   9687    breakupV256toV128s( t256, &t128_1, &t128_0 );
   9688    breakupV128to32s( t128_1, t7, t6, t5, t4 );
   9689    breakupV128to32s( t128_0, t3, t2, t1, t0 );
   9690 }
   9691 
   9692 /* Break a V128-bit value up into two 64-bit ints. */
   9693 
   9694 static void breakupV128to64s ( IRTemp t128,
   9695                                /*OUTs*/
   9696                                IRTemp* t1, IRTemp* t0 )
   9697 {
   9698    vassert(t0 && *t0 == IRTemp_INVALID);
   9699    vassert(t1 && *t1 == IRTemp_INVALID);
   9700    *t0 = newTemp(Ity_I64);
   9701    *t1 = newTemp(Ity_I64);
   9702    assign( *t0, unop(Iop_V128to64,   mkexpr(t128)) );
   9703    assign( *t1, unop(Iop_V128HIto64, mkexpr(t128)) );
   9704 }
   9705 
   9706 /* Construct a V256-bit value from eight 32-bit ints. */
   9707 
   9708 static IRExpr* mkV256from32s ( IRTemp t7, IRTemp t6,
   9709                                IRTemp t5, IRTemp t4,
   9710                                IRTemp t3, IRTemp t2,
   9711                                IRTemp t1, IRTemp t0 )
   9712 {
   9713    return
   9714       binop( Iop_V128HLtoV256,
   9715              binop( Iop_64HLtoV128,
   9716                     binop(Iop_32HLto64, mkexpr(t7), mkexpr(t6)),
   9717                     binop(Iop_32HLto64, mkexpr(t5), mkexpr(t4)) ),
   9718              binop( Iop_64HLtoV128,
   9719                     binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
   9720                     binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0)) )
   9721    );
   9722 }
   9723 
   9724 /* Construct a V256-bit value from four 64-bit ints. */
   9725 
   9726 static IRExpr* mkV256from64s ( IRTemp t3, IRTemp t2,
   9727                                IRTemp t1, IRTemp t0 )
   9728 {
   9729    return
   9730       binop( Iop_V128HLtoV256,
   9731              binop(Iop_64HLtoV128, mkexpr(t3), mkexpr(t2)),
   9732              binop(Iop_64HLtoV128, mkexpr(t1), mkexpr(t0))
   9733    );
   9734 }
   9735 
   9736 /* Helper for the SSSE3 (not SSE3) PMULHRSW insns.  Given two 64-bit
   9737    values (aa,bb), computes, for each of the 4 16-bit lanes:
   9738 
   9739    (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1
   9740 */
static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx )
{
   /* The strategy: sign-extend each 16-bit lane of both inputs to 32
      bits (two lanes per 64-bit temp), do the multiply/round at 32
      bits, and then gather the low 16 bits of each rounded product
      back into a single 64-bit result. */
   IRTemp aa      = newTemp(Ity_I64);
   IRTemp bb      = newTemp(Ity_I64);
   IRTemp aahi32s = newTemp(Ity_I64);  /* lanes 3,2 of aa, sext to 32 */
   IRTemp aalo32s = newTemp(Ity_I64);  /* lanes 1,0 of aa, sext to 32 */
   IRTemp bbhi32s = newTemp(Ity_I64);
   IRTemp bblo32s = newTemp(Ity_I64);
   IRTemp rHi     = newTemp(Ity_I64);
   IRTemp rLo     = newTemp(Ity_I64);
   IRTemp one32x2 = newTemp(Ity_I64);  /* constant 1 in each 32-bit lane */
   assign(aa, aax);
   assign(bb, bbx);
   /* InterleaveHI16x4(x,x) duplicates the two upper 16-bit lanes into
      the upper halves of 32-bit lanes; the arithmetic shift right by
      16 then sign-extends them.  Similarly for the lower lanes. */
   assign( aahi32s,
           binop(Iop_SarN32x2,
                 binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)),
                 mkU8(16) ));
   assign( aalo32s,
           binop(Iop_SarN32x2,
                 binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)),
                 mkU8(16) ));
   assign( bbhi32s,
           binop(Iop_SarN32x2,
                 binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)),
                 mkU8(16) ));
   assign( bblo32s,
           binop(Iop_SarN32x2,
                 binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)),
                 mkU8(16) ));
   assign(one32x2, mkU64( (1ULL << 32) + 1 ));
   /* For each 32-bit lane: ((a*b >>u 14) + 1) >>u 1 -- the PMULHRSW
      "round and scale" step. */
   assign(
      rHi,
      binop(
         Iop_ShrN32x2,
         binop(
            Iop_Add32x2,
            binop(
               Iop_ShrN32x2,
               binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)),
               mkU8(14)
            ),
            mkexpr(one32x2)
         ),
         mkU8(1)
      )
   );
   assign(
      rLo,
      binop(
         Iop_ShrN32x2,
         binop(
            Iop_Add32x2,
            binop(
               Iop_ShrN32x2,
               binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)),
               mkU8(14)
            ),
            mkexpr(one32x2)
         ),
         mkU8(1)
      )
   );
   /* Take the low (even) 16-bit lane of each 32-bit result and pack
      the four of them back into one 64-bit value. */
   return
      binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo));
}
   9806 
   9807 /* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns.  Given two 64-bit
   9808    values (aa,bb), computes, for each lane:
   9809 
   9810           if aa_lane < 0 then - bb_lane
   9811      else if aa_lane > 0 then bb_lane
   9812      else 0
   9813 */
   9814 static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB )
   9815 {
   9816    IRTemp aa       = newTemp(Ity_I64);
   9817    IRTemp bb       = newTemp(Ity_I64);
   9818    IRTemp zero     = newTemp(Ity_I64);
   9819    IRTemp bbNeg    = newTemp(Ity_I64);
   9820    IRTemp negMask  = newTemp(Ity_I64);
   9821    IRTemp posMask  = newTemp(Ity_I64);
   9822    IROp   opSub    = Iop_INVALID;
   9823    IROp   opCmpGTS = Iop_INVALID;
   9824 
   9825    switch (laneszB) {
   9826       case 1: opSub = Iop_Sub8x8;  opCmpGTS = Iop_CmpGT8Sx8;  break;
   9827       case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break;
   9828       case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break;
   9829       default: vassert(0);
   9830    }
   9831 
   9832    assign( aa,      aax );
   9833    assign( bb,      bbx );
   9834    assign( zero,    mkU64(0) );
   9835    assign( bbNeg,   binop(opSub,    mkexpr(zero), mkexpr(bb)) );
   9836    assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) );
   9837    assign( posMask, binop(opCmpGTS, mkexpr(aa),   mkexpr(zero)) );
   9838 
   9839    return
   9840       binop(Iop_Or64,
   9841             binop(Iop_And64, mkexpr(bb),    mkexpr(posMask)),
   9842             binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) );
   9843 
   9844 }
   9845 
   9846 
   9847 /* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns.  Given a 64-bit
   9848    value aa, computes, for each lane
   9849 
   9850    if aa < 0 then -aa else aa
   9851 
   9852    Note that the result is interpreted as unsigned, so that the
   9853    absolute value of the most negative signed input can be
   9854    represented.
   9855 */
   9856 static IRTemp math_PABS_MMX ( IRTemp aa, Int laneszB )
   9857 {
   9858    IRTemp res     = newTemp(Ity_I64);
   9859    IRTemp zero    = newTemp(Ity_I64);
   9860    IRTemp aaNeg   = newTemp(Ity_I64);
   9861    IRTemp negMask = newTemp(Ity_I64);
   9862    IRTemp posMask = newTemp(Ity_I64);
   9863    IROp   opSub   = Iop_INVALID;
   9864    IROp   opSarN  = Iop_INVALID;
   9865 
   9866    switch (laneszB) {
   9867       case 1: opSub = Iop_Sub8x8;  opSarN = Iop_SarN8x8;  break;
   9868       case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break;
   9869       case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break;
   9870       default: vassert(0);
   9871    }
   9872 
   9873    assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) );
   9874    assign( posMask, unop(Iop_Not64, mkexpr(negMask)) );
   9875    assign( zero,    mkU64(0) );
   9876    assign( aaNeg,   binop(opSub, mkexpr(zero), mkexpr(aa)) );
   9877    assign( res,
   9878            binop(Iop_Or64,
   9879                  binop(Iop_And64, mkexpr(aa),    mkexpr(posMask)),
   9880                  binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) ));
   9881    return res;
   9882 }
   9883 
   9884 /* XMM version of math_PABS_MMX. */
   9885 static IRTemp math_PABS_XMM ( IRTemp aa, Int laneszB )
   9886 {
   9887    IRTemp res  = newTemp(Ity_V128);
   9888    IRTemp aaHi = newTemp(Ity_I64);
   9889    IRTemp aaLo = newTemp(Ity_I64);
   9890    assign(aaHi, unop(Iop_V128HIto64, mkexpr(aa)));
   9891    assign(aaLo, unop(Iop_V128to64, mkexpr(aa)));
   9892    assign(res, binop(Iop_64HLtoV128,
   9893                      mkexpr(math_PABS_MMX(aaHi, laneszB)),
   9894                      mkexpr(math_PABS_MMX(aaLo, laneszB))));
   9895    return res;
   9896 }
   9897 
   9898 /* Specialisations of math_PABS_XMM, since there's no easy way to do
   9899    partial applications in C :-( */
   9900 static IRTemp math_PABS_XMM_pap4 ( IRTemp aa ) {
   9901    return math_PABS_XMM(aa, 4);
   9902 }
   9903 
   9904 static IRTemp math_PABS_XMM_pap2 ( IRTemp aa ) {
   9905    return math_PABS_XMM(aa, 2);
   9906 }
   9907 
   9908 static IRTemp math_PABS_XMM_pap1 ( IRTemp aa ) {
   9909    return math_PABS_XMM(aa, 1);
   9910 }
   9911 
   9912 /* YMM version of math_PABS_XMM. */
   9913 static IRTemp math_PABS_YMM ( IRTemp aa, Int laneszB )
   9914 {
   9915    IRTemp res  = newTemp(Ity_V256);
   9916    IRTemp aaHi = IRTemp_INVALID;
   9917    IRTemp aaLo = IRTemp_INVALID;
   9918    breakupV256toV128s(aa, &aaHi, &aaLo);
   9919    assign(res, binop(Iop_V128HLtoV256,
   9920                      mkexpr(math_PABS_XMM(aaHi, laneszB)),
   9921                      mkexpr(math_PABS_XMM(aaLo, laneszB))));
   9922    return res;
   9923 }
   9924 
   9925 static IRTemp math_PABS_YMM_pap4 ( IRTemp aa ) {
   9926    return math_PABS_YMM(aa, 4);
   9927 }
   9928 
   9929 static IRTemp math_PABS_YMM_pap2 ( IRTemp aa ) {
   9930    return math_PABS_YMM(aa, 2);
   9931 }
   9932 
   9933 static IRTemp math_PABS_YMM_pap1 ( IRTemp aa ) {
   9934    return math_PABS_YMM(aa, 1);
   9935 }
   9936 
   9937 static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64,
   9938                                         IRTemp lo64, Long byteShift )
   9939 {
   9940    vassert(byteShift >= 1 && byteShift <= 7);
   9941    return
   9942       binop(Iop_Or64,
   9943             binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))),
   9944             binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift))
   9945       );
   9946 }
   9947 
static IRTemp math_PALIGNR_XMM ( IRTemp sV, IRTemp dV, UInt imm8 )
{
   /* PALIGNR: conceptually concatenate dV:sV into a 256-bit value,
      shift it right by imm8 bytes, and return the low 128 bits.
      Computed here on four 64-bit pieces (dHi:dLo:sHi:sLo, most to
      least significant), one imm8 range per 8-byte boundary. */
   IRTemp res = newTemp(Ity_V128);
   IRTemp sHi = newTemp(Ity_I64);
   IRTemp sLo = newTemp(Ity_I64);
   IRTemp dHi = newTemp(Ity_I64);
   IRTemp dLo = newTemp(Ity_I64);
   IRTemp rHi = newTemp(Ity_I64);
   IRTemp rLo = newTemp(Ity_I64);

   assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

   if (imm8 == 0) {
      /* No shift: result is sV unchanged. */
      assign( rHi, mkexpr(sHi) );
      assign( rLo, mkexpr(sLo) );
   }
   else if (imm8 >= 1 && imm8 <= 7) {
      /* Windows straddle sLo/sHi and sHi/dLo. */
      assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, imm8) );
      assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, imm8) );
   }
   else if (imm8 == 8) {
      /* Exactly one 64-bit piece of shift. */
      assign( rHi, mkexpr(dLo) );
      assign( rLo, mkexpr(sHi) );
   }
   else if (imm8 >= 9 && imm8 <= 15) {
      /* Windows straddle sHi/dLo and dLo/dHi. */
      assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, imm8-8) );
      assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, imm8-8) );
   }
   else if (imm8 == 16) {
      /* sV entirely shifted out: result is dV. */
      assign( rHi, mkexpr(dHi) );
      assign( rLo, mkexpr(dLo) );
   }
   else if (imm8 >= 17 && imm8 <= 23) {
      /* Zeroes start entering at the top. */
      assign( rHi, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(imm8-16))) );
      assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, imm8-16) );
   }
   else if (imm8 == 24) {
      assign( rHi, mkU64(0) );
      assign( rLo, mkexpr(dHi) );
   }
   else if (imm8 >= 25 && imm8 <= 31) {
      assign( rHi, mkU64(0) );
      assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(imm8-24))) );
   }
   else if (imm8 >= 32 && imm8 <= 255) {
      /* Everything shifted out: result is zero. */
      assign( rHi, mkU64(0) );
      assign( rLo, mkU64(0) );
   }
   else
      vassert(0);

   assign( res, binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo)));
   return res;
}
   10005 
   10006 
   10007 /* Generate a SIGSEGV followed by a restart of the current instruction
   10008    if effective_addr is not 16-aligned.  This is required behaviour
   10009    for some SSE3 instructions and all 128-bit SSSE3 instructions.
   10010    This assumes that guest_RIP_curr_instr is set correctly! */
   10011 static
   10012 void gen_SEGV_if_not_XX_aligned ( IRTemp effective_addr, ULong mask )
   10013 {
   10014    stmt(
   10015       IRStmt_Exit(
   10016          binop(Iop_CmpNE64,
   10017                binop(Iop_And64,mkexpr(effective_addr),mkU64(mask)),
   10018                mkU64(0)),
   10019          Ijk_SigSEGV,
   10020          IRConst_U64(guest_RIP_curr_instr),
   10021          OFFB_RIP
   10022       )
   10023    );
   10024 }
   10025 
   10026 static void gen_SEGV_if_not_16_aligned ( IRTemp effective_addr ) {
   10027    gen_SEGV_if_not_XX_aligned(effective_addr, 16-1);
   10028 }
   10029 
   10030 static void gen_SEGV_if_not_32_aligned ( IRTemp effective_addr ) {
   10031    gen_SEGV_if_not_XX_aligned(effective_addr, 32-1);
   10032 }
   10033 
   10034 static void gen_SEGV_if_not_64_aligned ( IRTemp effective_addr ) {
   10035    gen_SEGV_if_not_XX_aligned(effective_addr, 64-1);
   10036 }
   10037 
   10038 /* Helper for deciding whether a given insn (starting at the opcode
   10039    byte) may validly be used with a LOCK prefix.  The following insns
   10040    may be used with LOCK when their destination operand is in memory.
   10041    AFAICS this is exactly the same for both 32-bit and 64-bit mode.
   10042 
   10043    ADD        80 /0,  81 /0,  82 /0,  83 /0,  00,  01
   10044    OR         80 /1,  81 /1,  82 /x,  83 /1,  08,  09
   10045    ADC        80 /2,  81 /2,  82 /2,  83 /2,  10,  11
   10046    SBB        80 /3,  81 /3,  82 /x,  83 /3,  18,  19
   10047    AND        80 /4,  81 /4,  82 /x,  83 /4,  20,  21
   10048    SUB        80 /5,  81 /5,  82 /x,  83 /5,  28,  29
   10049    XOR        80 /6,  81 /6,  82 /x,  83 /6,  30,  31
   10050 
   10051    DEC        FE /1,  FF /1
   10052    INC        FE /0,  FF /0
   10053 
   10054    NEG        F6 /3,  F7 /3
   10055    NOT        F6 /2,  F7 /2
   10056 
   10057    XCHG       86, 87
   10058 
   10059    BTC        0F BB,  0F BA /7
   10060    BTR        0F B3,  0F BA /6
   10061    BTS        0F AB,  0F BA /5
   10062 
   10063    CMPXCHG    0F B0,  0F B1
   10064    CMPXCHG8B  0F C7 /1
   10065 
   10066    XADD       0F C0,  0F C1
   10067 
   10068    ------------------------------
   10069 
   10070    80 /0  =  addb $imm8,  rm8
   10071    81 /0  =  addl $imm32, rm32  and  addw $imm16, rm16
   10072    82 /0  =  addb $imm8,  rm8
   10073    83 /0  =  addl $simm8, rm32  and  addw $simm8, rm16
   10074 
   10075    00     =  addb r8,  rm8
   10076    01     =  addl r32, rm32  and  addw r16, rm16
   10077 
   10078    Same for OR ADC SBB AND SUB XOR
   10079 
   10080    FE /1  = dec rm8
   10081    FF /1  = dec rm32  and  dec rm16
   10082 
   10083    FE /0  = inc rm8
   10084    FF /0  = inc rm32  and  inc rm16
   10085 
   10086    F6 /3  = neg rm8
   10087    F7 /3  = neg rm32  and  neg rm16
   10088 
   10089    F6 /2  = not rm8
   10090    F7 /2  = not rm32  and  not rm16
   10091 
   10092    0F BB     = btcw r16, rm16    and  btcl r32, rm32
   10093    0F BA /7  = btcw $imm8, rm16  and  btcl $imm8, rm32
   10094 
   10095    Same for BTS, BTR
   10096 */
   10097 static Bool can_be_used_with_LOCK_prefix ( const UChar* opc )
   10098 {
   10099    switch (opc[0]) {
   10100       case 0x00: case 0x01: case 0x08: case 0x09:
   10101       case 0x10: case 0x11: case 0x18: case 0x19:
   10102       case 0x20: case 0x21: case 0x28: case 0x29:
   10103       case 0x30: case 0x31:
   10104          if (!epartIsReg(opc[1]))
   10105             return True;
   10106          break;
   10107 
   10108       case 0x80: case 0x81: case 0x82: case 0x83:
   10109          if (gregLO3ofRM(opc[1]) >= 0 && gregLO3ofRM(opc[1]) <= 6
   10110              && !epartIsReg(opc[1]))
   10111             return True;
   10112          break;
   10113 
   10114       case 0xFE: case 0xFF:
   10115          if (gregLO3ofRM(opc[1]) >= 0 && gregLO3ofRM(opc[1]) <= 1
   10116              && !epartIsReg(opc[1]))
   10117             return True;
   10118          break;
   10119 
   10120       case 0xF6: case 0xF7:
   10121          if (gregLO3ofRM(opc[1]) >= 2 && gregLO3ofRM(opc[1]) <= 3
   10122              && !epartIsReg(opc[1]))
   10123             return True;
   10124          break;
   10125 
   10126       case 0x86: case 0x87:
   10127          if (!epartIsReg(opc[1]))
   10128             return True;
   10129          break;
   10130 
   10131       case 0x0F: {
   10132          switch (opc[1]) {
   10133             case 0xBB: case 0xB3: case 0xAB:
   10134                if (!epartIsReg(opc[2]))
   10135                   return True;
   10136                break;
   10137             case 0xBA:
   10138                if (gregLO3ofRM(opc[2]) >= 5 && gregLO3ofRM(opc[2]) <= 7
   10139                    && !epartIsReg(opc[2]))
   10140                   return True;
   10141                break;
   10142             case 0xB0: case 0xB1:
   10143                if (!epartIsReg(opc[2]))
   10144                   return True;
   10145                break;
   10146             case 0xC7:
   10147                if (gregLO3ofRM(opc[2]) == 1 && !epartIsReg(opc[2]) )
   10148                   return True;
   10149                break;
   10150             case 0xC0: case 0xC1:
   10151                if (!epartIsReg(opc[2]))
   10152                   return True;
   10153                break;
   10154             default:
   10155                break;
   10156          } /* switch (opc[1]) */
   10157          break;
   10158       }
   10159 
   10160       default:
   10161          break;
   10162    } /* switch (opc[0]) */
   10163 
   10164    return False;
   10165 }
   10166 
   10167 
   10168 /*------------------------------------------------------------*/
   10169 /*---                                                      ---*/
   10170 /*--- Top-level SSE/SSE2: dis_ESC_0F__SSE2                 ---*/
   10171 /*---                                                      ---*/
   10172 /*------------------------------------------------------------*/
   10173 
/* Disassemble [U]COMISD xmm/m64, xmm (and the AVX-encoded form when
   isAvx).  Compares the lowest F64 lane of the G register against the
   lowest F64 lane of the E operand (register or 64-bit memory load)
   and writes the comparison result into the guest condition codes via
   the COPY thunk.  Returns the updated instruction offset |delta|. */
static Long dis_COMISD ( const VexAbiInfo* vbi, Prefix pfx,
                         Long delta, Bool isAvx, UChar opc )
{
   vassert(opc == 0x2F/*COMISD*/ || opc == 0x2E/*UCOMISD*/);
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp argL  = newTemp(Ity_F64);
   IRTemp argR  = newTemp(Ity_F64);
   UChar  modrm = getUChar(delta);
   IRTemp addr  = IRTemp_INVALID;
   if (epartIsReg(modrm)) {
      /* Register source: take the low 64-bit lane of the E reg. */
      assign( argR, getXMMRegLane64F( eregOfRexRM(pfx,modrm),
                                      0/*lowest lane*/ ) );
      delta += 1;
      DIP("%s%scomisd %s,%s\n", isAvx ? "v" : "",
                                opc==0x2E ? "u" : "",
                                nameXMMReg(eregOfRexRM(pfx,modrm)),
                                nameXMMReg(gregOfRexRM(pfx,modrm)) );
   } else {
      /* Memory source: little-endian 64-bit load. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argR, loadLE(Ity_F64, mkexpr(addr)) );
      delta += alen;
      DIP("%s%scomisd %s,%s\n", isAvx ? "v" : "",
                                opc==0x2E ? "u" : "",
                                dis_buf,
                                nameXMMReg(gregOfRexRM(pfx,modrm)) );
   }
   assign( argL, getXMMRegLane64F( gregOfRexRM(pfx,modrm),
                                   0/*lowest lane*/ ) );

   /* With CC_OP_COPY, the flags are taken directly from DEP1.  The
      0x45 mask keeps only the bits of the CmpF64 result that feed the
      relevant flags; DEP2 is unused and set to zero. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop( Iop_And64,
                   unop( Iop_32Uto64,
                         binop(Iop_CmpF64, mkexpr(argL), mkexpr(argR)) ),
                   mkU64(0x45)
       )));
   return delta;
}
   10215 
   10216 
/* Disassemble [U]COMISS xmm/m32, xmm (and the AVX-encoded form when
   isAvx).  Like dis_COMISD but on the lowest F32 lanes: both operands
   are widened to F64 before the compare, and the result is written to
   the guest condition codes via the COPY thunk.  Returns the updated
   instruction offset |delta|. */
static Long dis_COMISS ( const VexAbiInfo* vbi, Prefix pfx,
                         Long delta, Bool isAvx, UChar opc )
{
   vassert(opc == 0x2F/*COMISS*/ || opc == 0x2E/*UCOMISS*/);
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp argL  = newTemp(Ity_F32);
   IRTemp argR  = newTemp(Ity_F32);
   UChar  modrm = getUChar(delta);
   IRTemp addr  = IRTemp_INVALID;
   if (epartIsReg(modrm)) {
      /* Register source: take the low 32-bit lane of the E reg. */
      assign( argR, getXMMRegLane32F( eregOfRexRM(pfx,modrm),
                                      0/*lowest lane*/ ) );
      delta += 1;
      DIP("%s%scomiss %s,%s\n", isAvx ? "v" : "",
                                opc==0x2E ? "u" : "",
                                nameXMMReg(eregOfRexRM(pfx,modrm)),
                                nameXMMReg(gregOfRexRM(pfx,modrm)) );
   } else {
      /* Memory source: little-endian 32-bit load. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argR, loadLE(Ity_F32, mkexpr(addr)) );
      delta += alen;
      DIP("%s%scomiss %s,%s\n", isAvx ? "v" : "",
                                opc==0x2E ? "u" : "",
                                dis_buf,
                                nameXMMReg(gregOfRexRM(pfx,modrm)) );
   }
   assign( argL, getXMMRegLane32F( gregOfRexRM(pfx,modrm),
                                   0/*lowest lane*/ ) );

   /* Widen both F32 args to F64 and compare; with CC_OP_COPY the
      masked CmpF64 result in DEP1 becomes the guest flags directly. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop( Iop_And64,
                   unop( Iop_32Uto64,
                         binop(Iop_CmpF64,
                               unop(Iop_F32toF64,mkexpr(argL)),
                               unop(Iop_F32toF64,mkexpr(argR)))),
                   mkU64(0x45)
       )));
   return delta;
}
   10260 
   10261 
/* Disassemble 128-bit PSHUFD (or VPSHUFD when writesYmm): shuffle the
   four 32-bit lanes of the E operand (xmm reg or m128) into the G
   register as selected by the trailing imm8; each 2-bit field of the
   imm8 picks the source lane for one destination lane.  When
   writesYmm, the upper 128 bits of the destination are zeroed.
   Returns the updated instruction offset |delta|. */
static Long dis_PSHUFD_32x4 ( const VexAbiInfo* vbi, Prefix pfx,
                              Long delta, Bool writesYmm )
{
   Int    order;
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp sV    = newTemp(Ity_V128);
   UChar  modrm = getUChar(delta);
   const HChar* strV  = writesYmm ? "v" : "";
   IRTemp addr  = IRTemp_INVALID;
   if (epartIsReg(modrm)) {
      assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
      /* imm8 follows the modrm byte. */
      order = (Int)getUChar(delta+1);
      delta += 1+1;
      DIP("%spshufd $%d,%s,%s\n", strV, order,
                                  nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   } else {
      /* imm8 follows the amode, hence the extra byte in disAMode. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
                        1/*byte after the amode*/ );
      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
      order = (Int)getUChar(delta+alen);
      delta += alen+1;
      DIP("%spshufd $%d,%s,%s\n", strV, order,
                                 dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
   }

   IRTemp s3, s2, s1, s0;
   s3 = s2 = s1 = s0 = IRTemp_INVALID;
   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );

   /* SEL maps a 2-bit imm8 field to the corresponding source lane. */
#  define SEL(n)  ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   IRTemp dV = newTemp(Ity_V128);
   assign(dV,
          mkV128from32s( SEL((order>>6)&3), SEL((order>>4)&3),
                         SEL((order>>2)&3), SEL((order>>0)&3) )
   );
#  undef SEL

   /* putYMMRegLoAndZU zeroes the upper YMM half, as VEX.128 requires. */
   (writesYmm ? putYMMRegLoAndZU : putXMMReg)
      (gregOfRexRM(pfx,modrm), mkexpr(dV));
   return delta;
}
   10306 
   10307 
/* Disassemble 256-bit VPSHUFD: shuffle the 32-bit lanes of the E
   operand (ymm reg or m256) into the G register.  The same imm8 lane
   selection is applied independently to the lower and upper 128-bit
   halves, as the architecture specifies.  Returns the updated
   instruction offset |delta|. */
static Long dis_PSHUFD_32x8 ( const VexAbiInfo* vbi, Prefix pfx, Long delta )
{
   Int    order;
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp sV    = newTemp(Ity_V256);
   UChar  modrm = getUChar(delta);
   IRTemp addr  = IRTemp_INVALID;
   UInt   rG    = gregOfRexRM(pfx,modrm);
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( sV, getYMMReg(rE) );
      /* imm8 follows the modrm byte. */
      order = (Int)getUChar(delta+1);
      delta += 1+1;
      DIP("vpshufd $%d,%s,%s\n", order, nameYMMReg(rE), nameYMMReg(rG));
   } else {
      /* imm8 follows the amode, hence the extra byte in disAMode. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
                        1/*byte after the amode*/ );
      assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
      order = (Int)getUChar(delta+alen);
      delta += alen+1;
      DIP("vpshufd $%d,%s,%s\n", order,  dis_buf, nameYMMReg(rG));
   }

   IRTemp s[8];
   s[7] = s[6] = s[5] = s[4] = s[3] = s[2] = s[1] = s[0] = IRTemp_INVALID;
   breakupV256to32s( sV, &s[7], &s[6], &s[5], &s[4],
                         &s[3], &s[2], &s[1], &s[0] );

   /* Lanes 4..7 select only from the upper half (s[4+...]) and lanes
      0..3 only from the lower half (s[0+...]). */
   putYMMReg( rG, mkV256from32s( s[4 + ((order>>6)&3)],
                                 s[4 + ((order>>4)&3)],
                                 s[4 + ((order>>2)&3)],
                                 s[4 + ((order>>0)&3)],
                                 s[0 + ((order>>6)&3)],
                                 s[0 + ((order>>4)&3)],
                                 s[0 + ((order>>2)&3)],
                                 s[0 + ((order>>0)&3)] ) );
   return delta;
}
   10347 
   10348 
   10349 static IRTemp math_PSRLDQ ( IRTemp sV, Int imm )
   10350 {
   10351    IRTemp dV    = newTemp(Ity_V128);
   10352    IRTemp hi64  = newTemp(Ity_I64);
   10353    IRTemp lo64  = newTemp(Ity_I64);
   10354    IRTemp hi64r = newTemp(Ity_I64);
   10355    IRTemp lo64r = newTemp(Ity_I64);
   10356 
   10357    vassert(imm >= 0 && imm <= 255);
   10358    if (imm >= 16) {
   10359       assign(dV, mkV128(0x0000));
   10360       return dV;
   10361    }
   10362 
   10363    assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
   10364    assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
   10365 
   10366    if (imm == 0) {
   10367       assign( lo64r, mkexpr(lo64) );
   10368       assign( hi64r, mkexpr(hi64) );
   10369    }
   10370    else
   10371    if (imm == 8) {
   10372       assign( hi64r, mkU64(0) );
   10373       assign( lo64r, mkexpr(hi64) );
   10374    }
   10375    else
   10376    if (imm > 8) {
   10377       assign( hi64r, mkU64(0) );
   10378       assign( lo64r, binop( Iop_Shr64, mkexpr(hi64), mkU8( 8*(imm-8) ) ));
   10379    } else {
   10380       assign( hi64r, binop( Iop_Shr64, mkexpr(hi64), mkU8(8 * imm) ));
   10381       assign( lo64r,
   10382               binop( Iop_Or64,
   10383                      binop(Iop_Shr64, mkexpr(lo64),
   10384                            mkU8(8 * imm)),
   10385                      binop(Iop_Shl64, mkexpr(hi64),
   10386                            mkU8(8 * (8 - imm)) )
   10387                      )
   10388               );
   10389    }
   10390 
   10391    assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
   10392    return dV;
   10393 }
   10394 
   10395 
   10396 static IRTemp math_PSLLDQ ( IRTemp sV, Int imm )
   10397 {
   10398    IRTemp       dV    = newTemp(Ity_V128);
   10399    IRTemp       hi64  = newTemp(Ity_I64);
   10400    IRTemp       lo64  = newTemp(Ity_I64);
   10401    IRTemp       hi64r = newTemp(Ity_I64);
   10402    IRTemp       lo64r = newTemp(Ity_I64);
   10403 
   10404    vassert(imm >= 0 && imm <= 255);
   10405    if (imm >= 16) {
   10406       assign(dV, mkV128(0x0000));
   10407       return dV;
   10408    }
   10409 
   10410    assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
   10411    assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
   10412 
   10413    if (imm == 0) {
   10414       assign( lo64r, mkexpr(lo64) );
   10415       assign( hi64r, mkexpr(hi64) );
   10416    }
   10417    else
   10418    if (imm == 8) {
   10419       assign( lo64r, mkU64(0) );
   10420       assign( hi64r, mkexpr(lo64) );
   10421    }
   10422    else
   10423    if (imm > 8) {
   10424       assign( lo64r, mkU64(0) );
   10425       assign( hi64r, binop( Iop_Shl64, mkexpr(lo64), mkU8( 8*(imm-8) ) ));
   10426    } else {
   10427       assign( lo64r, binop( Iop_Shl64, mkexpr(lo64), mkU8(8 * imm) ));
   10428       assign( hi64r,
   10429               binop( Iop_Or64,
   10430                      binop(Iop_Shl64, mkexpr(hi64),
   10431                            mkU8(8 * imm)),
   10432                      binop(Iop_Shr64, mkexpr(lo64),
   10433                            mkU8(8 * (8 - imm)) )
   10434                      )
   10435               );
   10436    }
   10437 
   10438    assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
   10439    return dV;
   10440 }
   10441 
   10442 
/* Disassemble CVTSD2SI / CVTTSD2SI (and AVX forms when isAvx):
   convert the low F64 lane of the E operand (xmm reg or m64) to a
   32- or 64-bit signed integer in the G register, |sz| selecting the
   width.  The 'T' (truncating) variant, opc 0x2C, rounds toward zero;
   otherwise the current SSE rounding mode is used.  Returns the
   updated instruction offset |delta|. */
static Long dis_CVTxSD2SI ( const VexAbiInfo* vbi, Prefix pfx,
                            Long delta, Bool isAvx, UChar opc, Int sz )
{
   vassert(opc == 0x2D/*CVTSD2SI*/ || opc == 0x2C/*CVTTSD2SI*/);
   HChar  dis_buf[50];
   Int    alen   = 0;
   UChar  modrm  = getUChar(delta);
   IRTemp addr   = IRTemp_INVALID;
   IRTemp rmode  = newTemp(Ity_I32);
   IRTemp f64lo  = newTemp(Ity_F64);
   Bool   r2zero = toBool(opc == 0x2C);

   if (epartIsReg(modrm)) {
      delta += 1;
      assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
      DIP("%scvt%ssd2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
                                  nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameIReg(sz, gregOfRexRM(pfx,modrm),
                                           False));
   } else {
      /* Memory source: little-endian 64-bit load. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
      delta += alen;
      DIP("%scvt%ssd2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
                                  dis_buf,
                                  nameIReg(sz, gregOfRexRM(pfx,modrm),
                                           False));
   }

   /* Truncating variant forces round-to-zero; otherwise honour the
      guest's SSE rounding mode (MXCSR). */
   if (r2zero) {
      assign( rmode, mkU32((UInt)Irrm_ZERO) );
   } else {
      assign( rmode, get_sse_roundingmode() );
   }

   if (sz == 4) {
      putIReg32( gregOfRexRM(pfx,modrm),
                 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo)) );
   } else {
      vassert(sz == 8);
      putIReg64( gregOfRexRM(pfx,modrm),
                 binop( Iop_F64toI64S, mkexpr(rmode), mkexpr(f64lo)) );
   }

   return delta;
}
   10489 
   10490 
/* Disassemble CVTSS2SI / CVTTSS2SI (and AVX forms when isAvx):
   convert the low F32 lane of the E operand (xmm reg or m32) to a
   32- or 64-bit signed integer in the G register, |sz| selecting the
   width.  The F32 is first widened losslessly to F64.  The 'T'
   (truncating) variant, opc 0x2C, rounds toward zero; otherwise the
   current SSE rounding mode is used.  Returns the updated |delta|. */
static Long dis_CVTxSS2SI ( const VexAbiInfo* vbi, Prefix pfx,
                            Long delta, Bool isAvx, UChar opc, Int sz )
{
   vassert(opc == 0x2D/*CVTSS2SI*/ || opc == 0x2C/*CVTTSS2SI*/);
   HChar  dis_buf[50];
   Int    alen   = 0;
   UChar  modrm  = getUChar(delta);
   IRTemp addr   = IRTemp_INVALID;
   IRTemp rmode  = newTemp(Ity_I32);
   IRTemp f32lo  = newTemp(Ity_F32);
   Bool   r2zero = toBool(opc == 0x2C);

   if (epartIsReg(modrm)) {
      delta += 1;
      assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
      DIP("%scvt%sss2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
                                  nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameIReg(sz, gregOfRexRM(pfx,modrm),
                                           False));
   } else {
      /* Memory source: little-endian 32-bit load. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
      delta += alen;
      DIP("%scvt%sss2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
                                  dis_buf,
                                  nameIReg(sz, gregOfRexRM(pfx,modrm),
                                           False));
   }

   /* Truncating variant forces round-to-zero; otherwise honour the
      guest's SSE rounding mode (MXCSR). */
   if (r2zero) {
      assign( rmode, mkU32((UInt)Irrm_ZERO) );
   } else {
      assign( rmode, get_sse_roundingmode() );
   }

   /* F32->F64 widening is exact, so rounding happens only at the
      final float-to-int step. */
   if (sz == 4) {
      putIReg32( gregOfRexRM(pfx,modrm),
                 binop( Iop_F64toI32S,
                        mkexpr(rmode),
                        unop(Iop_F32toF64, mkexpr(f32lo))) );
   } else {
      vassert(sz == 8);
      putIReg64( gregOfRexRM(pfx,modrm),
                 binop( Iop_F64toI64S,
                        mkexpr(rmode),
                        unop(Iop_F32toF64, mkexpr(f32lo))) );
   }

   return delta;
}
   10541 
   10542 
/* Disassemble 128-bit CVTPS2PD (and VCVTPS2PD.128 when isAvx):
   widen the two low F32 lanes of the E operand (xmm reg or m64) to
   two F64 lanes in the G register.  When isAvx, the upper 128 bits of
   the destination YMM register are zeroed.  Returns the updated
   instruction offset |delta|. */
static Long dis_CVTPS2PD_128 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp f32lo = newTemp(Ity_F32);
   IRTemp f32hi = newTemp(Ity_F32);
   UChar  modrm = getUChar(delta);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( f32lo, getXMMRegLane32F(rE, 0) );
      assign( f32hi, getXMMRegLane32F(rE, 1) );
      delta += 1;
      DIP("%scvtps2pd %s,%s\n",
          isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
   } else {
      /* Memory source: two consecutive F32s (8 bytes total). */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( f32lo, loadLE(Ity_F32, mkexpr(addr)) );
      assign( f32hi, loadLE(Ity_F32,
                            binop(Iop_Add64,mkexpr(addr),mkU64(4))) );
      delta += alen;
      DIP("%scvtps2pd %s,%s\n",
          isAvx ? "v" : "", dis_buf, nameXMMReg(rG));
   }

   /* F32->F64 widening is exact; no rounding mode needed. */
   putXMMRegLane64F( rG, 1, unop(Iop_F32toF64, mkexpr(f32hi)) );
   putXMMRegLane64F( rG, 0, unop(Iop_F32toF64, mkexpr(f32lo)) );
   if (isAvx)
      putYMMRegLane128( rG, 1, mkV128(0));
   return delta;
}
   10576 
   10577 
/* Disassemble 256-bit VCVTPS2PD: widen the four F32 lanes of the E
   operand (xmm reg or m128) to four F64 lanes filling the G ymm
   register.  Returns the updated instruction offset |delta|. */
static Long dis_CVTPS2PD_256 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp f32_0 = newTemp(Ity_F32);
   IRTemp f32_1 = newTemp(Ity_F32);
   IRTemp f32_2 = newTemp(Ity_F32);
   IRTemp f32_3 = newTemp(Ity_F32);
   UChar  modrm = getUChar(delta);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( f32_0, getXMMRegLane32F(rE, 0) );
      assign( f32_1, getXMMRegLane32F(rE, 1) );
      assign( f32_2, getXMMRegLane32F(rE, 2) );
      assign( f32_3, getXMMRegLane32F(rE, 3) );
      delta += 1;
      DIP("vcvtps2pd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
   } else {
      /* Memory source: four consecutive F32s (16 bytes total). */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( f32_0, loadLE(Ity_F32, mkexpr(addr)) );
      assign( f32_1, loadLE(Ity_F32,
                            binop(Iop_Add64,mkexpr(addr),mkU64(4))) );
      assign( f32_2, loadLE(Ity_F32,
                            binop(Iop_Add64,mkexpr(addr),mkU64(8))) );
      assign( f32_3, loadLE(Ity_F32,
                            binop(Iop_Add64,mkexpr(addr),mkU64(12))) );
      delta += alen;
      DIP("vcvtps2pd %s,%s\n", dis_buf, nameYMMReg(rG));
   }

   /* F32->F64 widening is exact; no rounding mode needed. */
   putYMMRegLane64F( rG, 3, unop(Iop_F32toF64, mkexpr(f32_3)) );
   putYMMRegLane64F( rG, 2, unop(Iop_F32toF64, mkexpr(f32_2)) );
   putYMMRegLane64F( rG, 1, unop(Iop_F32toF64, mkexpr(f32_1)) );
   putYMMRegLane64F( rG, 0, unop(Iop_F32toF64, mkexpr(f32_0)) );
   return delta;
}
   10617 
   10618 
/* Disassemble 128-bit CVTPD2PS (and VCVTPD2PS.128 when isAvx):
   narrow the two F64 lanes of the E operand (xmm reg or m128) to two
   F32 lanes in the low half of the G register, using the current SSE
   rounding mode; the upper two 32-bit lanes are zeroed.  When isAvx,
   the upper 128 bits of the destination YMM register are also zeroed.
   Returns the updated instruction offset |delta|. */
static Long dis_CVTPD2PS_128 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp argV  = newTemp(Ity_V128);
   IRTemp rmode = newTemp(Ity_I32);
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getXMMReg(rE) );
      delta += 1;
      DIP("%scvtpd2ps %s,%s\n", isAvx ? "v" : "",
          nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      DIP("%scvtpd2ps %s,%s\n", isAvx ? "v" : "",
          dis_buf, nameXMMReg(rG) );
   }

   /* Reinterpret the two 64-bit halves as F64s for conversion. */
   assign( rmode, get_sse_roundingmode() );
   IRTemp t0 = newTemp(Ity_F64);
   IRTemp t1 = newTemp(Ity_F64);
   assign( t0, unop(Iop_ReinterpI64asF64,
                    unop(Iop_V128to64, mkexpr(argV))) );
   assign( t1, unop(Iop_ReinterpI64asF64,
                    unop(Iop_V128HIto64, mkexpr(argV))) );

   /* Narrow F64->F32 under the SSE rounding mode. */
#  define CVT(_t)  binop( Iop_F64toF32, mkexpr(rmode), mkexpr(_t) )
   putXMMRegLane32(  rG, 3, mkU32(0) );
   putXMMRegLane32(  rG, 2, mkU32(0) );
   putXMMRegLane32F( rG, 1, CVT(t1) );
   putXMMRegLane32F( rG, 0, CVT(t0) );
#  undef CVT
   if (isAvx)
      putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}
   10662 
   10663 
/* Disassemble 128-bit CVT[T]PS2DQ (and AVX forms when isAvx): convert
   the four F32 lanes of the E operand (xmm reg or m128) to four
   signed 32-bit integers in the G register.  The 'T' (truncating)
   variant rounds toward zero; otherwise the current SSE rounding mode
   is used.  When isAvx, the upper 128 bits of the destination YMM
   register are zeroed.  Returns the updated |delta|. */
static Long dis_CVTxPS2DQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
                                Long delta, Bool isAvx, Bool r2zero )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp argV  = newTemp(Ity_V128);
   IRTemp rmode = newTemp(Ity_I32);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1, t2, t3;

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getXMMReg(rE) );
      delta += 1;
      DIP("%scvt%sps2dq %s,%s\n",
          isAvx ? "v" : "", r2zero ? "t" : "", nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      DIP("%scvt%sps2dq %s,%s\n",
          isAvx ? "v" : "", r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
   }

   assign( rmode, r2zero ? mkU32((UInt)Irrm_ZERO)
                         : get_sse_roundingmode() );
   t0 = t1 = t2 = t3 = IRTemp_INVALID;
   breakupV128to32s( argV, &t3, &t2, &t1, &t0 );
   /* This is less than ideal.  If it turns out to be a performance
      bottleneck it can be improved. */
   /* Each lane: reinterpret as F32, widen exactly to F64, then
      convert to I32 under the chosen rounding mode. */
#  define CVT(_t)                             \
      binop( Iop_F64toI32S,                   \
             mkexpr(rmode),                   \
             unop( Iop_F32toF64,              \
                   unop( Iop_ReinterpI32asF32, mkexpr(_t))) )

   putXMMRegLane32( rG, 3, CVT(t3) );
   putXMMRegLane32( rG, 2, CVT(t2) );
   putXMMRegLane32( rG, 1, CVT(t1) );
   putXMMRegLane32( rG, 0, CVT(t0) );
#  undef CVT
   if (isAvx)
      putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}
   10712 
   10713 
/* Disassemble 256-bit VCVT[T]PS2DQ: convert the eight F32 lanes of
   the E operand (ymm reg or m256) to eight signed 32-bit integers in
   the G register.  The 'T' (truncating) variant rounds toward zero;
   otherwise the current SSE rounding mode is used.  Returns the
   updated instruction offset |delta|. */
static Long dis_CVTxPS2DQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
                                Long delta, Bool r2zero )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp argV  = newTemp(Ity_V256);
   IRTemp rmode = newTemp(Ity_I32);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1, t2, t3, t4, t5, t6, t7;

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getYMMReg(rE) );
      delta += 1;
      DIP("vcvt%sps2dq %s,%s\n",
          r2zero ? "t" : "", nameYMMReg(rE), nameYMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
      delta += alen;
      DIP("vcvt%sps2dq %s,%s\n",
          r2zero ? "t" : "", dis_buf, nameYMMReg(rG) );
   }

   assign( rmode, r2zero ? mkU32((UInt)Irrm_ZERO)
                         : get_sse_roundingmode() );
   t0 = t1 = t2 = t3 = t4 = t5 = t6 = t7 = IRTemp_INVALID;
   breakupV256to32s( argV, &t7, &t6, &t5, &t4, &t3, &t2, &t1, &t0 );
   /* This is less than ideal.  If it turns out to be a performance
      bottleneck it can be improved. */
   /* Each lane: reinterpret as F32, widen exactly to F64, then
      convert to I32 under the chosen rounding mode. */
#  define CVT(_t)                             \
      binop( Iop_F64toI32S,                   \
             mkexpr(rmode),                   \
             unop( Iop_F32toF64,              \
                   unop( Iop_ReinterpI32asF32, mkexpr(_t))) )

   putYMMRegLane32( rG, 7, CVT(t7) );
   putYMMRegLane32( rG, 6, CVT(t6) );
   putYMMRegLane32( rG, 5, CVT(t5) );
   putYMMRegLane32( rG, 4, CVT(t4) );
   putYMMRegLane32( rG, 3, CVT(t3) );
   putYMMRegLane32( rG, 2, CVT(t2) );
   putYMMRegLane32( rG, 1, CVT(t1) );
   putYMMRegLane32( rG, 0, CVT(t0) );
#  undef CVT

   return delta;
}
   10764 
   10765 
/* Disassemble 128-bit CVT[T]PD2DQ (and AVX forms when isAvx): convert
   the two F64 lanes of the E operand (xmm reg or m128) to two signed
   32-bit integers in the low half of the G register; the upper two
   32-bit lanes are zeroed.  The 'T' (truncating) variant rounds
   toward zero; otherwise the current SSE rounding mode is used.  When
   isAvx, the upper 128 bits of the destination YMM register are also
   zeroed.  Returns the updated instruction offset |delta|. */
static Long dis_CVTxPD2DQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
                                Long delta, Bool isAvx, Bool r2zero )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp argV  = newTemp(Ity_V128);
   IRTemp rmode = newTemp(Ity_I32);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1;

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getXMMReg(rE) );
      delta += 1;
      DIP("%scvt%spd2dq %s,%s\n",
          isAvx ? "v" : "", r2zero ? "t" : "", nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      /* NOTE(review): the 'x' suffix here (pd2dqx) appears intended to
         mark the 128-bit memory form in the trace output -- confirm
         against the other disassembly printers. */
      DIP("%scvt%spd2dqx %s,%s\n",
          isAvx ? "v" : "", r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
   }

   /* Truncating variant forces round-to-zero; otherwise honour the
      guest's SSE rounding mode (MXCSR). */
   if (r2zero) {
      assign(rmode, mkU32((UInt)Irrm_ZERO) );
   } else {
      assign( rmode, get_sse_roundingmode() );
   }

   /* Reinterpret the two 64-bit halves as F64s for conversion. */
   t0 = newTemp(Ity_F64);
   t1 = newTemp(Ity_F64);
   assign( t0, unop(Iop_ReinterpI64asF64,
                    unop(Iop_V128to64, mkexpr(argV))) );
   assign( t1, unop(Iop_ReinterpI64asF64,
                    unop(Iop_V128HIto64, mkexpr(argV))) );

#  define CVT(_t)  binop( Iop_F64toI32S,                   \
                          mkexpr(rmode),                   \
                          mkexpr(_t) )

   putXMMRegLane32( rG, 3, mkU32(0) );
   putXMMRegLane32( rG, 2, mkU32(0) );
   putXMMRegLane32( rG, 1, CVT(t1) );
   putXMMRegLane32( rG, 0, CVT(t0) );
#  undef CVT
   if (isAvx)
      putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}
   10819 
   10820 
/* Handle 256-bit VCVTPD2DQ and VCVTTPD2DQ: convert the four F64
   lanes of the ymm/m256 source to four I32 lanes in the xmm dest,
   zeroing the upper 128 bits of the ymm dest.  'r2zero' selects the
   truncating (VCVTT*) variants; otherwise the current SSE rounding
   mode is used.  Returns the updated instruction-stream offset. */
static Long dis_CVTxPD2DQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
                                Long delta, Bool r2zero )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp argV  = newTemp(Ity_V256);
   IRTemp rmode = newTemp(Ity_I32);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1, t2, t3;

   if (epartIsReg(modrm)) {
      /* Register source. */
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getYMMReg(rE) );
      delta += 1;
      DIP("vcvt%spd2dq %s,%s\n",
          r2zero ? "t" : "", nameYMMReg(rE), nameXMMReg(rG));
   } else {
      /* Memory source. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
      delta += alen;
      DIP("vcvt%spd2dqy %s,%s\n",
          r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
   }

   /* VCVTT* always truncates; plain VCVT* honours the SSE rounding
      mode in MXCSR. */
   if (r2zero) {
      assign(rmode, mkU32((UInt)Irrm_ZERO) );
   } else {
      assign( rmode, get_sse_roundingmode() );
   }

   t0 = IRTemp_INVALID;
   t1 = IRTemp_INVALID;
   t2 = IRTemp_INVALID;
   t3 = IRTemp_INVALID;
   /* Split the 256-bit source into its four 64-bit lanes. */
   breakupV256to64s( argV, &t3, &t2, &t1, &t0 );

#  define CVT(_t)  binop( Iop_F64toI32S,                   \
                          mkexpr(rmode),                   \
                          unop( Iop_ReinterpI64asF64,      \
                                mkexpr(_t) ) )

   putXMMRegLane32( rG, 3, CVT(t3) );
   putXMMRegLane32( rG, 2, CVT(t2) );
   putXMMRegLane32( rG, 1, CVT(t1) );
   putXMMRegLane32( rG, 0, CVT(t0) );
#  undef CVT
   /* Upper 128 bits of the ymm destination are zeroed. */
   putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}
   10873 
   10874 
/* Handle 128-bit (V)CVTDQ2PS: convert four signed I32 lanes of the
   xmm/m128 source to four F32 lanes, rounding per the current SSE
   rounding mode.  Each lane is widened to F64 first (exact) and then
   narrowed to F32.  Returns the updated instruction-stream offset. */
static Long dis_CVTDQ2PS_128 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp argV  = newTemp(Ity_V128);
   IRTemp rmode = newTemp(Ity_I32);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1, t2, t3;

   if (epartIsReg(modrm)) {
      /* Register source. */
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getXMMReg(rE) );
      delta += 1;
      DIP("%scvtdq2ps %s,%s\n",
          isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
   } else {
      /* Memory source. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      DIP("%scvtdq2ps %s,%s\n",
          isAvx ? "v" : "", dis_buf, nameXMMReg(rG) );
   }

   assign( rmode, get_sse_roundingmode() );
   t0 = IRTemp_INVALID;
   t1 = IRTemp_INVALID;
   t2 = IRTemp_INVALID;
   t3 = IRTemp_INVALID;
   /* Split the source into its four 32-bit lanes. */
   breakupV128to32s( argV, &t3, &t2, &t1, &t0 );

   /* I32 -> F64 is exact; only the F64 -> F32 step can round. */
#  define CVT(_t)  binop( Iop_F64toF32,                    \
                          mkexpr(rmode),                   \
                          unop(Iop_I32StoF64,mkexpr(_t)))

   putXMMRegLane32F( rG, 3, CVT(t3) );
   putXMMRegLane32F( rG, 2, CVT(t2) );
   putXMMRegLane32F( rG, 1, CVT(t1) );
   putXMMRegLane32F( rG, 0, CVT(t0) );
#  undef CVT
   /* AVX form additionally zeroes the upper 128 bits of the ymm
      destination. */
   if (isAvx)
      putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}
   10922 
/* Handle 256-bit VCVTDQ2PS: convert eight signed I32 lanes of the
   ymm/m256 source to eight F32 lanes, rounding per the current SSE
   rounding mode.  Each lane is widened to F64 first (exact) and then
   narrowed to F32.  Returns the updated instruction-stream offset. */
static Long dis_CVTDQ2PS_256 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   UChar  modrm  = getUChar(delta);
   IRTemp argV   = newTemp(Ity_V256);
   IRTemp rmode  = newTemp(Ity_I32);
   UInt   rG     = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1, t2, t3, t4, t5, t6, t7;

   if (epartIsReg(modrm)) {
      /* Register source. */
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getYMMReg(rE) );
      delta += 1;
      DIP("vcvtdq2ps %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
   } else {
      /* Memory source. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
      delta += alen;
      DIP("vcvtdq2ps %s,%s\n", dis_buf, nameYMMReg(rG) );
   }

   assign( rmode, get_sse_roundingmode() );
   t0 = IRTemp_INVALID;
   t1 = IRTemp_INVALID;
   t2 = IRTemp_INVALID;
   t3 = IRTemp_INVALID;
   t4 = IRTemp_INVALID;
   t5 = IRTemp_INVALID;
   t6 = IRTemp_INVALID;
   t7 = IRTemp_INVALID;
   /* Split the source into its eight 32-bit lanes. */
   breakupV256to32s( argV, &t7, &t6, &t5, &t4, &t3, &t2, &t1, &t0 );

   /* I32 -> F64 is exact; only the F64 -> F32 step can round. */
#  define CVT(_t)  binop( Iop_F64toF32,                    \
                          mkexpr(rmode),                   \
                          unop(Iop_I32StoF64,mkexpr(_t)))

   putYMMRegLane32F( rG, 7, CVT(t7) );
   putYMMRegLane32F( rG, 6, CVT(t6) );
   putYMMRegLane32F( rG, 5, CVT(t5) );
   putYMMRegLane32F( rG, 4, CVT(t4) );
   putYMMRegLane32F( rG, 3, CVT(t3) );
   putYMMRegLane32F( rG, 2, CVT(t2) );
   putYMMRegLane32F( rG, 1, CVT(t1) );
   putYMMRegLane32F( rG, 0, CVT(t0) );
#  undef CVT

   return delta;
}
   10974 
   10975 
   10976 static Long dis_PMOVMSKB_128 ( const VexAbiInfo* vbi, Prefix pfx,
   10977                                Long delta, Bool isAvx )
   10978 {
   10979    UChar modrm = getUChar(delta);
   10980    vassert(epartIsReg(modrm)); /* ensured by caller */
   10981    UInt   rE = eregOfRexRM(pfx,modrm);
   10982    UInt   rG = gregOfRexRM(pfx,modrm);
   10983    IRTemp t0 = newTemp(Ity_V128);
   10984    IRTemp t1 = newTemp(Ity_I32);
   10985    assign(t0, getXMMReg(rE));
   10986    assign(t1, unop(Iop_16Uto32, unop(Iop_GetMSBs8x16, mkexpr(t0))));
   10987    putIReg32(rG, mkexpr(t1));
   10988    DIP("%spmovmskb %s,%s\n", isAvx ? "v" : "", nameXMMReg(rE),
   10989        nameIReg32(rG));
   10990    delta += 1;
   10991    return delta;
   10992 }
   10993 
   10994 
   10995 static Long dis_PMOVMSKB_256 ( const VexAbiInfo* vbi, Prefix pfx,
   10996                                Long delta  )
   10997 {
   10998    UChar modrm = getUChar(delta);
   10999    vassert(epartIsReg(modrm)); /* ensured by caller */
   11000    UInt   rE = eregOfRexRM(pfx,modrm);
   11001    UInt   rG = gregOfRexRM(pfx,modrm);
   11002    IRTemp t0 = newTemp(Ity_V128);
   11003    IRTemp t1 = newTemp(Ity_V128);
   11004    IRTemp t2 = newTemp(Ity_I16);
   11005    IRTemp t3 = newTemp(Ity_I16);
   11006    assign(t0, getYMMRegLane128(rE, 0));
   11007    assign(t1, getYMMRegLane128(rE, 1));
   11008    assign(t2, unop(Iop_GetMSBs8x16, mkexpr(t0)));
   11009    assign(t3, unop(Iop_GetMSBs8x16, mkexpr(t1)));
   11010    putIReg32(rG, binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)));
   11011    DIP("vpmovmskb %s,%s\n", nameYMMReg(rE), nameIReg32(rG));
   11012    delta += 1;
   11013    return delta;
   11014 }
   11015 
   11016 
   11017 /* FIXME: why not just use InterleaveLO / InterleaveHI?  I think the
   11018    relevant ops are "xIsH ? InterleaveHI32x4 : InterleaveLO32x4". */
   11019 /* Does the maths for 128 bit versions of UNPCKLPS and UNPCKHPS */
   11020 static IRTemp math_UNPCKxPS_128 ( IRTemp sV, IRTemp dV, Bool xIsH )
   11021 {
   11022    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   11023    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   11024    breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   11025    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   11026    IRTemp res = newTemp(Ity_V128);
   11027    assign(res,  xIsH ? mkV128from32s( s3, d3, s2, d2 )
   11028                      : mkV128from32s( s1, d1, s0, d0 ));
   11029    return res;
   11030 }
   11031 
   11032 
   11033 /* FIXME: why not just use InterleaveLO / InterleaveHI ?? */
   11034 /* Does the maths for 128 bit versions of UNPCKLPD and UNPCKHPD */
   11035 static IRTemp math_UNPCKxPD_128 ( IRTemp sV, IRTemp dV, Bool xIsH )
   11036 {
   11037    IRTemp s1 = newTemp(Ity_I64);
   11038    IRTemp s0 = newTemp(Ity_I64);
   11039    IRTemp d1 = newTemp(Ity_I64);
   11040    IRTemp d0 = newTemp(Ity_I64);
   11041    assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
   11042    assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
   11043    assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
   11044    assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
   11045    IRTemp res = newTemp(Ity_V128);
   11046    assign(res, xIsH ? binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1))
   11047                     : binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)));
   11048    return res;
   11049 }
   11050 
   11051 
   11052 /* Does the maths for 256 bit versions of UNPCKLPD and UNPCKHPD.
   11053    Doesn't seem like this fits in either of the Iop_Interleave{LO,HI}
   11054    or the Iop_Cat{Odd,Even}Lanes idioms, hence just do it the stupid
   11055    way. */
   11056 static IRTemp math_UNPCKxPD_256 ( IRTemp sV, IRTemp dV, Bool xIsH )
   11057 {
   11058    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   11059    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   11060    breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
   11061    breakupV256to64s( sV, &s3, &s2, &s1, &s0 );
   11062    IRTemp res = newTemp(Ity_V256);
   11063    assign(res, xIsH
   11064                ? IRExpr_Qop(Iop_64x4toV256, mkexpr(s3), mkexpr(d3),
   11065                                             mkexpr(s1), mkexpr(d1))
   11066                : IRExpr_Qop(Iop_64x4toV256, mkexpr(s2), mkexpr(d2),
   11067                                             mkexpr(s0), mkexpr(d0)));
   11068    return res;
   11069 }
   11070 
   11071 
   11072 /* FIXME: this is really bad.  Surely can do something better here?
   11073    One observation is that the steering in the upper and lower 128 bit
   11074    halves is the same as with math_UNPCKxPS_128, so we simply split
   11075    into two halves, and use that.  Consequently any improvement in
   11076    math_UNPCKxPS_128 (probably, to use interleave-style primops)
   11077    benefits this too. */
   11078 static IRTemp math_UNPCKxPS_256 ( IRTemp sV, IRTemp dV, Bool xIsH )
   11079 {
   11080    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   11081    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   11082    breakupV256toV128s( sV, &sVhi, &sVlo );
   11083    breakupV256toV128s( dV, &dVhi, &dVlo );
   11084    IRTemp rVhi = math_UNPCKxPS_128(sVhi, dVhi, xIsH);
   11085    IRTemp rVlo = math_UNPCKxPS_128(sVlo, dVlo, xIsH);
   11086    IRTemp rV   = newTemp(Ity_V256);
   11087    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   11088    return rV;
   11089 }
   11090 
   11091 
   11092 static IRTemp math_SHUFPS_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11093 {
   11094    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   11095    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   11096    vassert(imm8 < 256);
   11097 
   11098    breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   11099    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   11100 
   11101 #  define SELD(n) ((n)==0 ? d0 : ((n)==1 ? d1 : ((n)==2 ? d2 : d3)))
   11102 #  define SELS(n) ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   11103    IRTemp res = newTemp(Ity_V128);
   11104    assign(res,
   11105           mkV128from32s( SELS((imm8>>6)&3), SELS((imm8>>4)&3),
   11106                          SELD((imm8>>2)&3), SELD((imm8>>0)&3) ) );
   11107 #  undef SELD
   11108 #  undef SELS
   11109    return res;
   11110 }
   11111 
   11112 
   11113 /* 256-bit SHUFPS appears to steer each of the 128-bit halves
   11114    identically.  Hence do the clueless thing and use math_SHUFPS_128
   11115    twice. */
   11116 static IRTemp math_SHUFPS_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11117 {
   11118    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   11119    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   11120    breakupV256toV128s( sV, &sVhi, &sVlo );
   11121    breakupV256toV128s( dV, &dVhi, &dVlo );
   11122    IRTemp rVhi = math_SHUFPS_128(sVhi, dVhi, imm8);
   11123    IRTemp rVlo = math_SHUFPS_128(sVlo, dVlo, imm8);
   11124    IRTemp rV   = newTemp(Ity_V256);
   11125    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   11126    return rV;
   11127 }
   11128 
   11129 
   11130 static IRTemp math_SHUFPD_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11131 {
   11132    IRTemp s1 = newTemp(Ity_I64);
   11133    IRTemp s0 = newTemp(Ity_I64);
   11134    IRTemp d1 = newTemp(Ity_I64);
   11135    IRTemp d0 = newTemp(Ity_I64);
   11136 
   11137    assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
   11138    assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
   11139    assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
   11140    assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
   11141 
   11142 #  define SELD(n) mkexpr((n)==0 ? d0 : d1)
   11143 #  define SELS(n) mkexpr((n)==0 ? s0 : s1)
   11144 
   11145    IRTemp res = newTemp(Ity_V128);
   11146    assign(res, binop( Iop_64HLtoV128,
   11147                       SELS((imm8>>1)&1), SELD((imm8>>0)&1) ) );
   11148 
   11149 #  undef SELD
   11150 #  undef SELS
   11151    return res;
   11152 }
   11153 
   11154 
   11155 static IRTemp math_SHUFPD_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11156 {
   11157    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   11158    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   11159    breakupV256toV128s( sV, &sVhi, &sVlo );
   11160    breakupV256toV128s( dV, &dVhi, &dVlo );
   11161    IRTemp rVhi = math_SHUFPD_128(sVhi, dVhi, (imm8 >> 2) & 3);
   11162    IRTemp rVlo = math_SHUFPD_128(sVlo, dVlo, imm8 & 3);
   11163    IRTemp rV   = newTemp(Ity_V256);
   11164    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   11165    return rV;
   11166 }
   11167 
   11168 
   11169 static IRTemp math_BLENDPD_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11170 {
   11171    UShort imm8_mask_16;
   11172    IRTemp imm8_mask = newTemp(Ity_V128);
   11173 
   11174    switch( imm8 & 3 ) {
   11175       case 0:  imm8_mask_16 = 0x0000; break;
   11176       case 1:  imm8_mask_16 = 0x00FF; break;
   11177       case 2:  imm8_mask_16 = 0xFF00; break;
   11178       case 3:  imm8_mask_16 = 0xFFFF; break;
   11179       default: vassert(0);            break;
   11180    }
   11181    assign( imm8_mask, mkV128( imm8_mask_16 ) );
   11182 
   11183    IRTemp res = newTemp(Ity_V128);
   11184    assign ( res, binop( Iop_OrV128,
   11185                         binop( Iop_AndV128, mkexpr(sV),
   11186                                             mkexpr(imm8_mask) ),
   11187                         binop( Iop_AndV128, mkexpr(dV),
   11188                                unop( Iop_NotV128, mkexpr(imm8_mask) ) ) ) );
   11189    return res;
   11190 }
   11191 
   11192 
   11193 static IRTemp math_BLENDPD_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11194 {
   11195    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   11196    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   11197    breakupV256toV128s( sV, &sVhi, &sVlo );
   11198    breakupV256toV128s( dV, &dVhi, &dVlo );
   11199    IRTemp rVhi = math_BLENDPD_128(sVhi, dVhi, (imm8 >> 2) & 3);
   11200    IRTemp rVlo = math_BLENDPD_128(sVlo, dVlo, imm8 & 3);
   11201    IRTemp rV   = newTemp(Ity_V256);
   11202    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   11203    return rV;
   11204 }
   11205 
   11206 
   11207 static IRTemp math_BLENDPS_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11208 {
   11209    UShort imm8_perms[16] = { 0x0000, 0x000F, 0x00F0, 0x00FF, 0x0F00,
   11210                              0x0F0F, 0x0FF0, 0x0FFF, 0xF000, 0xF00F,
   11211                              0xF0F0, 0xF0FF, 0xFF00, 0xFF0F, 0xFFF0,
   11212                              0xFFFF };
   11213    IRTemp imm8_mask = newTemp(Ity_V128);
   11214    assign( imm8_mask, mkV128( imm8_perms[ (imm8 & 15) ] ) );
   11215 
   11216    IRTemp res = newTemp(Ity_V128);
   11217    assign ( res, binop( Iop_OrV128,
   11218                         binop( Iop_AndV128, mkexpr(sV),
   11219                                             mkexpr(imm8_mask) ),
   11220                         binop( Iop_AndV128, mkexpr(dV),
   11221                                unop( Iop_NotV128, mkexpr(imm8_mask) ) ) ) );
   11222    return res;
   11223 }
   11224 
   11225 
   11226 static IRTemp math_BLENDPS_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11227 {
   11228    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   11229    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   11230    breakupV256toV128s( sV, &sVhi, &sVlo );
   11231    breakupV256toV128s( dV, &dVhi, &dVlo );
   11232    IRTemp rVhi = math_BLENDPS_128(sVhi, dVhi, (imm8 >> 4) & 15);
   11233    IRTemp rVlo = math_BLENDPS_128(sVlo, dVlo, imm8 & 15);
   11234    IRTemp rV   = newTemp(Ity_V256);
   11235    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   11236    return rV;
   11237 }
   11238 
   11239 
   11240 static IRTemp math_PBLENDW_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11241 {
   11242    /* Make w be a 16-bit version of imm8, formed by duplicating each
   11243       bit in imm8. */
   11244    Int i;
   11245    UShort imm16 = 0;
   11246    for (i = 0; i < 8; i++) {
   11247       if (imm8 & (1 << i))
   11248          imm16 |= (3 << (2*i));
   11249    }
   11250    IRTemp imm16_mask = newTemp(Ity_V128);
   11251    assign( imm16_mask, mkV128( imm16 ));
   11252 
   11253    IRTemp res = newTemp(Ity_V128);
   11254    assign ( res, binop( Iop_OrV128,
   11255                         binop( Iop_AndV128, mkexpr(sV),
   11256                                             mkexpr(imm16_mask) ),
   11257                         binop( Iop_AndV128, mkexpr(dV),
   11258                                unop( Iop_NotV128, mkexpr(imm16_mask) ) ) ) );
   11259    return res;
   11260 }
   11261 
   11262 
   11263 static IRTemp math_PMULUDQ_128 ( IRTemp sV, IRTemp dV )
   11264 {
   11265    /* This is a really poor translation -- could be improved if
   11266       performance critical */
   11267    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   11268    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   11269    breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   11270    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   11271    IRTemp res = newTemp(Ity_V128);
   11272    assign(res, binop(Iop_64HLtoV128,
   11273                      binop( Iop_MullU32, mkexpr(d2), mkexpr(s2)),
   11274                      binop( Iop_MullU32, mkexpr(d0), mkexpr(s0)) ));
   11275    return res;
   11276 }
   11277 
   11278 
   11279 static IRTemp math_PMULUDQ_256 ( IRTemp sV, IRTemp dV )
   11280 {
   11281    /* This is a really poor translation -- could be improved if
   11282       performance critical */
   11283    IRTemp sHi, sLo, dHi, dLo;
   11284    sHi = sLo = dHi = dLo = IRTemp_INVALID;
   11285    breakupV256toV128s( dV, &dHi, &dLo);
   11286    breakupV256toV128s( sV, &sHi, &sLo);
   11287    IRTemp res = newTemp(Ity_V256);
   11288    assign(res, binop(Iop_V128HLtoV256,
   11289                      mkexpr(math_PMULUDQ_128(sHi, dHi)),
   11290                      mkexpr(math_PMULUDQ_128(sLo, dLo))));
   11291    return res;
   11292 }
   11293 
   11294 
   11295 static IRTemp math_PMULDQ_128 ( IRTemp dV, IRTemp sV )
   11296 {
   11297    /* This is a really poor translation -- could be improved if
   11298       performance critical */
   11299    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   11300    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   11301    breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   11302    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   11303    IRTemp res = newTemp(Ity_V128);
   11304    assign(res, binop(Iop_64HLtoV128,
   11305                      binop( Iop_MullS32, mkexpr(d2), mkexpr(s2)),
   11306                      binop( Iop_MullS32, mkexpr(d0), mkexpr(s0)) ));
   11307    return res;
   11308 }
   11309 
   11310 
   11311 static IRTemp math_PMULDQ_256 ( IRTemp sV, IRTemp dV )
   11312 {
   11313    /* This is a really poor translation -- could be improved if
   11314       performance critical */
   11315    IRTemp sHi, sLo, dHi, dLo;
   11316    sHi = sLo = dHi = dLo = IRTemp_INVALID;
   11317    breakupV256toV128s( dV, &dHi, &dLo);
   11318    breakupV256toV128s( sV, &sHi, &sLo);
   11319    IRTemp res = newTemp(Ity_V256);
   11320    assign(res, binop(Iop_V128HLtoV256,
   11321                      mkexpr(math_PMULDQ_128(sHi, dHi)),
   11322                      mkexpr(math_PMULDQ_128(sLo, dLo))));
   11323    return res;
   11324 }
   11325 
   11326 
   11327 static IRTemp math_PMADDWD_128 ( IRTemp dV, IRTemp sV )
   11328 {
   11329    IRTemp sVhi, sVlo, dVhi, dVlo;
   11330    IRTemp resHi = newTemp(Ity_I64);
   11331    IRTemp resLo = newTemp(Ity_I64);
   11332    sVhi = sVlo = dVhi = dVlo = IRTemp_INVALID;
   11333    breakupV128to64s( sV, &sVhi, &sVlo );
   11334    breakupV128to64s( dV, &dVhi, &dVlo );
   11335    assign( resHi, mkIRExprCCall(Ity_I64, 0/*regparms*/,
   11336                                 "amd64g_calculate_mmx_pmaddwd",
   11337                                 &amd64g_calculate_mmx_pmaddwd,
   11338                                 mkIRExprVec_2( mkexpr(sVhi), mkexpr(dVhi))));
   11339    assign( resLo, mkIRExprCCall(Ity_I64, 0/*regparms*/,
   11340                                 "amd64g_calculate_mmx_pmaddwd",
   11341                                 &amd64g_calculate_mmx_pmaddwd,
   11342                                 mkIRExprVec_2( mkexpr(sVlo), mkexpr(dVlo))));
   11343    IRTemp res = newTemp(Ity_V128);
   11344    assign( res, binop(Iop_64HLtoV128, mkexpr(resHi), mkexpr(resLo))) ;
   11345    return res;
   11346 }
   11347 
   11348 
   11349 static IRTemp math_PMADDWD_256 ( IRTemp dV, IRTemp sV )
   11350 {
   11351    IRTemp sHi, sLo, dHi, dLo;
   11352    sHi = sLo = dHi = dLo = IRTemp_INVALID;
   11353    breakupV256toV128s( dV, &dHi, &dLo);
   11354    breakupV256toV128s( sV, &sHi, &sLo);
   11355    IRTemp res = newTemp(Ity_V256);
   11356    assign(res, binop(Iop_V128HLtoV256,
   11357                      mkexpr(math_PMADDWD_128(dHi, sHi)),
   11358                      mkexpr(math_PMADDWD_128(dLo, sLo))));
   11359    return res;
   11360 }
   11361 
   11362 
   11363 static IRTemp math_ADDSUBPD_128 ( IRTemp dV, IRTemp sV )
   11364 {
   11365    IRTemp addV = newTemp(Ity_V128);
   11366    IRTemp subV = newTemp(Ity_V128);
   11367    IRTemp a1   = newTemp(Ity_I64);
   11368    IRTemp s0   = newTemp(Ity_I64);
   11369    IRTemp rm   = newTemp(Ity_I32);
   11370 
   11371    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   11372    assign( addV, triop(Iop_Add64Fx2, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11373    assign( subV, triop(Iop_Sub64Fx2, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11374 
   11375    assign( a1, unop(Iop_V128HIto64, mkexpr(addV) ));
   11376    assign( s0, unop(Iop_V128to64,   mkexpr(subV) ));
   11377 
   11378    IRTemp res = newTemp(Ity_V128);
   11379    assign( res, binop(Iop_64HLtoV128, mkexpr(a1), mkexpr(s0)) );
   11380    return res;
   11381 }
   11382 
   11383 
   11384 static IRTemp math_ADDSUBPD_256 ( IRTemp dV, IRTemp sV )
   11385 {
   11386    IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
   11387    IRTemp addV = newTemp(Ity_V256);
   11388    IRTemp subV = newTemp(Ity_V256);
   11389    IRTemp rm   = newTemp(Ity_I32);
   11390    a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11391 
   11392    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   11393    assign( addV, triop(Iop_Add64Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11394    assign( subV, triop(Iop_Sub64Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11395 
   11396    breakupV256to64s( addV, &a3, &a2, &a1, &a0 );
   11397    breakupV256to64s( subV, &s3, &s2, &s1, &s0 );
   11398 
   11399    IRTemp res = newTemp(Ity_V256);
   11400    assign( res, mkV256from64s( a3, s2, a1, s0 ) );
   11401    return res;
   11402 }
   11403 
   11404 
   11405 static IRTemp math_ADDSUBPS_128 ( IRTemp dV, IRTemp sV )
   11406 {
   11407    IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
   11408    IRTemp addV = newTemp(Ity_V128);
   11409    IRTemp subV = newTemp(Ity_V128);
   11410    IRTemp rm   = newTemp(Ity_I32);
   11411    a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11412 
   11413    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   11414    assign( addV, triop(Iop_Add32Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11415    assign( subV, triop(Iop_Sub32Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11416 
   11417    breakupV128to32s( addV, &a3, &a2, &a1, &a0 );
   11418    breakupV128to32s( subV, &s3, &s2, &s1, &s0 );
   11419 
   11420    IRTemp res = newTemp(Ity_V128);
   11421    assign( res, mkV128from32s( a3, s2, a1, s0 ) );
   11422    return res;
   11423 }
   11424 
   11425 
   11426 static IRTemp math_ADDSUBPS_256 ( IRTemp dV, IRTemp sV )
   11427 {
   11428    IRTemp a7, a6, a5, a4, a3, a2, a1, a0;
   11429    IRTemp s7, s6, s5, s4, s3, s2, s1, s0;
   11430    IRTemp addV = newTemp(Ity_V256);
   11431    IRTemp subV = newTemp(Ity_V256);
   11432    IRTemp rm   = newTemp(Ity_I32);
   11433    a7 = a6 = a5 = a4 = a3 = a2 = a1 = a0 = IRTemp_INVALID;
   11434    s7 = s6 = s5 = s4 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11435 
   11436    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   11437    assign( addV, triop(Iop_Add32Fx8, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11438    assign( subV, triop(Iop_Sub32Fx8, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11439 
   11440    breakupV256to32s( addV, &a7, &a6, &a5, &a4, &a3, &a2, &a1, &a0 );
   11441    breakupV256to32s( subV, &s7, &s6, &s5, &s4, &s3, &s2, &s1, &s0 );
   11442 
   11443    IRTemp res = newTemp(Ity_V256);
   11444    assign( res, mkV256from32s( a7, s6, a5, s4, a3, s2, a1, s0 ) );
   11445    return res;
   11446 }
   11447 
   11448 
/* Handle 128 bit PSHUFLW and PSHUFHW: permute the four 16-bit lanes
   of one 64-bit half of the source (high half if xIsH, else low)
   according to the four 2-bit fields of imm8; the other half is
   copied through unchanged.  Returns the updated instruction-stream
   offset. */
static Long dis_PSHUFxW_128 ( const VexAbiInfo* vbi, Prefix pfx,
                              Long delta, Bool isAvx, Bool xIsH )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   UInt   rG = gregOfRexRM(pfx,modrm);
   UInt   imm8;
   IRTemp sVmut, dVmut, sVcon, sV, dV, s3, s2, s1, s0;
   s3 = s2 = s1 = s0 = IRTemp_INVALID;
   sV    = newTemp(Ity_V128);
   dV    = newTemp(Ity_V128);
   sVmut = newTemp(Ity_I64);
   dVmut = newTemp(Ity_I64);
   sVcon = newTemp(Ity_I64);
   if (epartIsReg(modrm)) {
      /* Register source; imm8 follows the modrm byte. */
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( sV, getXMMReg(rE) );
      imm8 = (UInt)getUChar(delta+1);
      delta += 1+1;
      DIP("%spshuf%cw $%u,%s,%s\n",
          isAvx ? "v" : "", xIsH ? 'h' : 'l',
          imm8, nameXMMReg(rE), nameXMMReg(rG));
   } else {
      /* Memory source; imm8 follows the addressing bytes. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
      imm8 = (UInt)getUChar(delta+alen);
      delta += alen+1;
      DIP("%spshuf%cw $%u,%s,%s\n",
          isAvx ? "v" : "", xIsH ? 'h' : 'l',
          imm8, dis_buf, nameXMMReg(rG));
   }

   /* Get the to-be-changed (mut) and unchanging (con) bits of the
      source. */
   assign( sVmut, unop(xIsH ? Iop_V128HIto64 : Iop_V128to64,   mkexpr(sV)) );
   assign( sVcon, unop(xIsH ? Iop_V128to64   : Iop_V128HIto64, mkexpr(sV)) );

   /* Permute the four 16-bit lanes of the mutable half per imm8. */
   breakup64to16s( sVmut, &s3, &s2, &s1, &s0 );
#  define SEL(n) \
             ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   assign(dVmut, mk64from16s( SEL((imm8>>6)&3), SEL((imm8>>4)&3),
                              SEL((imm8>>2)&3), SEL((imm8>>0)&3) ));
#  undef SEL

   /* Reassemble: permuted half in its original position, other half
      copied through. */
   assign(dV, xIsH ? binop(Iop_64HLtoV128, mkexpr(dVmut), mkexpr(sVcon))
                   : binop(Iop_64HLtoV128, mkexpr(sVcon), mkexpr(dVmut)) );

   /* AVX form zeroes the upper ymm lane as well. */
   (isAvx ? putYMMRegLoAndZU : putXMMReg)(rG, mkexpr(dV));
   return delta;
}
   11502 
   11503 
/* Handle 256 bit PSHUFLW and PSHUFHW: in each 128-bit half of the
   source, permute the four 16-bit lanes of one 64-bit quarter (the
   upper quarter of each half if xIsH, else the lower) per the four
   2-bit fields of imm8; the other quarters pass through unchanged.
   Returns the updated instruction-stream offset. */
static Long dis_PSHUFxW_256 ( const VexAbiInfo* vbi, Prefix pfx,
                              Long delta, Bool xIsH )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   UInt   rG = gregOfRexRM(pfx,modrm);
   UInt   imm8;
   IRTemp sV, s[8], sV64[4], dVhi, dVlo;
   sV64[3] = sV64[2] = sV64[1] = sV64[0] = IRTemp_INVALID;
   s[7] = s[6] = s[5] = s[4] = s[3] = s[2] = s[1] = s[0] = IRTemp_INVALID;
   sV    = newTemp(Ity_V256);
   dVhi  = newTemp(Ity_I64);
   dVlo  = newTemp(Ity_I64);
   if (epartIsReg(modrm)) {
      /* Register source; imm8 follows the modrm byte. */
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( sV, getYMMReg(rE) );
      imm8 = (UInt)getUChar(delta+1);
      delta += 1+1;
      DIP("vpshuf%cw $%u,%s,%s\n", xIsH ? 'h' : 'l',
          imm8, nameYMMReg(rE), nameYMMReg(rG));
   } else {
      /* Memory source; imm8 follows the addressing bytes. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
      assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
      imm8 = (UInt)getUChar(delta+alen);
      delta += alen+1;
      DIP("vpshuf%cw $%u,%s,%s\n", xIsH ? 'h' : 'l',
          imm8, dis_buf, nameYMMReg(rG));
   }

   /* Split out the 16-bit lanes of the two to-be-permuted 64-bit
      quarters: s[7..4] from the upper half's quarter, s[3..0] from
      the lower half's. */
   breakupV256to64s( sV, &sV64[3], &sV64[2], &sV64[1], &sV64[0] );
   breakup64to16s( sV64[xIsH ? 3 : 2], &s[7], &s[6], &s[5], &s[4] );
   breakup64to16s( sV64[xIsH ? 1 : 0], &s[3], &s[2], &s[1], &s[0] );

   /* Both halves are steered by the same imm8. */
   assign( dVhi, mk64from16s( s[4 + ((imm8>>6)&3)], s[4 + ((imm8>>4)&3)],
                              s[4 + ((imm8>>2)&3)], s[4 + ((imm8>>0)&3)] ) );
   assign( dVlo, mk64from16s( s[0 + ((imm8>>6)&3)], s[0 + ((imm8>>4)&3)],
                              s[0 + ((imm8>>2)&3)], s[0 + ((imm8>>0)&3)] ) );
   /* Reassemble, placing each permuted quarter back in its original
      position and passing the others through. */
   putYMMReg( rG, mkV256from64s( xIsH ? dVhi : sV64[3],
                                 xIsH ? sV64[2] : dVhi,
                                 xIsH ? dVlo : sV64[1],
                                 xIsH ? sV64[0] : dVlo ) );
   return delta;
}
   11550 
   11551 
   11552 static Long dis_PEXTRW_128_EregOnly_toG ( const VexAbiInfo* vbi, Prefix pfx,
   11553                                           Long delta, Bool isAvx )
   11554 {
   11555    Long   deltaIN = delta;
   11556    UChar  modrm   = getUChar(delta);
   11557    UInt   rG      = gregOfRexRM(pfx,modrm);
   11558    IRTemp sV      = newTemp(Ity_V128);
   11559    IRTemp d16     = newTemp(Ity_I16);
   11560    UInt   imm8;
   11561    IRTemp s0, s1, s2, s3;
   11562    if (epartIsReg(modrm)) {
   11563       UInt rE = eregOfRexRM(pfx,modrm);
   11564       assign(sV, getXMMReg(rE));
   11565       imm8 = getUChar(delta+1) & 7;
   11566       delta += 1+1;
   11567       DIP("%spextrw $%u,%s,%s\n", isAvx ? "v" : "",
   11568           imm8, nameXMMReg(rE), nameIReg32(rG));
   11569    } else {
   11570       /* The memory case is disallowed, apparently. */
   11571       return deltaIN; /* FAIL */
   11572    }
   11573    s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11574    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   11575    switch (imm8) {
   11576       case 0:  assign(d16, unop(Iop_32to16,   mkexpr(s0))); break;
   11577       case 1:  assign(d16, unop(Iop_32HIto16, mkexpr(s0))); break;
   11578       case 2:  assign(d16, unop(Iop_32to16,   mkexpr(s1))); break;
   11579       case 3:  assign(d16, unop(Iop_32HIto16, mkexpr(s1))); break;
   11580       case 4:  assign(d16, unop(Iop_32to16,   mkexpr(s2))); break;
   11581       case 5:  assign(d16, unop(Iop_32HIto16, mkexpr(s2))); break;
   11582       case 6:  assign(d16, unop(Iop_32to16,   mkexpr(s3))); break;
   11583       case 7:  assign(d16, unop(Iop_32HIto16, mkexpr(s3))); break;
   11584       default: vassert(0);
   11585    }
   11586    putIReg32(rG, unop(Iop_16Uto32, mkexpr(d16)));
   11587    return delta;
   11588 }
   11589 
   11590 
   11591 static Long dis_CVTDQ2PD_128 ( const VexAbiInfo* vbi, Prefix pfx,
   11592                                Long delta, Bool isAvx )
   11593 {
   11594    IRTemp addr  = IRTemp_INVALID;
   11595    Int    alen  = 0;
   11596    HChar  dis_buf[50];
   11597    UChar  modrm = getUChar(delta);
   11598    IRTemp arg64 = newTemp(Ity_I64);
   11599    UInt   rG    = gregOfRexRM(pfx,modrm);
   11600    const HChar* mbV   = isAvx ? "v" : "";
   11601    if (epartIsReg(modrm)) {
   11602       UInt rE = eregOfRexRM(pfx,modrm);
   11603       assign( arg64, getXMMRegLane64(rE, 0) );
   11604       delta += 1;
   11605       DIP("%scvtdq2pd %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG));
   11606    } else {
   11607       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11608       assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   11609       delta += alen;
   11610       DIP("%scvtdq2pd %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
   11611    }
   11612    putXMMRegLane64F(
   11613       rG, 0,
   11614       unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)))
   11615    );
   11616    putXMMRegLane64F(
   11617       rG, 1,
   11618       unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)))
   11619    );
   11620    if (isAvx)
   11621       putYMMRegLane128(rG, 1, mkV128(0));
   11622    return delta;
   11623 }
   11624 
   11625 
   11626 static Long dis_STMXCSR ( const VexAbiInfo* vbi, Prefix pfx,
   11627                           Long delta, Bool isAvx )
   11628 {
   11629    IRTemp addr  = IRTemp_INVALID;
   11630    Int    alen  = 0;
   11631    HChar  dis_buf[50];
   11632    UChar  modrm = getUChar(delta);
   11633    vassert(!epartIsReg(modrm)); /* ensured by caller */
   11634    vassert(gregOfRexRM(pfx,modrm) == 3); /* ditto */
   11635 
   11636    addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11637    delta += alen;
   11638 
   11639    /* Fake up a native SSE mxcsr word.  The only thing it depends on
   11640       is SSEROUND[1:0], so call a clean helper to cook it up.
   11641    */
   11642    /* ULong amd64h_create_mxcsr ( ULong sseround ) */
   11643    DIP("%sstmxcsr %s\n",  isAvx ? "v" : "", dis_buf);
   11644    storeLE(
   11645       mkexpr(addr),
   11646       unop(Iop_64to32,
   11647            mkIRExprCCall(
   11648               Ity_I64, 0/*regp*/,
   11649               "amd64g_create_mxcsr", &amd64g_create_mxcsr,
   11650               mkIRExprVec_1( unop(Iop_32Uto64,get_sse_roundingmode()) )
   11651            )
   11652       )
   11653    );
   11654    return delta;
   11655 }
   11656 
   11657 
   11658 static Long dis_LDMXCSR ( const VexAbiInfo* vbi, Prefix pfx,
   11659                           Long delta, Bool isAvx )
   11660 {
   11661    IRTemp addr  = IRTemp_INVALID;
   11662    Int    alen  = 0;
   11663    HChar  dis_buf[50];
   11664    UChar  modrm = getUChar(delta);
   11665    vassert(!epartIsReg(modrm)); /* ensured by caller */
   11666    vassert(gregOfRexRM(pfx,modrm) == 2); /* ditto */
   11667 
   11668    IRTemp t64 = newTemp(Ity_I64);
   11669    IRTemp ew  = newTemp(Ity_I32);
   11670 
   11671    addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11672    delta += alen;
   11673    DIP("%sldmxcsr %s\n",  isAvx ? "v" : "", dis_buf);
   11674 
   11675    /* The only thing we observe in %mxcsr is the rounding mode.
   11676       Therefore, pass the 32-bit value (SSE native-format control
   11677       word) to a clean helper, getting back a 64-bit value, the
   11678       lower half of which is the SSEROUND value to store, and the
   11679       upper half of which is the emulation-warning token which may
   11680       be generated.
   11681    */
   11682    /* ULong amd64h_check_ldmxcsr ( ULong ); */
   11683    assign( t64, mkIRExprCCall(
   11684                    Ity_I64, 0/*regparms*/,
   11685                    "amd64g_check_ldmxcsr",
   11686                    &amd64g_check_ldmxcsr,
   11687                    mkIRExprVec_1(
   11688                       unop(Iop_32Uto64,
   11689                            loadLE(Ity_I32, mkexpr(addr))
   11690                       )
   11691                    )
   11692                 )
   11693          );
   11694 
   11695    put_sse_roundingmode( unop(Iop_64to32, mkexpr(t64)) );
   11696    assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
   11697    put_emwarn( mkexpr(ew) );
   11698    /* Finally, if an emulation warning was reported, side-exit to
   11699       the next insn, reporting the warning, so that Valgrind's
   11700       dispatcher sees the warning. */
   11701    stmt(
   11702       IRStmt_Exit(
   11703          binop(Iop_CmpNE64, unop(Iop_32Uto64,mkexpr(ew)), mkU64(0)),
   11704          Ijk_EmWarn,
   11705          IRConst_U64(guest_RIP_bbstart+delta),
   11706          OFFB_RIP
   11707       )
   11708    );
   11709    return delta;
   11710 }
   11711 
   11712 
static void gen_XSAVE_SEQUENCE ( IRTemp addr, IRTemp rfbm )
{
   /* Emit IR that performs an XSAVE of components 0 (x87), 1 (SSE)
      and 2 (AVX upper halves) to the memory area at |addr|, each
      component gated by the corresponding bit of |rfbm| (the
      requested-feature bitmap, already masked against XCR0 by the
      caller).  The XSAVE header (XSTATE_BV at offset 512) is NOT
      written here; dis_XSAVE does that separately. */

   /* ------ rfbm[0] gates the x87 state ------ */

   /* Uses dirty helper:
         void amd64g_do_XSAVE_COMPONENT_0 ( VexGuestAMD64State*, ULong )
   */
   IRDirty* d0 = unsafeIRDirty_0_N (
                    0/*regparms*/,
                    "amd64g_dirtyhelper_XSAVE_COMPONENT_0",
                    &amd64g_dirtyhelper_XSAVE_COMPONENT_0,
                    mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
                 );
   /* Only run the helper when rfbm bit 0 is set. */
   d0->guard = binop(Iop_CmpEQ64, binop(Iop_And64, mkexpr(rfbm), mkU64(1)),
                     mkU64(1));

   /* Declare we're writing memory.  Really, bytes 24 through 31
      (MXCSR and MXCSR_MASK) aren't written, but we can't express more
      than 1 memory area here, so just mark the whole thing as
      written. */
   d0->mFx   = Ifx_Write;
   d0->mAddr = mkexpr(addr);
   d0->mSize = 160;

   /* declare we're reading guest state */
   d0->nFxState = 5;
   vex_bzero(&d0->fxState, sizeof(d0->fxState));

   d0->fxState[0].fx     = Ifx_Read;
   d0->fxState[0].offset = OFFB_FTOP;
   d0->fxState[0].size   = sizeof(UInt);

   d0->fxState[1].fx     = Ifx_Read;
   d0->fxState[1].offset = OFFB_FPREGS;
   d0->fxState[1].size   = 8 * sizeof(ULong);

   d0->fxState[2].fx     = Ifx_Read;
   d0->fxState[2].offset = OFFB_FPTAGS;
   d0->fxState[2].size   = 8 * sizeof(UChar);

   d0->fxState[3].fx     = Ifx_Read;
   d0->fxState[3].offset = OFFB_FPROUND;
   d0->fxState[3].size   = sizeof(ULong);

   d0->fxState[4].fx     = Ifx_Read;
   d0->fxState[4].offset = OFFB_FC3210;
   d0->fxState[4].size   = sizeof(ULong);

   stmt( IRStmt_Dirty(d0) );

   /* ------ rfbm[1] gates the SSE state ------ */

   IRTemp rfbm_1    = newTemp(Ity_I64);
   IRTemp rfbm_1or2 = newTemp(Ity_I64);
   assign(rfbm_1,    binop(Iop_And64, mkexpr(rfbm), mkU64(2)));
   assign(rfbm_1or2, binop(Iop_And64, mkexpr(rfbm), mkU64(6)));

   IRExpr* guard_1    = binop(Iop_CmpEQ64, mkexpr(rfbm_1),    mkU64(2));
   IRExpr* guard_1or2 = binop(Iop_CmpNE64, mkexpr(rfbm_1or2), mkU64(0));

   /* Uses dirty helper:
         void amd64g_do_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS
                 ( VexGuestAMD64State*, ULong )
      This creates only MXCSR and MXCSR_MASK.  We need to do this if
      either components 1 (SSE) or 2 (AVX) are requested.  Hence the
      guard condition is a bit more complex.
   */
   IRDirty* d1 = unsafeIRDirty_0_N (
                    0/*regparms*/,
                    "amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS",
                    &amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS,
                    mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
                 );
   d1->guard = guard_1or2;

   /* Declare we're writing memory: MXCSR and MXCSR_MASK.  Note that
      the code for rfbm[0] just above claims a write of 0 .. 159, so
      this duplicates it.  But at least correctly connects 24 .. 31 to
      the MXCSR guest state representation (SSEROUND field). */
   d1->mFx   = Ifx_Write;
   d1->mAddr = binop(Iop_Add64, mkexpr(addr), mkU64(24));
   d1->mSize = 8;

   /* declare we're reading guest state */
   d1->nFxState = 1;
   vex_bzero(&d1->fxState, sizeof(d1->fxState));

   d1->fxState[0].fx     = Ifx_Read;
   d1->fxState[0].offset = OFFB_SSEROUND;
   d1->fxState[0].size   = sizeof(ULong);

   /* Call the helper.  This creates MXCSR and MXCSR_MASK but nothing
      else.  We do the actual register array, XMM[0..15], separately,
      in order that any undefinedness in the XMM registers is tracked
      separately by Memcheck and does not "infect" the in-memory
      shadow for the other parts of the image. */
   stmt( IRStmt_Dirty(d1) );

   /* And now the XMMs themselves.  Guarded stores into the
      XMM-register area at offsets 160 .. 415, only if rfbm[1] alone
      is set. */
   UInt reg;
   for (reg = 0; reg < 16; reg++) {
      stmt( IRStmt_StoreG(
               Iend_LE,
               binop(Iop_Add64, mkexpr(addr), mkU64(160 + reg * 16)),
               getXMMReg(reg),
               guard_1
      ));
   }

   /* ------ rfbm[2] gates the AVX state ------ */
   /* Component 2 is just a bunch of register saves, so we'll do it
      inline, just to be simple and to be Memcheck friendly.  The
      YMM upper halves live at offsets 576 .. 831. */

   IRTemp rfbm_2 = newTemp(Ity_I64);
   assign(rfbm_2, binop(Iop_And64, mkexpr(rfbm), mkU64(4)));

   IRExpr* guard_2 = binop(Iop_CmpEQ64, mkexpr(rfbm_2), mkU64(4));

   for (reg = 0; reg < 16; reg++) {
      stmt( IRStmt_StoreG(
               Iend_LE,
               binop(Iop_Add64, mkexpr(addr), mkU64(576 + reg * 16)),
               getYMMRegLane128(reg,1),
               guard_2
      ));
   }
}
   11840 
   11841 
   11842 static Long dis_XSAVE ( const VexAbiInfo* vbi,
   11843                         Prefix pfx, Long delta, Int sz )
   11844 {
   11845    /* Note that the presence or absence of REX.W (indicated here by
   11846       |sz|) slightly affects the written format: whether the saved FPU
   11847       IP and DP pointers are 64 or 32 bits.  But the helper function
   11848       we call simply writes zero bits in the relevant fields, which
   11849       are 64 bits regardless of what REX.W is, and so it's good enough
   11850       (iow, equally broken) in both cases. */
   11851    IRTemp addr  = IRTemp_INVALID;
   11852    Int    alen  = 0;
   11853    HChar  dis_buf[50];
   11854    UChar  modrm = getUChar(delta);
   11855    vassert(!epartIsReg(modrm)); /* ensured by caller */
   11856    vassert(sz == 4 || sz == 8); /* ditto */
   11857 
   11858    addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11859    delta += alen;
   11860    gen_SEGV_if_not_64_aligned(addr);
   11861 
   11862    DIP("%sxsave %s\n", sz==8 ? "rex64/" : "", dis_buf);
   11863 
   11864    /* VEX's caller is assumed to have checked this. */
   11865    const ULong aSSUMED_XCR0_VALUE = 7;
   11866 
   11867    IRTemp rfbm = newTemp(Ity_I64);
   11868    assign(rfbm,
   11869           binop(Iop_And64,
   11870                 binop(Iop_Or64,
   11871                       binop(Iop_Shl64,
   11872                             unop(Iop_32Uto64, getIRegRDX(4)), mkU8(32)),
   11873                       unop(Iop_32Uto64, getIRegRAX(4))),
   11874                 mkU64(aSSUMED_XCR0_VALUE)));
   11875 
   11876    gen_XSAVE_SEQUENCE(addr, rfbm);
   11877 
   11878    /* Finally, we need to update XSTATE_BV in the XSAVE header area, by
   11879       OR-ing the RFBM value into it. */
   11880    IRTemp addr_plus_512 = newTemp(Ity_I64);
   11881    assign(addr_plus_512, binop(Iop_Add64, mkexpr(addr), mkU64(512)));
   11882    storeLE( mkexpr(addr_plus_512),
   11883             binop(Iop_Or8,
   11884                   unop(Iop_64to8, mkexpr(rfbm)),
   11885                   loadLE(Ity_I8, mkexpr(addr_plus_512))) );
   11886 
   11887    return delta;
   11888 }
   11889 
   11890 
   11891 static Long dis_FXSAVE ( const VexAbiInfo* vbi,
   11892                          Prefix pfx, Long delta, Int sz )
   11893 {
   11894    /* See comment in dis_XSAVE about the significance of REX.W. */
   11895    IRTemp addr  = IRTemp_INVALID;
   11896    Int    alen  = 0;
   11897    HChar  dis_buf[50];
   11898    UChar  modrm = getUChar(delta);
   11899    vassert(!epartIsReg(modrm)); /* ensured by caller */
   11900    vassert(sz == 4 || sz == 8); /* ditto */
   11901 
   11902    addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11903    delta += alen;
   11904    gen_SEGV_if_not_16_aligned(addr);
   11905 
   11906    DIP("%sfxsave %s\n", sz==8 ? "rex64/" : "", dis_buf);
   11907 
   11908    /* FXSAVE is just XSAVE with components 0 and 1 selected.  Set rfbm
   11909       to 0b011, generate the XSAVE sequence accordingly, and let iropt
   11910       fold out the unused (AVX) parts accordingly. */
   11911    IRTemp rfbm = newTemp(Ity_I64);
   11912    assign(rfbm, mkU64(3));
   11913    gen_XSAVE_SEQUENCE(addr, rfbm);
   11914 
   11915    return delta;
   11916 }
   11917 
   11918 
static void gen_XRSTOR_SEQUENCE ( IRTemp addr, IRTemp xstate_bv, IRTemp rfbm )
{
   /* Emit IR that performs an XRSTOR of components 0 (x87), 1 (SSE)
      and 2 (AVX upper halves) from the memory area at |addr|.  For
      each component: rfbm[i] gates whether it is touched at all, and
      xstate_bv[i] (from the XSAVE header) selects between restoring
      from memory (bit set) and resetting to initial values (bit
      clear).  Both bitmaps are assumed already range-checked by the
      caller. */

   /* ------ rfbm[0] gates the x87 state ------ */

   /* If rfbm[0] == 1, we have to write the x87 state.  If
      xstate_bv[0] == 1, we will read it from the memory image, else
      we'll set it to initial values.  Doing this with a helper
      function and getting the definedness flow annotations correct is
      too difficult, so generate stupid but simple code: first set the
      registers to initial values, regardless of xstate_bv[0].  Then,
      conditionally restore from the memory image. */

   IRTemp rfbm_0       = newTemp(Ity_I64);
   IRTemp xstate_bv_0  = newTemp(Ity_I64);
   IRTemp restore_0    = newTemp(Ity_I64);
   assign(rfbm_0,      binop(Iop_And64, mkexpr(rfbm), mkU64(1)));
   assign(xstate_bv_0, binop(Iop_And64, mkexpr(xstate_bv), mkU64(1)));
   assign(restore_0,   binop(Iop_And64, mkexpr(rfbm_0), mkexpr(xstate_bv_0)));

   /* Unconditionally (w.r.t. xstate_bv) reset x87 state if rfbm[0]. */
   gen_FINIT_SEQUENCE( binop(Iop_CmpNE64, mkexpr(rfbm_0), mkU64(0)) );

   /* Uses dirty helper:
         void amd64g_do_XRSTOR_COMPONENT_0 ( VexGuestAMD64State*, ULong )
   */
   IRDirty* d0 = unsafeIRDirty_0_N (
                    0/*regparms*/,
                    "amd64g_dirtyhelper_XRSTOR_COMPONENT_0",
                    &amd64g_dirtyhelper_XRSTOR_COMPONENT_0,
                    mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
                 );
   d0->guard = binop(Iop_CmpNE64, mkexpr(restore_0), mkU64(0));

   /* Declare we're reading memory.  Really, bytes 24 through 31
      (MXCSR and MXCSR_MASK) aren't read, but we can't express more
      than 1 memory area here, so just mark the whole thing as
      read. */
   d0->mFx   = Ifx_Read;
   d0->mAddr = mkexpr(addr);
   d0->mSize = 160;

   /* declare we're writing guest state */
   d0->nFxState = 5;
   vex_bzero(&d0->fxState, sizeof(d0->fxState));

   d0->fxState[0].fx     = Ifx_Write;
   d0->fxState[0].offset = OFFB_FTOP;
   d0->fxState[0].size   = sizeof(UInt);

   d0->fxState[1].fx     = Ifx_Write;
   d0->fxState[1].offset = OFFB_FPREGS;
   d0->fxState[1].size   = 8 * sizeof(ULong);

   d0->fxState[2].fx     = Ifx_Write;
   d0->fxState[2].offset = OFFB_FPTAGS;
   d0->fxState[2].size   = 8 * sizeof(UChar);

   d0->fxState[3].fx     = Ifx_Write;
   d0->fxState[3].offset = OFFB_FPROUND;
   d0->fxState[3].size   = sizeof(ULong);

   d0->fxState[4].fx     = Ifx_Write;
   d0->fxState[4].offset = OFFB_FC3210;
   d0->fxState[4].size   = sizeof(ULong);

   stmt( IRStmt_Dirty(d0) );

   /* ------ rfbm[1] gates the SSE state ------ */

   /* Same scheme as component 0: first zero it out, and then possibly
      restore from the memory area. */
   IRTemp rfbm_1       = newTemp(Ity_I64);
   IRTemp xstate_bv_1  = newTemp(Ity_I64);
   IRTemp restore_1    = newTemp(Ity_I64);
   assign(rfbm_1,      binop(Iop_And64, mkexpr(rfbm), mkU64(2)));
   assign(xstate_bv_1, binop(Iop_And64, mkexpr(xstate_bv), mkU64(2)));
   assign(restore_1,   binop(Iop_And64, mkexpr(rfbm_1), mkexpr(xstate_bv_1)));
   IRExpr* rfbm_1e     = binop(Iop_CmpNE64, mkexpr(rfbm_1),    mkU64(0));
   IRExpr* restore_1e  = binop(Iop_CmpNE64, mkexpr(restore_1), mkU64(0));

   /* MXCSR (SSEROUND) is shared by components 1 and 2, hence the
      combined "1 or 2" conditions below. */
   IRTemp rfbm_1or2       = newTemp(Ity_I64);
   IRTemp xstate_bv_1or2  = newTemp(Ity_I64);
   IRTemp restore_1or2    = newTemp(Ity_I64);
   assign(rfbm_1or2,      binop(Iop_And64, mkexpr(rfbm), mkU64(6)));
   assign(xstate_bv_1or2, binop(Iop_And64, mkexpr(xstate_bv), mkU64(6)));
   assign(restore_1or2,   binop(Iop_And64, mkexpr(rfbm_1or2),
                                           mkexpr(xstate_bv_1or2)));
   IRExpr* rfbm_1or2e     = binop(Iop_CmpNE64, mkexpr(rfbm_1or2),    mkU64(0));
   IRExpr* restore_1or2e  = binop(Iop_CmpNE64, mkexpr(restore_1or2), mkU64(0));

   /* The areas in question are: SSEROUND, and the XMM register array. */
   putGuarded(OFFB_SSEROUND, rfbm_1or2e, mkU64(Irrm_NEAREST));

   UInt reg;
   for (reg = 0; reg < 16; reg++) {
      putGuarded(xmmGuestRegOffset(reg), rfbm_1e, mkV128(0));
   }

   /* And now possibly restore from MXCSR/MXCSR_MASK */
   /* Uses dirty helper:
         void amd64g_do_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS
                 ( VexGuestAMD64State*, ULong )
      This restores from only MXCSR and MXCSR_MASK.  We need to do
      this if either components 1 (SSE) or 2 (AVX) are requested.
      Hence the guard condition is a bit more complex.
   */
   IRDirty* d1 = unsafeIRDirty_0_N (
                    0/*regparms*/,
                    "amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS",
                    &amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS,
                    mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
                ) ;
   d1->guard = restore_1or2e;

   /* Declare we're reading memory: MXCSR and MXCSR_MASK.  Note that
      the code for rfbm[0] just above claims a read of 0 .. 159, so
      this duplicates it.  But at least correctly connects 24 .. 31 to
      the MXCSR guest state representation (SSEROUND field). */
   d1->mFx   = Ifx_Read;
   d1->mAddr = binop(Iop_Add64, mkexpr(addr), mkU64(24));
   d1->mSize = 8;

   /* declare we're writing guest state */
   d1->nFxState = 1;
   vex_bzero(&d1->fxState, sizeof(d1->fxState));

   d1->fxState[0].fx     = Ifx_Write;
   d1->fxState[0].offset = OFFB_SSEROUND;
   d1->fxState[0].size   = sizeof(ULong);

   /* Call the helper.  This creates SSEROUND but nothing
      else.  We do the actual register array, XMM[0..15], separately,
      in order that any undefinedness in the XMM registers is tracked
      separately by Memcheck and is not "infected" by the in-memory
      shadow for the other parts of the image. */
   stmt( IRStmt_Dirty(d1) );

   /* And now the XMMs themselves.  For each register, we PUT either
      its old value, or the value loaded from memory.  One convenient
      way to do that is with a conditional load that has as its
      default value the old value of the register. */
   for (reg = 0; reg < 16; reg++) {
      IRExpr* ea  = binop(Iop_Add64, mkexpr(addr), mkU64(160 + reg * 16));
      IRExpr* alt = getXMMReg(reg);
      IRTemp  loadedValue = newTemp(Ity_V128);
      stmt( IRStmt_LoadG(Iend_LE,
                         ILGop_IdentV128,
                         loadedValue, ea, alt, restore_1e) );
      putXMMReg(reg, mkexpr(loadedValue));
   }

   /* ------ rfbm[2] gates the AVX state ------ */
   /* Component 2 is just a bunch of register loads, so we'll do it
      inline, just to be simple and to be Memcheck friendly. */

   /* Same scheme as component 0: first zero it out, and then possibly
      restore from the memory area. */
   IRTemp rfbm_2      = newTemp(Ity_I64);
   IRTemp xstate_bv_2 = newTemp(Ity_I64);
   IRTemp restore_2   = newTemp(Ity_I64);
   assign(rfbm_2,      binop(Iop_And64, mkexpr(rfbm), mkU64(4)));
   assign(xstate_bv_2, binop(Iop_And64, mkexpr(xstate_bv), mkU64(4)));
   assign(restore_2,   binop(Iop_And64, mkexpr(rfbm_2), mkexpr(xstate_bv_2)));

   IRExpr* rfbm_2e    = binop(Iop_CmpNE64, mkexpr(rfbm_2),    mkU64(0));
   IRExpr* restore_2e = binop(Iop_CmpNE64, mkexpr(restore_2), mkU64(0));

   for (reg = 0; reg < 16; reg++) {
      putGuarded(ymmGuestRegLane128offset(reg, 1), rfbm_2e, mkV128(0));
   }

   /* Guarded loads of the YMM upper halves from offsets 576 .. 831,
      defaulting to the (just-zeroed) old values. */
   for (reg = 0; reg < 16; reg++) {
      IRExpr* ea  = binop(Iop_Add64, mkexpr(addr), mkU64(576 + reg * 16));
      IRExpr* alt = getYMMRegLane128(reg, 1);
      IRTemp  loadedValue = newTemp(Ity_V128);
      stmt( IRStmt_LoadG(Iend_LE,
                         ILGop_IdentV128,
                         loadedValue, ea, alt, restore_2e) );
      putYMMRegLane128(reg, 1, mkexpr(loadedValue));
   }
}
   12099 
   12100 
static Long dis_XRSTOR ( const VexAbiInfo* vbi,
                         Prefix pfx, Long delta, Int sz )
{
   /* XRSTOR.  As with XSAVE above we ignore the value of REX.W since
      we're not bothering with the FPU DP and IP fields. */
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   vassert(!epartIsReg(modrm)); /* ensured by caller */
   vassert(sz == 4 || sz == 8); /* ditto */

   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   delta += alen;
   gen_SEGV_if_not_64_aligned(addr);

   DIP("%sxrstor %s\n", sz==8 ? "rex64/" : "", dis_buf);

   /* VEX's caller is assumed to have checked this. */
   const ULong aSSUMED_XCR0_VALUE = 7;

   /* rfbm = (RDX:RAX) & XCR0 -- the requested-feature bitmap. */
   IRTemp rfbm = newTemp(Ity_I64);
   assign(rfbm,
          binop(Iop_And64,
                binop(Iop_Or64,
                      binop(Iop_Shl64,
                            unop(Iop_32Uto64, getIRegRDX(4)), mkU8(32)),
                      unop(Iop_32Uto64, getIRegRAX(4))),
                mkU64(aSSUMED_XCR0_VALUE)));

   /* XSTATE_BV: header bytes 7 .. 0 at offset 512. */
   IRTemp xstate_bv = newTemp(Ity_I64);
   assign(xstate_bv, loadLE(Ity_I64,
                            binop(Iop_Add64, mkexpr(addr), mkU64(512+0))));

   /* XCOMP_BV: header bytes 15 .. 8. */
   IRTemp xcomp_bv = newTemp(Ity_I64);
   assign(xcomp_bv, loadLE(Ity_I64,
                           binop(Iop_Add64, mkexpr(addr), mkU64(512+8))));

   /* Header bytes 23 .. 16. */
   IRTemp xsavehdr_23_16 = newTemp(Ity_I64);
   assign( xsavehdr_23_16,
           loadLE(Ity_I64,
                  binop(Iop_Add64, mkexpr(addr), mkU64(512+16))));

   /* We must fault if
      * xcomp_bv[63] == 1, since this simulated CPU does not support
        the compaction extension.
      * xstate_bv sets a bit outside of XCR0 (which we assume to be 7).
      * any of the xsave header bytes 23 .. 8 are nonzero.  This seems to
        imply that xcomp_bv must be zero.
      xcomp_bv is header bytes 15 .. 8 and xstate_bv is header bytes 7 .. 0
   */
   IRTemp fault_if_nonzero = newTemp(Ity_I64);
   assign(fault_if_nonzero,
          binop(Iop_Or64,
                binop(Iop_And64, mkexpr(xstate_bv), mkU64(~aSSUMED_XCR0_VALUE)),
                binop(Iop_Or64, mkexpr(xcomp_bv), mkexpr(xsavehdr_23_16))));
   /* Side-exit with SIGSEGV at the current insn if any disallowed bit
      is set; this must happen before any state is restored. */
   stmt( IRStmt_Exit(binop(Iop_CmpNE64, mkexpr(fault_if_nonzero), mkU64(0)),
                     Ijk_SigSEGV,
                     IRConst_U64(guest_RIP_curr_instr),
                     OFFB_RIP
   ));

   /* We are guaranteed now that both xstate_bv and rfbm are in the
      range 0 .. 7.  Generate the restore sequence proper. */
   gen_XRSTOR_SEQUENCE(addr, xstate_bv, rfbm);

   return delta;
}
   12169 
   12170 
   12171 static Long dis_FXRSTOR ( const VexAbiInfo* vbi,
   12172                           Prefix pfx, Long delta, Int sz )
   12173 {
   12174    /* As with FXSAVE above we ignore the value of REX.W since we're
   12175       not bothering with the FPU DP and IP fields. */
   12176    IRTemp addr  = IRTemp_INVALID;
   12177    Int    alen  = 0;
   12178    HChar  dis_buf[50];
   12179    UChar  modrm = getUChar(delta);
   12180    vassert(!epartIsReg(modrm)); /* ensured by caller */
   12181    vassert(sz == 4 || sz == 8); /* ditto */
   12182 
   12183    addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12184    delta += alen;
   12185    gen_SEGV_if_not_16_aligned(addr);
   12186 
   12187    DIP("%sfxrstor %s\n", sz==8 ? "rex64/" : "", dis_buf);
   12188 
   12189    /* FXRSTOR is just XRSTOR with components 0 and 1 selected and also
   12190       as if components 0 and 1 are set as present in XSTATE_BV in the
   12191       XSAVE header.  Set both rfbm and xstate_bv to 0b011 therefore,
   12192       generate the XRSTOR sequence accordingly, and let iropt fold out
   12193       the unused (AVX) parts accordingly. */
   12194    IRTemp three = newTemp(Ity_I64);
   12195    assign(three, mkU64(3));
   12196    gen_XRSTOR_SEQUENCE(addr, three/*xstate_bv*/, three/*rfbm*/);
   12197 
   12198    return delta;
   12199 }
   12200 
   12201 
   12202 static IRTemp math_PINSRW_128 ( IRTemp v128, IRTemp u16, UInt imm8 )
   12203 {
   12204    vassert(imm8 >= 0 && imm8 <= 7);
   12205 
   12206    // Create a V128 value which has the selected word in the
   12207    // specified lane, and zeroes everywhere else.
   12208    IRTemp tmp128    = newTemp(Ity_V128);
   12209    IRTemp halfshift = newTemp(Ity_I64);
   12210    assign(halfshift, binop(Iop_Shl64,
   12211                            unop(Iop_16Uto64, mkexpr(u16)),
   12212                            mkU8(16 * (imm8 & 3))));
   12213    if (imm8 < 4) {
   12214       assign(tmp128, binop(Iop_64HLtoV128, mkU64(0), mkexpr(halfshift)));
   12215    } else {
   12216       assign(tmp128, binop(Iop_64HLtoV128, mkexpr(halfshift), mkU64(0)));
   12217    }
   12218 
   12219    UShort mask = ~(3 << (imm8 * 2));
   12220    IRTemp res  = newTemp(Ity_V128);
   12221    assign( res, binop(Iop_OrV128,
   12222                       mkexpr(tmp128),
   12223                       binop(Iop_AndV128, mkexpr(v128), mkV128(mask))) );
   12224    return res;
   12225 }
   12226 
   12227 
   12228 static IRTemp math_PSADBW_128 ( IRTemp dV, IRTemp sV )
   12229 {
   12230    IRTemp s1, s0, d1, d0;
   12231    s1 = s0 = d1 = d0 = IRTemp_INVALID;
   12232 
   12233    breakupV128to64s( sV, &s1, &s0 );
   12234    breakupV128to64s( dV, &d1, &d0 );
   12235 
   12236    IRTemp res = newTemp(Ity_V128);
   12237    assign( res,
   12238            binop(Iop_64HLtoV128,
   12239                  mkIRExprCCall(Ity_I64, 0/*regparms*/,
   12240                                "amd64g_calculate_mmx_psadbw",
   12241                                &amd64g_calculate_mmx_psadbw,
   12242                                mkIRExprVec_2( mkexpr(s1), mkexpr(d1))),
   12243                  mkIRExprCCall(Ity_I64, 0/*regparms*/,
   12244                                "amd64g_calculate_mmx_psadbw",
   12245                                &amd64g_calculate_mmx_psadbw,
   12246                                mkIRExprVec_2( mkexpr(s0), mkexpr(d0)))) );
   12247    return res;
   12248 }
   12249 
   12250 
   12251 static IRTemp math_PSADBW_256 ( IRTemp dV, IRTemp sV )
   12252 {
   12253    IRTemp sHi, sLo, dHi, dLo;
   12254    sHi = sLo = dHi = dLo = IRTemp_INVALID;
   12255    breakupV256toV128s( dV, &dHi, &dLo);
   12256    breakupV256toV128s( sV, &sHi, &sLo);
   12257    IRTemp res = newTemp(Ity_V256);
   12258    assign(res, binop(Iop_V128HLtoV256,
   12259                      mkexpr(math_PSADBW_128(dHi, sHi)),
   12260                      mkexpr(math_PSADBW_128(dLo, sLo))));
   12261    return res;
   12262 }
   12263 
   12264 
   12265 static Long dis_MASKMOVDQU ( const VexAbiInfo* vbi, Prefix pfx,
   12266                              Long delta, Bool isAvx )
   12267 {
   12268    IRTemp regD    = newTemp(Ity_V128);
   12269    IRTemp mask    = newTemp(Ity_V128);
   12270    IRTemp olddata = newTemp(Ity_V128);
   12271    IRTemp newdata = newTemp(Ity_V128);
   12272    IRTemp addr    = newTemp(Ity_I64);
   12273    UChar  modrm   = getUChar(delta);
   12274    UInt   rG      = gregOfRexRM(pfx,modrm);
   12275    UInt   rE      = eregOfRexRM(pfx,modrm);
   12276 
   12277    assign( addr, handleAddrOverrides( vbi, pfx, getIReg64(R_RDI) ));
   12278    assign( regD, getXMMReg( rG ));
   12279 
   12280    /* Unfortunately can't do the obvious thing with SarN8x16
   12281       here since that can't be re-emitted as SSE2 code - no such
   12282       insn. */
   12283    assign( mask,
   12284            binop(Iop_64HLtoV128,
   12285                  binop(Iop_SarN8x8,
   12286                        getXMMRegLane64( eregOfRexRM(pfx,modrm), 1 ),
   12287                        mkU8(7) ),
   12288                  binop(Iop_SarN8x8,
   12289                        getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ),
   12290                        mkU8(7) ) ));
   12291    assign( olddata, loadLE( Ity_V128, mkexpr(addr) ));
   12292    assign( newdata, binop(Iop_OrV128,
   12293                           binop(Iop_AndV128,
   12294                                 mkexpr(regD),
   12295                                 mkexpr(mask) ),
   12296                           binop(Iop_AndV128,
   12297                                 mkexpr(olddata),
   12298                                 unop(Iop_NotV128, mkexpr(mask)))) );
   12299    storeLE( mkexpr(addr), mkexpr(newdata) );
   12300 
   12301    delta += 1;
   12302    DIP("%smaskmovdqu %s,%s\n", isAvx ? "v" : "",
   12303        nameXMMReg(rE), nameXMMReg(rG) );
   12304    return delta;
   12305 }
   12306 
   12307 
   12308 static Long dis_MOVMSKPS_128 ( const VexAbiInfo* vbi, Prefix pfx,
   12309                                Long delta, Bool isAvx )
   12310 {
   12311    UChar modrm = getUChar(delta);
   12312    UInt   rG   = gregOfRexRM(pfx,modrm);
   12313    UInt   rE   = eregOfRexRM(pfx,modrm);
   12314    IRTemp t0   = newTemp(Ity_I32);
   12315    IRTemp t1   = newTemp(Ity_I32);
   12316    IRTemp t2   = newTemp(Ity_I32);
   12317    IRTemp t3   = newTemp(Ity_I32);
   12318    delta += 1;
   12319    assign( t0, binop( Iop_And32,
   12320                       binop(Iop_Shr32, getXMMRegLane32(rE,0), mkU8(31)),
   12321                       mkU32(1) ));
   12322    assign( t1, binop( Iop_And32,
   12323                       binop(Iop_Shr32, getXMMRegLane32(rE,1), mkU8(30)),
   12324                       mkU32(2) ));
   12325    assign( t2, binop( Iop_And32,
   12326                       binop(Iop_Shr32, getXMMRegLane32(rE,2), mkU8(29)),
   12327                       mkU32(4) ));
   12328    assign( t3, binop( Iop_And32,
   12329                       binop(Iop_Shr32, getXMMRegLane32(rE,3), mkU8(28)),
   12330                       mkU32(8) ));
   12331    putIReg32( rG, binop(Iop_Or32,
   12332                         binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
   12333                         binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ) );
   12334    DIP("%smovmskps %s,%s\n", isAvx ? "v" : "",
   12335        nameXMMReg(rE), nameIReg32(rG));
   12336    return delta;
   12337 }
   12338 
   12339 
   12340 static Long dis_MOVMSKPS_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta )
   12341 {
   12342    UChar modrm = getUChar(delta);
   12343    UInt   rG   = gregOfRexRM(pfx,modrm);
   12344    UInt   rE   = eregOfRexRM(pfx,modrm);
   12345    IRTemp t0   = newTemp(Ity_I32);
   12346    IRTemp t1   = newTemp(Ity_I32);
   12347    IRTemp t2   = newTemp(Ity_I32);
   12348    IRTemp t3   = newTemp(Ity_I32);
   12349    IRTemp t4   = newTemp(Ity_I32);
   12350    IRTemp t5   = newTemp(Ity_I32);
   12351    IRTemp t6   = newTemp(Ity_I32);
   12352    IRTemp t7   = newTemp(Ity_I32);
   12353    delta += 1;
   12354    assign( t0, binop( Iop_And32,
   12355                       binop(Iop_Shr32, getYMMRegLane32(rE,0), mkU8(31)),
   12356                       mkU32(1) ));
   12357    assign( t1, binop( Iop_And32,
   12358                       binop(Iop_Shr32, getYMMRegLane32(rE,1), mkU8(30)),
   12359                       mkU32(2) ));
   12360    assign( t2, binop( Iop_And32,
   12361                       binop(Iop_Shr32, getYMMRegLane32(rE,2), mkU8(29)),
   12362                       mkU32(4) ));
   12363    assign( t3, binop( Iop_And32,
   12364                       binop(Iop_Shr32, getYMMRegLane32(rE,3), mkU8(28)),
   12365                       mkU32(8) ));
   12366    assign( t4, binop( Iop_And32,
   12367                       binop(Iop_Shr32, getYMMRegLane32(rE,4), mkU8(27)),
   12368                       mkU32(16) ));
   12369    assign( t5, binop( Iop_And32,
   12370                       binop(Iop_Shr32, getYMMRegLane32(rE,5), mkU8(26)),
   12371                       mkU32(32) ));
   12372    assign( t6, binop( Iop_And32,
   12373                       binop(Iop_Shr32, getYMMRegLane32(rE,6), mkU8(25)),
   12374                       mkU32(64) ));
   12375    assign( t7, binop( Iop_And32,
   12376                       binop(Iop_Shr32, getYMMRegLane32(rE,7), mkU8(24)),
   12377                       mkU32(128) ));
   12378    putIReg32( rG, binop(Iop_Or32,
   12379                         binop(Iop_Or32,
   12380                               binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
   12381                               binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ),
   12382                         binop(Iop_Or32,
   12383                               binop(Iop_Or32, mkexpr(t4), mkexpr(t5)),
   12384                               binop(Iop_Or32, mkexpr(t6), mkexpr(t7)) ) ) );
   12385    DIP("vmovmskps %s,%s\n", nameYMMReg(rE), nameIReg32(rG));
   12386    return delta;
   12387 }
   12388 
   12389 
   12390 static Long dis_MOVMSKPD_128 ( const VexAbiInfo* vbi, Prefix pfx,
   12391                                Long delta, Bool isAvx )
   12392 {
   12393    UChar modrm = getUChar(delta);
   12394    UInt   rG   = gregOfRexRM(pfx,modrm);
   12395    UInt   rE   = eregOfRexRM(pfx,modrm);
   12396    IRTemp t0   = newTemp(Ity_I32);
   12397    IRTemp t1   = newTemp(Ity_I32);
   12398    delta += 1;
   12399    assign( t0, binop( Iop_And32,
   12400                       binop(Iop_Shr32, getXMMRegLane32(rE,1), mkU8(31)),
   12401                       mkU32(1) ));
   12402    assign( t1, binop( Iop_And32,
   12403                       binop(Iop_Shr32, getXMMRegLane32(rE,3), mkU8(30)),
   12404                       mkU32(2) ));
   12405    putIReg32( rG, binop(Iop_Or32, mkexpr(t0), mkexpr(t1) ) );
   12406    DIP("%smovmskpd %s,%s\n", isAvx ? "v" : "",
   12407        nameXMMReg(rE), nameIReg32(rG));
   12408    return delta;
   12409 }
   12410 
   12411 
   12412 static Long dis_MOVMSKPD_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta )
   12413 {
   12414    UChar modrm = getUChar(delta);
   12415    UInt   rG   = gregOfRexRM(pfx,modrm);
   12416    UInt   rE   = eregOfRexRM(pfx,modrm);
   12417    IRTemp t0   = newTemp(Ity_I32);
   12418    IRTemp t1   = newTemp(Ity_I32);
   12419    IRTemp t2   = newTemp(Ity_I32);
   12420    IRTemp t3   = newTemp(Ity_I32);
   12421    delta += 1;
   12422    assign( t0, binop( Iop_And32,
   12423                       binop(Iop_Shr32, getYMMRegLane32(rE,1), mkU8(31)),
   12424                       mkU32(1) ));
   12425    assign( t1, binop( Iop_And32,
   12426                       binop(Iop_Shr32, getYMMRegLane32(rE,3), mkU8(30)),
   12427                       mkU32(2) ));
   12428    assign( t2, binop( Iop_And32,
   12429                       binop(Iop_Shr32, getYMMRegLane32(rE,5), mkU8(29)),
   12430                       mkU32(4) ));
   12431    assign( t3, binop( Iop_And32,
   12432                       binop(Iop_Shr32, getYMMRegLane32(rE,7), mkU8(28)),
   12433                       mkU32(8) ));
   12434    putIReg32( rG, binop(Iop_Or32,
   12435                         binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
   12436                         binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ) );
   12437    DIP("vmovmskps %s,%s\n", nameYMMReg(rE), nameIReg32(rG));
   12438    return delta;
   12439 }
   12440 
   12441 
   12442 /* Note, this also handles SSE(1) insns. */
   12443 __attribute__((noinline))
   12444 static
   12445 Long dis_ESC_0F__SSE2 ( Bool* decode_OK,
   12446                         const VexArchInfo* archinfo,
   12447                         const VexAbiInfo* vbi,
   12448                         Prefix pfx, Int sz, Long deltaIN,
   12449                         DisResult* dres )
   12450 {
   12451    IRTemp addr  = IRTemp_INVALID;
   12452    IRTemp t0    = IRTemp_INVALID;
   12453    IRTemp t1    = IRTemp_INVALID;
   12454    IRTemp t2    = IRTemp_INVALID;
   12455    IRTemp t3    = IRTemp_INVALID;
   12456    IRTemp t4    = IRTemp_INVALID;
   12457    IRTemp t5    = IRTemp_INVALID;
   12458    IRTemp t6    = IRTemp_INVALID;
   12459    UChar  modrm = 0;
   12460    Int    alen  = 0;
   12461    HChar  dis_buf[50];
   12462 
   12463    *decode_OK = False;
   12464 
   12465    Long   delta = deltaIN;
   12466    UChar  opc   = getUChar(delta);
   12467    delta++;
   12468    switch (opc) {
   12469 
   12470    case 0x10:
   12471       if (have66noF2noF3(pfx)
   12472           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   12473          /* 66 0F 10 = MOVUPD -- move from E (mem or xmm) to G (xmm). */
   12474          modrm = getUChar(delta);
   12475          if (epartIsReg(modrm)) {
   12476             putXMMReg( gregOfRexRM(pfx,modrm),
   12477                        getXMMReg( eregOfRexRM(pfx,modrm) ));
   12478             DIP("movupd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12479                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   12480             delta += 1;
   12481          } else {
   12482             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12483             putXMMReg( gregOfRexRM(pfx,modrm),
   12484                        loadLE(Ity_V128, mkexpr(addr)) );
   12485             DIP("movupd %s,%s\n", dis_buf,
   12486                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   12487             delta += alen;
   12488          }
   12489          goto decode_success;
   12490       }
   12491       /* F2 0F 10 = MOVSD -- move 64 bits from E (mem or lo half xmm) to
   12492          G (lo half xmm).  If E is mem, upper half of G is zeroed out.
   12493          If E is reg, upper half of G is unchanged. */
   12494       if (haveF2no66noF3(pfx)
   12495           && (sz == 4 || /* ignore redundant REX.W */ sz == 8) ) {
   12496          modrm = getUChar(delta);
   12497          if (epartIsReg(modrm)) {
   12498             putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
   12499                              getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
   12500             DIP("movsd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12501                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   12502             delta += 1;
   12503          } else {
   12504             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12505             putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
   12506             putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
   12507                              loadLE(Ity_I64, mkexpr(addr)) );
   12508             DIP("movsd %s,%s\n", dis_buf,
   12509                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   12510             delta += alen;
   12511          }
   12512          goto decode_success;
   12513       }
   12514       /* F3 0F 10 = MOVSS -- move 32 bits from E (mem or lo 1/4 xmm) to G
   12515          (lo 1/4 xmm).  If E is mem, upper 3/4 of G is zeroed out. */
   12516       if (haveF3no66noF2(pfx)
   12517           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12518          modrm = getUChar(delta);
   12519          if (epartIsReg(modrm)) {
   12520             putXMMRegLane32( gregOfRexRM(pfx,modrm), 0,
   12521                              getXMMRegLane32( eregOfRexRM(pfx,modrm), 0 ));
   12522             DIP("movss %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12523                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   12524             delta += 1;
   12525          } else {
   12526             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12527             putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
   12528             putXMMRegLane32( gregOfRexRM(pfx,modrm), 0,
   12529                              loadLE(Ity_I32, mkexpr(addr)) );
   12530             DIP("movss %s,%s\n", dis_buf,
   12531                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   12532             delta += alen;
   12533          }
   12534          goto decode_success;
   12535       }
   12536       /* 0F 10 = MOVUPS -- move from E (mem or xmm) to G (xmm). */
   12537       if (haveNo66noF2noF3(pfx)
   12538           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12539          modrm = getUChar(delta);
   12540          if (epartIsReg(modrm)) {
   12541             putXMMReg( gregOfRexRM(pfx,modrm),
   12542                        getXMMReg( eregOfRexRM(pfx,modrm) ));
   12543             DIP("movups %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12544                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   12545             delta += 1;
   12546          } else {
   12547             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12548             putXMMReg( gregOfRexRM(pfx,modrm),
   12549                        loadLE(Ity_V128, mkexpr(addr)) );
   12550             DIP("movups %s,%s\n", dis_buf,
   12551                                      nameXMMReg(gregOfRexRM(pfx,modrm)));
   12552             delta += alen;
   12553          }
   12554          goto decode_success;
   12555       }
   12556       break;
   12557 
   12558    case 0x11:
   12559       /* F2 0F 11 = MOVSD -- move 64 bits from G (lo half xmm) to E (mem
   12560          or lo half xmm). */
   12561       if (haveF2no66noF3(pfx)
   12562           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12563          modrm = getUChar(delta);
   12564          if (epartIsReg(modrm)) {
   12565             putXMMRegLane64( eregOfRexRM(pfx,modrm), 0,
   12566                              getXMMRegLane64( gregOfRexRM(pfx,modrm), 0 ));
   12567             DIP("movsd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   12568                                  nameXMMReg(eregOfRexRM(pfx,modrm)));
   12569             delta += 1;
   12570          } else {
   12571             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12572             storeLE( mkexpr(addr),
   12573                      getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) );
   12574             DIP("movsd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   12575                                  dis_buf);
   12576             delta += alen;
   12577          }
   12578          goto decode_success;
   12579       }
   12580       /* F3 0F 11 = MOVSS -- move 32 bits from G (lo 1/4 xmm) to E (mem
   12581          or lo 1/4 xmm). */
   12582       if (haveF3no66noF2(pfx) && sz == 4) {
   12583          modrm = getUChar(delta);
   12584          if (epartIsReg(modrm)) {
   12585             /* fall through, we don't yet have a test case */
   12586          } else {
   12587             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12588             storeLE( mkexpr(addr),
   12589                      getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) );
   12590             DIP("movss %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   12591                                  dis_buf);
   12592             delta += alen;
   12593             goto decode_success;
   12594          }
   12595       }
   12596       /* 66 0F 11 = MOVUPD -- move from G (xmm) to E (mem or xmm). */
   12597       if (have66noF2noF3(pfx)
   12598           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   12599          modrm = getUChar(delta);
   12600          if (epartIsReg(modrm)) {
   12601             putXMMReg( eregOfRexRM(pfx,modrm),
   12602                        getXMMReg( gregOfRexRM(pfx,modrm) ) );
   12603             DIP("movupd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   12604                                   nameXMMReg(eregOfRexRM(pfx,modrm)));
   12605             delta += 1;
   12606          } else {
   12607             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12608             storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   12609             DIP("movupd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   12610                                   dis_buf );
   12611             delta += alen;
   12612          }
   12613          goto decode_success;
   12614       }
   12615       /* 0F 11 = MOVUPS -- move from G (xmm) to E (mem or xmm). */
   12616       if (haveNo66noF2noF3(pfx)
   12617           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12618          modrm = getUChar(delta);
   12619          if (epartIsReg(modrm)) {
   12620             /* fall through; awaiting test case */
   12621          } else {
   12622             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12623             storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   12624             DIP("movups %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   12625                                   dis_buf );
   12626             delta += alen;
   12627             goto decode_success;
   12628          }
   12629       }
   12630       break;
   12631 
   12632    case 0x12:
   12633       /* 66 0F 12 = MOVLPD -- move from mem to low half of XMM. */
   12634       /* Identical to MOVLPS ? */
   12635       if (have66noF2noF3(pfx)
   12636           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   12637          modrm = getUChar(delta);
   12638          if (epartIsReg(modrm)) {
   12639             /* fall through; apparently reg-reg is not possible */
   12640          } else {
   12641             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12642             delta += alen;
   12643             putXMMRegLane64( gregOfRexRM(pfx,modrm),
   12644                              0/*lower lane*/,
   12645                              loadLE(Ity_I64, mkexpr(addr)) );
   12646             DIP("movlpd %s, %s\n",
   12647                 dis_buf, nameXMMReg( gregOfRexRM(pfx,modrm) ));
   12648             goto decode_success;
   12649          }
   12650       }
   12651       /* 0F 12 = MOVLPS -- move from mem to low half of XMM. */
   12652       /* OF 12 = MOVHLPS -- from from hi half to lo half of XMM. */
   12653       if (haveNo66noF2noF3(pfx)
   12654           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12655          modrm = getUChar(delta);
   12656          if (epartIsReg(modrm)) {
   12657             delta += 1;
   12658             putXMMRegLane64( gregOfRexRM(pfx,modrm),
   12659                              0/*lower lane*/,
   12660                              getXMMRegLane64( eregOfRexRM(pfx,modrm), 1 ));
   12661             DIP("movhlps %s, %s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12662                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   12663          } else {
   12664             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12665             delta += alen;
   12666             putXMMRegLane64( gregOfRexRM(pfx,modrm),  0/*lower lane*/,
   12667                              loadLE(Ity_I64, mkexpr(addr)) );
   12668             DIP("movlps %s, %s\n",
   12669                 dis_buf, nameXMMReg( gregOfRexRM(pfx,modrm) ));
   12670          }
   12671          goto decode_success;
   12672       }
   12673       break;
   12674 
   12675    case 0x13:
   12676       /* 0F 13 = MOVLPS -- move from low half of XMM to mem. */
   12677       if (haveNo66noF2noF3(pfx)
   12678           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12679          modrm = getUChar(delta);
   12680          if (!epartIsReg(modrm)) {
   12681             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12682             delta += alen;
   12683             storeLE( mkexpr(addr),
   12684                      getXMMRegLane64( gregOfRexRM(pfx,modrm),
   12685                                       0/*lower lane*/ ) );
   12686             DIP("movlps %s, %s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
   12687                                    dis_buf);
   12688             goto decode_success;
   12689          }
   12690          /* else fall through */
   12691       }
   12692       /* 66 0F 13 = MOVLPD -- move from low half of XMM to mem. */
   12693       /* Identical to MOVLPS ? */
   12694       if (have66noF2noF3(pfx)
   12695           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   12696          modrm = getUChar(delta);
   12697          if (!epartIsReg(modrm)) {
   12698             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12699             delta += alen;
   12700             storeLE( mkexpr(addr),
   12701                      getXMMRegLane64( gregOfRexRM(pfx,modrm),
   12702                                       0/*lower lane*/ ) );
   12703             DIP("movlpd %s, %s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
   12704                                    dis_buf);
   12705             goto decode_success;
   12706          }
   12707          /* else fall through */
   12708       }
   12709       break;
   12710 
   12711    case 0x14:
   12712    case 0x15:
   12713       /* 0F 14 = UNPCKLPS -- unpack and interleave low part F32s */
   12714       /* 0F 15 = UNPCKHPS -- unpack and interleave high part F32s */
   12715       /* These just appear to be special cases of SHUFPS */
   12716       if (haveNo66noF2noF3(pfx) && sz == 4) {
   12717          Bool   hi = toBool(opc == 0x15);
   12718          IRTemp sV = newTemp(Ity_V128);
   12719          IRTemp dV = newTemp(Ity_V128);
   12720          modrm = getUChar(delta);
   12721          UInt   rG = gregOfRexRM(pfx,modrm);
   12722          assign( dV, getXMMReg(rG) );
   12723          if (epartIsReg(modrm)) {
   12724             UInt rE = eregOfRexRM(pfx,modrm);
   12725             assign( sV, getXMMReg(rE) );
   12726             delta += 1;
   12727             DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   12728                 nameXMMReg(rE), nameXMMReg(rG));
   12729          } else {
   12730             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12731             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12732             delta += alen;
   12733             DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   12734                 dis_buf, nameXMMReg(rG));
   12735          }
   12736          IRTemp res = math_UNPCKxPS_128( sV, dV, hi );
   12737          putXMMReg( rG, mkexpr(res) );
   12738          goto decode_success;
   12739       }
   12740       /* 66 0F 15 = UNPCKHPD -- unpack and interleave high part F64s */
   12741       /* 66 0F 14 = UNPCKLPD -- unpack and interleave low part F64s */
   12742       /* These just appear to be special cases of SHUFPS */
   12743       if (have66noF2noF3(pfx)
   12744           && sz == 2 /* could be 8 if rex also present */) {
   12745          Bool   hi = toBool(opc == 0x15);
   12746          IRTemp sV = newTemp(Ity_V128);
   12747          IRTemp dV = newTemp(Ity_V128);
   12748          modrm = getUChar(delta);
   12749          UInt   rG = gregOfRexRM(pfx,modrm);
   12750          assign( dV, getXMMReg(rG) );
   12751          if (epartIsReg(modrm)) {
   12752             UInt rE = eregOfRexRM(pfx,modrm);
   12753             assign( sV, getXMMReg(rE) );
   12754             delta += 1;
   12755             DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   12756                 nameXMMReg(rE), nameXMMReg(rG));
   12757          } else {
   12758             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12759             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12760             delta += alen;
   12761             DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   12762                 dis_buf, nameXMMReg(rG));
   12763          }
   12764          IRTemp res = math_UNPCKxPD_128( sV, dV, hi );
   12765          putXMMReg( rG, mkexpr(res) );
   12766          goto decode_success;
   12767       }
   12768       break;
   12769 
   12770    case 0x16:
   12771       /* 66 0F 16 = MOVHPD -- move from mem to high half of XMM. */
   12772       /* These seems identical to MOVHPS.  This instruction encoding is
   12773          completely crazy. */
   12774       if (have66noF2noF3(pfx)
   12775           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   12776          modrm = getUChar(delta);
   12777          if (epartIsReg(modrm)) {
   12778             /* fall through; apparently reg-reg is not possible */
   12779          } else {
   12780             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12781             delta += alen;
   12782             putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
   12783                              loadLE(Ity_I64, mkexpr(addr)) );
   12784             DIP("movhpd %s,%s\n", dis_buf,
   12785                                   nameXMMReg( gregOfRexRM(pfx,modrm) ));
   12786             goto decode_success;
   12787          }
   12788       }
   12789       /* 0F 16 = MOVHPS -- move from mem to high half of XMM. */
   12790       /* 0F 16 = MOVLHPS -- move from lo half to hi half of XMM. */
   12791       if (haveNo66noF2noF3(pfx)
   12792           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12793          modrm = getUChar(delta);
   12794          if (epartIsReg(modrm)) {
   12795             delta += 1;
   12796             putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
   12797                              getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ) );
   12798             DIP("movhps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12799                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   12800          } else {
   12801             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12802             delta += alen;
   12803             putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
   12804                              loadLE(Ity_I64, mkexpr(addr)) );
   12805             DIP("movhps %s,%s\n", dis_buf,
   12806                                   nameXMMReg( gregOfRexRM(pfx,modrm) ));
   12807          }
   12808          goto decode_success;
   12809       }
   12810       break;
   12811 
   12812    case 0x17:
   12813       /* 0F 17 = MOVHPS -- move from high half of XMM to mem. */
   12814       if (haveNo66noF2noF3(pfx)
   12815           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12816          modrm = getUChar(delta);
   12817          if (!epartIsReg(modrm)) {
   12818             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12819             delta += alen;
   12820             storeLE( mkexpr(addr),
   12821                      getXMMRegLane64( gregOfRexRM(pfx,modrm),
   12822                                       1/*upper lane*/ ) );
   12823             DIP("movhps %s,%s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
   12824                                   dis_buf);
   12825             goto decode_success;
   12826          }
   12827          /* else fall through */
   12828       }
   12829       /* 66 0F 17 = MOVHPD -- move from high half of XMM to mem. */
   12830       /* Again, this seems identical to MOVHPS. */
   12831       if (have66noF2noF3(pfx)
   12832           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   12833          modrm = getUChar(delta);
   12834          if (!epartIsReg(modrm)) {
   12835             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12836             delta += alen;
   12837             storeLE( mkexpr(addr),
   12838                      getXMMRegLane64( gregOfRexRM(pfx,modrm),
   12839                                       1/*upper lane*/ ) );
   12840             DIP("movhpd %s,%s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
   12841                                   dis_buf);
   12842             goto decode_success;
   12843          }
   12844          /* else fall through */
   12845       }
   12846       break;
   12847 
   12848    case 0x18:
   12849       /* 0F 18 /0 = PREFETCHNTA -- prefetch into caches, */
   12850       /* 0F 18 /1 = PREFETCH0   -- with various different hints */
   12851       /* 0F 18 /2 = PREFETCH1 */
   12852       /* 0F 18 /3 = PREFETCH2 */
   12853       if (haveNo66noF2noF3(pfx)
   12854           && !epartIsReg(getUChar(delta))
   12855           && gregLO3ofRM(getUChar(delta)) >= 0
   12856           && gregLO3ofRM(getUChar(delta)) <= 3) {
   12857          const HChar* hintstr = "??";
   12858 
   12859          modrm = getUChar(delta);
   12860          vassert(!epartIsReg(modrm));
   12861 
   12862          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12863          delta += alen;
   12864 
   12865          switch (gregLO3ofRM(modrm)) {
   12866             case 0: hintstr = "nta"; break;
   12867             case 1: hintstr = "t0"; break;
   12868             case 2: hintstr = "t1"; break;
   12869             case 3: hintstr = "t2"; break;
   12870             default: vassert(0);
   12871          }
   12872 
   12873          DIP("prefetch%s %s\n", hintstr, dis_buf);
   12874          goto decode_success;
   12875       }
   12876       break;
   12877 
   case 0x28:
      /* Aligned 128-bit load forms: E (mem or xmm) -> G (xmm).  Memory
         operands must be 16-aligned, else we generate a SEGV. */
      /* 66 0F 28 = MOVAPD -- move from E (mem or xmm) to G (xmm). */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            /* reg-reg form: plain 128-bit copy, no alignment check. */
            putXMMReg( gregOfRexRM(pfx,modrm),
                       getXMMReg( eregOfRexRM(pfx,modrm) ));
            DIP("movapd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            /* MOVAPD requires a 16-aligned memory operand. */
            gen_SEGV_if_not_16_aligned( addr );
            putXMMReg( gregOfRexRM(pfx,modrm),
                       loadLE(Ity_V128, mkexpr(addr)) );
            DIP("movapd %s,%s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += alen;
         }
         goto decode_success;
      }
      /* 0F 28 = MOVAPS -- move from E (mem or xmm) to G (xmm). */
      if (haveNo66noF2noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            putXMMReg( gregOfRexRM(pfx,modrm),
                       getXMMReg( eregOfRexRM(pfx,modrm) ));
            DIP("movaps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            /* MOVAPS requires a 16-aligned memory operand. */
            gen_SEGV_if_not_16_aligned( addr );
            putXMMReg( gregOfRexRM(pfx,modrm),
                       loadLE(Ity_V128, mkexpr(addr)) );
            DIP("movaps %s,%s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += alen;
         }
         goto decode_success;
      }
      break;
   12922 
   case 0x29:
      /* Aligned 128-bit store forms: G (xmm) -> E (mem or xmm); mirror
         image of the 0F 28 load forms above. */
      /* 0F 29 = MOVAPS -- move from G (xmm) to E (mem or xmm). */
      if (haveNo66noF2noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            /* reg-reg form: plain 128-bit copy, no alignment check. */
            putXMMReg( eregOfRexRM(pfx,modrm),
                       getXMMReg( gregOfRexRM(pfx,modrm) ));
            DIP("movaps %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                  nameXMMReg(eregOfRexRM(pfx,modrm)));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            /* Memory destination must be 16-aligned. */
            gen_SEGV_if_not_16_aligned( addr );
            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movaps %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                  dis_buf );
            delta += alen;
         }
         goto decode_success;
      }
      /* 66 0F 29 = MOVAPD -- move from G (xmm) to E (mem or xmm). */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            putXMMReg( eregOfRexRM(pfx,modrm),
                       getXMMReg( gregOfRexRM(pfx,modrm) ) );
            DIP("movapd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                  nameXMMReg(eregOfRexRM(pfx,modrm)));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            /* Memory destination must be 16-aligned. */
            gen_SEGV_if_not_16_aligned( addr );
            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movapd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                  dis_buf );
            delta += alen;
         }
         goto decode_success;
      }
      break;
   12965 
   case 0x2A:
      /* Int -> FP conversion family, discriminated by mandatory prefix:
         none = CVTPI2PS, F3 = CVTSI2SS, F2 = CVTSI2SD, 66 = CVTPI2PD. */
      /* 0F 2A = CVTPI2PS -- convert 2 x I32 in mem/mmx to 2 x F32 in low
         half xmm */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         IRTemp arg64 = newTemp(Ity_I64);
         IRTemp rmode = newTemp(Ity_I32);

         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            /* Only switch to MMX mode if the source is a MMX register.
               See comments on CVTPI2PD for details.  Fixes #357059. */
            do_MMX_preamble();
            assign( arg64, getMMXReg(eregLO3ofRM(modrm)) );
            delta += 1;
            DIP("cvtpi2ps %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
            delta += alen;
            DIP("cvtpi2ps %s,%s\n", dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)) );
         }

         /* Rounding is per the current MXCSR rounding mode. */
         assign( rmode, get_sse_roundingmode() );

         /* Convert each I32 half via F64, then round down to F32. */
         putXMMRegLane32F(
            gregOfRexRM(pfx,modrm), 0,
            binop(Iop_F64toF32,
                  mkexpr(rmode),
                  unop(Iop_I32StoF64,
                       unop(Iop_64to32, mkexpr(arg64)) )) );

         putXMMRegLane32F(
            gregOfRexRM(pfx,modrm), 1,
            binop(Iop_F64toF32,
                  mkexpr(rmode),
                  unop(Iop_I32StoF64,
                       unop(Iop_64HIto32, mkexpr(arg64)) )) );

         goto decode_success;
      }
      /* F3 0F 2A = CVTSI2SS
         -- sz==4: convert I32 in mem/ireg to F32 in low quarter xmm
         -- sz==8: convert I64 in mem/ireg to F32 in low quarter xmm */
      if (haveF3no66noF2(pfx) && (sz == 4 || sz == 8)) {
         IRTemp rmode = newTemp(Ity_I32);
         assign( rmode, get_sse_roundingmode() );
         modrm = getUChar(delta);
         if (sz == 4) {
            IRTemp arg32 = newTemp(Ity_I32);
            if (epartIsReg(modrm)) {
               assign( arg32, getIReg32(eregOfRexRM(pfx,modrm)) );
               delta += 1;
               DIP("cvtsi2ss %s,%s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
                                       nameXMMReg(gregOfRexRM(pfx,modrm)));
            } else {
               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
               assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
               delta += alen;
               DIP("cvtsi2ss %s,%s\n", dis_buf,
                                       nameXMMReg(gregOfRexRM(pfx,modrm)) );
            }
            /* I32 -> F64 is exact; only the final F64 -> F32 rounds. */
            putXMMRegLane32F(
               gregOfRexRM(pfx,modrm), 0,
               binop(Iop_F64toF32,
                     mkexpr(rmode),
                     unop(Iop_I32StoF64, mkexpr(arg32)) ) );
         } else {
            /* sz == 8 */
            IRTemp arg64 = newTemp(Ity_I64);
            if (epartIsReg(modrm)) {
               assign( arg64, getIReg64(eregOfRexRM(pfx,modrm)) );
               delta += 1;
               DIP("cvtsi2ssq %s,%s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
            } else {
               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
               assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
               delta += alen;
               DIP("cvtsi2ssq %s,%s\n", dis_buf,
                                        nameXMMReg(gregOfRexRM(pfx,modrm)) );
            }
            /* I64 -> F64 can itself round, hence rmode on both steps. */
            putXMMRegLane32F(
               gregOfRexRM(pfx,modrm), 0,
               binop(Iop_F64toF32,
                     mkexpr(rmode),
                     binop(Iop_I64StoF64, mkexpr(rmode), mkexpr(arg64)) ) );
         }
         goto decode_success;
      }
      /* F2 0F 2A = CVTSI2SD
         when sz==4 -- convert I32 in mem/ireg to F64 in low half xmm
         when sz==8 -- convert I64 in mem/ireg to F64 in low half xmm
      */
      if (haveF2no66noF3(pfx) && (sz == 4 || sz == 8)) {
         modrm = getUChar(delta);
         if (sz == 4) {
            IRTemp arg32 = newTemp(Ity_I32);
            if (epartIsReg(modrm)) {
               assign( arg32, getIReg32(eregOfRexRM(pfx,modrm)) );
               delta += 1;
               DIP("cvtsi2sdl %s,%s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
            } else {
               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
               assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
               delta += alen;
               DIP("cvtsi2sdl %s,%s\n", dis_buf,
                                        nameXMMReg(gregOfRexRM(pfx,modrm)) );
            }
            /* I32 -> F64 is always exact; no rounding mode needed. */
            putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
                              unop(Iop_I32StoF64, mkexpr(arg32))
            );
         } else {
            /* sz == 8 */
            IRTemp arg64 = newTemp(Ity_I64);
            if (epartIsReg(modrm)) {
               assign( arg64, getIReg64(eregOfRexRM(pfx,modrm)) );
               delta += 1;
               DIP("cvtsi2sdq %s,%s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
            } else {
               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
               assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
               delta += alen;
               DIP("cvtsi2sdq %s,%s\n", dis_buf,
                                        nameXMMReg(gregOfRexRM(pfx,modrm)) );
            }
            /* I64 -> F64 may round, so supply the SSE rounding mode. */
            putXMMRegLane64F(
               gregOfRexRM(pfx,modrm),
               0,
               binop( Iop_I64StoF64,
                      get_sse_roundingmode(),
                      mkexpr(arg64)
               )
            );
         }
         goto decode_success;
      }
      /* 66 0F 2A = CVTPI2PD -- convert 2 x I32 in mem/mmx to 2 x F64 in
         xmm(G) */
      if (have66noF2noF3(pfx) && sz == 2) {
         IRTemp arg64 = newTemp(Ity_I64);

         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            /* Only switch to MMX mode if the source is a MMX register.
               This is inconsistent with all other instructions which
               convert between XMM and (M64 or MMX), which always switch
               to MMX mode even if 64-bit operand is M64 and not MMX.  At
               least, that's what the Intel docs seem to me to say.
               Fixes #210264. */
            do_MMX_preamble();
            assign( arg64, getMMXReg(eregLO3ofRM(modrm)) );
            delta += 1;
            DIP("cvtpi2pd %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
            delta += alen;
            DIP("cvtpi2pd %s,%s\n", dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)) );
         }

         /* I32 -> F64 is exact, so no rounding mode is required. */
         putXMMRegLane64F(
            gregOfRexRM(pfx,modrm), 0,
            unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)) )
         );

         putXMMRegLane64F(
            gregOfRexRM(pfx,modrm), 1,
            unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)) )
         );

         goto decode_success;
      }
      break;
   13145 
   13146    case 0x2B:
   13147       /* 66 0F 2B = MOVNTPD -- for us, just a plain SSE store. */
   13148       /* 0F 2B = MOVNTPS -- for us, just a plain SSE store. */
   13149       if ( (haveNo66noF2noF3(pfx) && sz == 4)
   13150            || (have66noF2noF3(pfx) && sz == 2) ) {
   13151          modrm = getUChar(delta);
   13152          if (!epartIsReg(modrm)) {
   13153             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13154             gen_SEGV_if_not_16_aligned( addr );
   13155             storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   13156             DIP("movntp%s %s,%s\n", sz==2 ? "d" : "s",
   13157                                     dis_buf,
   13158                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   13159             delta += alen;
   13160             goto decode_success;
   13161          }
   13162          /* else fall through */
   13163       }
   13164       break;
   13165 
   13166    case 0x2C:
   13167    case 0x2D:
   13168       /* 0F 2D = CVTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
   13169          I32 in mmx, according to prevailing SSE rounding mode */
   13170       /* 0F 2C = CVTTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
   13171          I32 in mmx, rounding towards zero */
   13172       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13173          IRTemp dst64  = newTemp(Ity_I64);
   13174          IRTemp rmode  = newTemp(Ity_I32);
   13175          IRTemp f32lo  = newTemp(Ity_F32);
   13176          IRTemp f32hi  = newTemp(Ity_F32);
   13177          Bool   r2zero = toBool(opc == 0x2C);
   13178 
   13179          do_MMX_preamble();
   13180          modrm = getUChar(delta);
   13181 
   13182          if (epartIsReg(modrm)) {
   13183             delta += 1;
   13184             assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
   13185             assign(f32hi, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 1));
   13186             DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
   13187                                       nameXMMReg(eregOfRexRM(pfx,modrm)),
   13188                                       nameMMXReg(gregLO3ofRM(modrm)));
   13189          } else {
   13190             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13191             assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
   13192             assign(f32hi, loadLE(Ity_F32, binop( Iop_Add64,
   13193                                                  mkexpr(addr),
   13194                                                  mkU64(4) )));
   13195             delta += alen;
   13196             DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
   13197                                       dis_buf,
   13198                                       nameMMXReg(gregLO3ofRM(modrm)));
   13199          }
   13200 
   13201          if (r2zero) {
   13202             assign(rmode, mkU32((UInt)Irrm_ZERO) );
   13203          } else {
   13204             assign( rmode, get_sse_roundingmode() );
   13205          }
   13206 
   13207          assign(
   13208             dst64,
   13209             binop( Iop_32HLto64,
   13210                    binop( Iop_F64toI32S,
   13211                           mkexpr(rmode),
   13212                           unop( Iop_F32toF64, mkexpr(f32hi) ) ),
   13213                    binop( Iop_F64toI32S,
   13214                           mkexpr(rmode),
   13215                           unop( Iop_F32toF64, mkexpr(f32lo) ) )
   13216                  )
   13217          );
   13218 
   13219          putMMXReg(gregLO3ofRM(modrm), mkexpr(dst64));
   13220          goto decode_success;
   13221       }
   13222       /* F3 0F 2D = CVTSS2SI
   13223          when sz==4 -- convert F32 in mem/low quarter xmm to I32 in ireg,
   13224                        according to prevailing SSE rounding mode
   13225          when sz==8 -- convert F32 in mem/low quarter xmm to I64 in ireg,
   13226                        according to prevailing SSE rounding mode
   13227       */
   13228       /* F3 0F 2C = CVTTSS2SI
   13229          when sz==4 -- convert F32 in mem/low quarter xmm to I32 in ireg,
   13230                        truncating towards zero
   13231          when sz==8 -- convert F32 in mem/low quarter xmm to I64 in ireg,
   13232                        truncating towards zero
   13233       */
   13234       if (haveF3no66noF2(pfx) && (sz == 4 || sz == 8)) {
   13235          delta = dis_CVTxSS2SI( vbi, pfx, delta, False/*!isAvx*/, opc, sz);
   13236          goto decode_success;
   13237       }
   13238       /* F2 0F 2D = CVTSD2SI
   13239          when sz==4 -- convert F64 in mem/low half xmm to I32 in ireg,
   13240                        according to prevailing SSE rounding mode
   13241          when sz==8 -- convert F64 in mem/low half xmm to I64 in ireg,
   13242                        according to prevailing SSE rounding mode
   13243       */
   13244       /* F2 0F 2C = CVTTSD2SI
   13245          when sz==4 -- convert F64 in mem/low half xmm to I32 in ireg,
   13246                        truncating towards zero
   13247          when sz==8 -- convert F64 in mem/low half xmm to I64 in ireg,
   13248                        truncating towards zero
   13249       */
   13250       if (haveF2no66noF3(pfx) && (sz == 4 || sz == 8)) {
   13251          delta = dis_CVTxSD2SI( vbi, pfx, delta, False/*!isAvx*/, opc, sz);
   13252          goto decode_success;
   13253       }
   13254       /* 66 0F 2D = CVTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
   13255          I32 in mmx, according to prevailing SSE rounding mode */
   13256       /* 66 0F 2C = CVTTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
   13257          I32 in mmx, rounding towards zero */
   13258       if (have66noF2noF3(pfx) && sz == 2) {
   13259          IRTemp dst64  = newTemp(Ity_I64);
   13260          IRTemp rmode  = newTemp(Ity_I32);
   13261          IRTemp f64lo  = newTemp(Ity_F64);
   13262          IRTemp f64hi  = newTemp(Ity_F64);
   13263          Bool   r2zero = toBool(opc == 0x2C);
   13264 
   13265          do_MMX_preamble();
   13266          modrm = getUChar(delta);
   13267 
   13268          if (epartIsReg(modrm)) {
   13269             delta += 1;
   13270             assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
   13271             assign(f64hi, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 1));
   13272             DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
   13273                                       nameXMMReg(eregOfRexRM(pfx,modrm)),
   13274                                       nameMMXReg(gregLO3ofRM(modrm)));
   13275          } else {
   13276             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13277             assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
   13278             assign(f64hi, loadLE(Ity_F64, binop( Iop_Add64,
   13279                                                  mkexpr(addr),
   13280                                                  mkU64(8) )));
   13281             delta += alen;
   13282             DIP("cvt%spf2pi %s,%s\n", r2zero ? "t" : "",
   13283                                       dis_buf,
   13284                                       nameMMXReg(gregLO3ofRM(modrm)));
   13285          }
   13286 
   13287          if (r2zero) {
   13288             assign(rmode, mkU32((UInt)Irrm_ZERO) );
   13289          } else {
   13290             assign( rmode, get_sse_roundingmode() );
   13291          }
   13292 
   13293          assign(
   13294             dst64,
   13295             binop( Iop_32HLto64,
   13296                    binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64hi) ),
   13297                    binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo) )
   13298                  )
   13299          );
   13300 
   13301          putMMXReg(gregLO3ofRM(modrm), mkexpr(dst64));
   13302          goto decode_success;
   13303       }
   13304       break;
   13305 
   case 0x2E:
   case 0x2F:
      /* Ordered/unordered scalar FP compares setting Z/C/P; the helpers
         inspect opc to distinguish the UCOMISx (2E) and COMISx (2F)
         variants. */
      /* 66 0F 2E = UCOMISD -- 64F0x2 comparison G,E, and set ZCP */
      /* 66 0F 2F = COMISD  -- 64F0x2 comparison G,E, and set ZCP */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_COMISD( vbi, pfx, delta, False/*!isAvx*/, opc );
         goto decode_success;
      }
      /* 0F 2E = UCOMISS -- 32F0x4 comparison G,E, and set ZCP */
      /* 0F 2F = COMISS  -- 32F0x4 comparison G,E, and set ZCP */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_COMISS( vbi, pfx, delta, False/*!isAvx*/, opc );
         goto decode_success;
      }
      break;
   13321 
   case 0x50:
      /* Sign-bit extraction from packed FP lanes into an integer reg.
         Only register (not memory) E operands are valid, hence the
         epartIsReg guard on the MOVMSKPS form. */
      /* 0F 50 = MOVMSKPS - move 4 sign bits from 4 x F32 in xmm(E)
         to 4 lowest bits of ireg(G) */
      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
          && epartIsReg(getUChar(delta))) {
         /* sz == 8 is a kludge to handle insns with REX.W redundantly
            set to 1, which has been known to happen:

            4c 0f 50 d9             rex64X movmskps %xmm1,%r11d

            20071106: Intel docs say that REX.W isn't redundant: when
            present, a 64-bit register is written; when not present, only
            the 32-bit half is written.  However, testing on a Core2
            machine suggests the entire 64 bit register is written
            irrespective of the status of REX.W.  That could be because
            of the default rule that says "if the lower half of a 32-bit
            register is written, the upper half is zeroed".  By using
            putIReg32 here we inadvertantly produce the same behaviour as
            the Core2, for the same reason -- putIReg32 implements said
            rule.

            AMD docs give no indication that REX.W is even valid for this
            insn. */
         delta = dis_MOVMSKPS_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      /* 66 0F 50 = MOVMSKPD - move 2 sign bits from 2 x F64 in xmm(E) to
         2 lowest bits of ireg(G) */
      if (have66noF2noF3(pfx) && (sz == 2 || sz == 8)) {
         /* sz == 8 is a kludge to handle insns with REX.W redundantly
            set to 1, which has been known to happen:
            66 4c 0f 50 d9          rex64X movmskpd %xmm1,%r11d
            20071106: see further comments on MOVMSKPS implementation above.
         */
         delta = dis_MOVMSKPD_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;
   13360 
   case 0x51:
      /* Square-root family; mandatory prefix selects scalar vs packed
         and single vs double precision. */
      /* F3 0F 51 = SQRTSS -- approx sqrt 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
                                            "sqrtss", Iop_Sqrt32F0x4 );
         goto decode_success;
      }
      /* 0F 51 = SQRTPS -- approx sqrt 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
                                           "sqrtps", Iop_Sqrt32Fx4 );
         goto decode_success;
      }
      /* F2 0F 51 = SQRTSD -- approx sqrt 64F0x2 from R/M to R */
      if (haveF2no66noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_lo64( vbi, pfx, delta,
                                            "sqrtsd", Iop_Sqrt64F0x2 );
         goto decode_success;
      }
      /* 66 0F 51 = SQRTPD -- approx sqrt 64Fx2 from R/M to R */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
                                           "sqrtpd", Iop_Sqrt64Fx2 );
         goto decode_success;
      }
      break;
   13387 
   case 0x52:
      /* Reciprocal square-root estimate; single-precision only. */
      /* F3 0F 52 = RSQRTSS -- approx reciprocal sqrt 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
                                            "rsqrtss", Iop_RSqrtEst32F0x4 );
         goto decode_success;
      }
      /* 0F 52 = RSQRTPS -- approx reciprocal sqrt 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
                                           "rsqrtps", Iop_RSqrtEst32Fx4 );
         goto decode_success;
      }
      break;
   13402 
   case 0x53:
      /* Reciprocal estimate; single-precision only. */
      /* F3 0F 53 = RCPSS -- approx reciprocal 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
                                            "rcpss", Iop_RecipEst32F0x4 );
         goto decode_success;
      }
      /* 0F 53 = RCPPS -- approx reciprocal 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
                                           "rcpps", Iop_RecipEst32Fx4 );
         goto decode_success;
      }
      break;
   13417 
   case 0x54:
      /* Bitwise AND of the full 128 bits; PS and PD forms are
         functionally identical, differing only in mnemonic. */
      /* 0F 54 = ANDPS -- G = G and E */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "andps", Iop_AndV128 );
         goto decode_success;
      }
      /* 66 0F 54 = ANDPD -- G = G and E */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "andpd", Iop_AndV128 );
         goto decode_success;
      }
      break;
   13430 
   case 0x55:
      /* Bitwise AND-NOT: the helper inverts G before ANDing with E. */
      /* 0F 55 = ANDNPS -- G = (not G) and E */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "andnps",
                                                           Iop_AndV128 );
         goto decode_success;
      }
      /* 66 0F 55 = ANDNPD -- G = (not G) and E */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "andnpd",
                                                           Iop_AndV128 );
         goto decode_success;
      }
      break;
   13445 
   case 0x56:
      /* Bitwise OR of the full 128 bits. */
      /* 0F 56 = ORPS -- G = G or E */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "orps", Iop_OrV128 );
         goto decode_success;
      }
      /* 66 0F 56 = ORPD -- G = G or E */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "orpd", Iop_OrV128 );
         goto decode_success;
      }
      break;
   13458 
   case 0x57:
      /* Bitwise XOR of the full 128 bits. */
      /* 66 0F 57 = XORPD -- G = G xor E */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "xorpd", Iop_XorV128 );
         goto decode_success;
      }
      /* 0F 57 = XORPS -- G = G xor E */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "xorps", Iop_XorV128 );
         goto decode_success;
      }
      break;
   13471 
   case 0x58:
      /* Packed/scalar FP add; prefix selects the flavour. */
      /* 0F 58 = ADDPS -- add 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "addps", Iop_Add32Fx4 );
         goto decode_success;
      }
      /* F3 0F 58 = ADDSS -- add 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "addss", Iop_Add32F0x4 );
         goto decode_success;
      }
      /* F2 0F 58 = ADDSD -- add 64F0x2 from R/M to R */
      if (haveF2no66noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "addsd", Iop_Add64F0x2 );
         goto decode_success;
      }
      /* 66 0F 58 = ADDPD -- add 64Fx2 from R/M to R */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "addpd", Iop_Add64Fx2 );
         goto decode_success;
      }
      break;
   13496 
   case 0x59:
      /* Packed/scalar FP multiply; prefix selects the flavour. */
      /* F2 0F 59 = MULSD -- mul 64F0x2 from R/M to R */
      if (haveF2no66noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "mulsd", Iop_Mul64F0x2 );
         goto decode_success;
      }
      /* F3 0F 59 = MULSS -- mul 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "mulss", Iop_Mul32F0x4 );
         goto decode_success;
      }
      /* 0F 59 = MULPS -- mul 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "mulps", Iop_Mul32Fx4 );
         goto decode_success;
      }
      /* 66 0F 59 = MULPD -- mul 64Fx2 from R/M to R */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "mulpd", Iop_Mul64Fx2 );
         goto decode_success;
      }
      break;
   13521 
   case 0x5A:
      /* 0F 5A = CVTPS2PD -- convert 2 x F32 in low half mem/xmm to 2 x
         F64 in xmm(G). */
      if (haveNo66noF2noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         delta = dis_CVTPS2PD_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      /* F3 0F 5A = CVTSS2SD -- convert F32 in mem/low 1/4 xmm to F64 in
         low half xmm(G) */
      if (haveF3no66noF2(pfx) && sz == 4) {
         IRTemp f32lo = newTemp(Ity_F32);

         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta += 1;
            assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
            DIP("cvtss2sd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
            delta += alen;
            DIP("cvtss2sd %s,%s\n", dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         }

         /* Widening F32->F64 is exact, so no rounding mode is passed. */
         putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
                           unop( Iop_F32toF64, mkexpr(f32lo) ) );

         goto decode_success;
      }
      /* F2 0F 5A = CVTSD2SS -- convert F64 in mem/low half xmm to F32 in
         low 1/4 xmm(G), according to prevailing SSE rounding mode */
      if (haveF2no66noF3(pfx) && sz == 4) {
         IRTemp rmode = newTemp(Ity_I32);
         IRTemp f64lo = newTemp(Ity_F64);

         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta += 1;
            assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
            DIP("cvtsd2ss %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
            delta += alen;
            DIP("cvtsd2ss %s,%s\n", dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         }

         /* Narrowing F64->F32 can be inexact, hence the MXCSR rounding
            mode is sampled and applied. */
         assign( rmode, get_sse_roundingmode() );
         putXMMRegLane32F(
            gregOfRexRM(pfx,modrm), 0,
            binop( Iop_F64toF32, mkexpr(rmode), mkexpr(f64lo) )
         );

         goto decode_success;
      }
      /* 66 0F 5A = CVTPD2PS -- convert 2 x F64 in mem/xmm to 2 x F32 in
         lo half xmm(G), rounding according to prevailing SSE rounding
         mode, and zero upper half */
      /* Note, this is practically identical to CVTPD2DQ.  It would have
         be nice to merge them together. */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_CVTPD2PS_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;
   13592 
   case 0x5B:
      /* F3 0F 5B = CVTTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
         xmm(G), rounding towards zero */
      /* 66 0F 5B = CVTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
         xmm(G), as per the prevailing rounding mode */
      if ( (have66noF2noF3(pfx) && sz == 2)
           || (haveF3no66noF2(pfx) && sz == 4) ) {
         /* sz==4 can only arise here via the F3 prefix (the 66 case
            forces sz==2), so it selects the truncating (CVTT) form. */
         Bool r2zero = toBool(sz == 4); // FIXME -- unreliable (???)
         delta = dis_CVTxPS2DQ_128( vbi, pfx, delta, False/*!isAvx*/, r2zero );
         goto decode_success;
      }
      /* 0F 5B = CVTDQ2PS -- convert 4 x I32 in mem/xmm to 4 x F32 in
         xmm(G) */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_CVTDQ2PS_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;
   13611 
   case 0x5C:
      /* SSE/SSE2 subtract family: prefix selects scalar vs packed and
         single vs double precision. */
      /* F3 0F 5C = SUBSS -- sub 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "subss", Iop_Sub32F0x4 );
         goto decode_success;
      }
      /* F2 0F 5C = SUBSD -- sub 64F0x2 from R/M to R */
      if (haveF2no66noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "subsd", Iop_Sub64F0x2 );
         goto decode_success;
      }
      /* 0F 5C = SUBPS -- sub 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "subps", Iop_Sub32Fx4 );
         goto decode_success;
      }
      /* 66 0F 5C = SUBPD -- sub 64Fx2 from R/M to R */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "subpd", Iop_Sub64Fx2 );
         goto decode_success;
      }
      break;
   13635 
   case 0x5D:
      /* SSE/SSE2 minimum family: prefix selects scalar vs packed and
         single vs double precision. */
      /* 0F 5D = MINPS -- min 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "minps", Iop_Min32Fx4 );
         goto decode_success;
      }
      /* F3 0F 5D = MINSS -- min 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "minss", Iop_Min32F0x4 );
         goto decode_success;
      }
      /* F2 0F 5D = MINSD -- min 64F0x2 from R/M to R */
      if (haveF2no66noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "minsd", Iop_Min64F0x2 );
         goto decode_success;
      }
      /* 66 0F 5D = MINPD -- min 64Fx2 from R/M to R */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "minpd", Iop_Min64Fx2 );
         goto decode_success;
      }
      break;
   13659 
   case 0x5E:
      /* SSE/SSE2 divide family: prefix selects scalar vs packed and
         single vs double precision. */
      /* F2 0F 5E = DIVSD -- div 64F0x2 from R/M to R */
      if (haveF2no66noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "divsd", Iop_Div64F0x2 );
         goto decode_success;
      }
      /* 0F 5E = DIVPS -- div 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "divps", Iop_Div32Fx4 );
         goto decode_success;
      }
      /* F3 0F 5E = DIVSS -- div 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "divss", Iop_Div32F0x4 );
         goto decode_success;
      }
      /* 66 0F 5E = DIVPD -- div 64Fx2 from R/M to R */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "divpd", Iop_Div64Fx2 );
         goto decode_success;
      }
      break;
   13682 
   case 0x5F:
      /* SSE/SSE2 maximum family: prefix selects scalar vs packed and
         single vs double precision. */
      /* 0F 5F = MAXPS -- max 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "maxps", Iop_Max32Fx4 );
         goto decode_success;
      }
      /* F3 0F 5F = MAXSS -- max 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "maxss", Iop_Max32F0x4 );
         goto decode_success;
      }
      /* F2 0F 5F = MAXSD -- max 64F0x2 from R/M to R */
      if (haveF2no66noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "maxsd", Iop_Max64F0x2 );
         goto decode_success;
      }
      /* 66 0F 5F = MAXPD -- max 64Fx2 from R/M to R */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "maxpd", Iop_Max64Fx2 );
         goto decode_success;
      }
      break;
   13706 
   /* 0x60 .. 0x6D: SSE2 integer interleave / saturating-pack /
      signed-compare group.  All are 66-prefixed, xmm-width, and decode
      to a single IR binop via dis_SSEint_E_to_G.  The final Bool says
      whether the helper must swap its operands to get the Intel
      operand order right for a non-commutative op. */
   case 0x60:
      /* 66 0F 60 = PUNPCKLBW */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpcklbw",
                                    Iop_InterleaveLO8x16, True );
         goto decode_success;
      }
      break;

   case 0x61:
      /* 66 0F 61 = PUNPCKLWD */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpcklwd",
                                    Iop_InterleaveLO16x8, True );
         goto decode_success;
      }
      break;

   case 0x62:
      /* 66 0F 62 = PUNPCKLDQ */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpckldq",
                                    Iop_InterleaveLO32x4, True );
         goto decode_success;
      }
      break;

   case 0x63:
      /* 66 0F 63 = PACKSSWB */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "packsswb",
                                    Iop_QNarrowBin16Sto8Sx16, True );
         goto decode_success;
      }
      break;

   case 0x64:
      /* 66 0F 64 = PCMPGTB */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpgtb", Iop_CmpGT8Sx16, False );
         goto decode_success;
      }
      break;

   case 0x65:
      /* 66 0F 65 = PCMPGTW */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpgtw", Iop_CmpGT16Sx8, False );
         goto decode_success;
      }
      break;

   case 0x66:
      /* 66 0F 66 = PCMPGTD */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpgtd", Iop_CmpGT32Sx4, False );
         goto decode_success;
      }
      break;

   case 0x67:
      /* 66 0F 67 = PACKUSWB */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "packuswb",
                                    Iop_QNarrowBin16Sto8Ux16, True );
         goto decode_success;
      }
      break;

   case 0x68:
      /* 66 0F 68 = PUNPCKHBW */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpckhbw",
                                    Iop_InterleaveHI8x16, True );
         goto decode_success;
      }
      break;

   case 0x69:
      /* 66 0F 69 = PUNPCKHWD */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpckhwd",
                                    Iop_InterleaveHI16x8, True );
         goto decode_success;
      }
      break;

   case 0x6A:
      /* 66 0F 6A = PUNPCKHDQ */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpckhdq",
                                    Iop_InterleaveHI32x4, True );
         goto decode_success;
      }
      break;

   case 0x6B:
      /* 66 0F 6B = PACKSSDW */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "packssdw",
                                    Iop_QNarrowBin32Sto16Sx8, True );
         goto decode_success;
      }
      break;

   case 0x6C:
      /* 66 0F 6C = PUNPCKLQDQ */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpcklqdq",
                                    Iop_InterleaveLO64x2, True );
         goto decode_success;
      }
      break;

   case 0x6D:
      /* 66 0F 6D = PUNPCKHQDQ */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpckhqdq",
                                    Iop_InterleaveHI64x2, True );
         goto decode_success;
      }
      break;
   13843 
   case 0x6E:
      /* 66 0F 6E = MOVD from ireg32/m32 to xmm lo 1/4,
                    zeroing high 3/4 of xmm. */
      /*              or from ireg64/m64 to xmm lo 1/2,
                    zeroing high 1/2 of xmm. */
      if (have66noF2noF3(pfx)) {
         /* sz==2 means 66-prefix with no REX.W (32-bit source);
            sz==8 means REX.W is set (64-bit source, MOVQ form). */
         vassert(sz == 2 || sz == 8);
         if (sz == 2) sz = 4;
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta += 1;
            if (sz == 4) {
               putXMMReg(
                  gregOfRexRM(pfx,modrm),
                  unop( Iop_32UtoV128, getIReg32(eregOfRexRM(pfx,modrm)) )
               );
               DIP("movd %s, %s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
            } else {
               putXMMReg(
                  gregOfRexRM(pfx,modrm),
                  unop( Iop_64UtoV128, getIReg64(eregOfRexRM(pfx,modrm)) )
               );
               DIP("movq %s, %s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
            }
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            /* The zero-extending V128 widening ops implement the
               "zero upper part of xmm" semantics. */
            putXMMReg(
               gregOfRexRM(pfx,modrm),
               sz == 4
                  ?  unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)) )
                  :  unop( Iop_64UtoV128,loadLE(Ity_I64, mkexpr(addr)) )
            );
            DIP("mov%c %s, %s\n", sz == 4 ? 'd' : 'q', dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
         }
         goto decode_success;
      }
      break;
   13885 
   case 0x6F:
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         /* 66 0F 6F = MOVDQA -- move from E (mem or xmm) to G (xmm). */
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            putXMMReg( gregOfRexRM(pfx,modrm),
                       getXMMReg( eregOfRexRM(pfx,modrm) ));
            DIP("movdqa %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            /* MOVDQA requires 16-byte alignment; fault if not. */
            gen_SEGV_if_not_16_aligned( addr );
            putXMMReg( gregOfRexRM(pfx,modrm),
                       loadLE(Ity_V128, mkexpr(addr)) );
            DIP("movdqa %s,%s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += alen;
         }
         goto decode_success;
      }
      if (haveF3no66noF2(pfx) && sz == 4) {
         /* F3 0F 6F = MOVDQU -- move from E (mem or xmm) to G (xmm). */
         /* Unaligned form: no alignment check, unlike MOVDQA above. */
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            putXMMReg( gregOfRexRM(pfx,modrm),
                       getXMMReg( eregOfRexRM(pfx,modrm) ));
            DIP("movdqu %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            putXMMReg( gregOfRexRM(pfx,modrm),
                       loadLE(Ity_V128, mkexpr(addr)) );
            DIP("movdqu %s,%s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += alen;
         }
         goto decode_success;
      }
      break;
   13928 
   case 0x70:
      /* 66 0F 70 = PSHUFD -- rearrange 4x32 from E(xmm or mem) to G(xmm) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PSHUFD_32x4( vbi, pfx, delta, False/*!writesYmm*/);
         goto decode_success;
      }
      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
      /* 0F 70 = PSHUFW -- rearrange 4x16 from E(mmx or mem) to G(mmx) */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         Int order;
         IRTemp sV, dV, s3, s2, s1, s0;
         s3 = s2 = s1 = s0 = IRTemp_INVALID;
         sV = newTemp(Ity_I64);
         dV = newTemp(Ity_I64);
         do_MMX_preamble();
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
            /* imm8 order byte follows the modrm byte directly. */
            order = (Int)getUChar(delta+1);
            delta += 1+1;
            DIP("pshufw $%d,%s,%s\n", order,
                                      nameMMXReg(eregLO3ofRM(modrm)),
                                      nameMMXReg(gregLO3ofRM(modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
                              1/*extra byte after amode*/ );
            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
            /* imm8 order byte follows the address mode bytes. */
            order = (Int)getUChar(delta+alen);
            delta += 1+alen;
            DIP("pshufw $%d,%s,%s\n", order,
                                      dis_buf,
                                      nameMMXReg(gregLO3ofRM(modrm)));
         }
         /* Split source into its four 16-bit lanes, then reassemble
            them in the order selected by the imm8's 2-bit fields
            (bits 7:6 pick lane 3, ... bits 1:0 pick lane 0). */
         breakup64to16s( sV, &s3, &s2, &s1, &s0 );
#        define SEL(n) \
                   ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
         assign(dV,
                mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
                             SEL((order>>2)&3), SEL((order>>0)&3) )
         );
         putMMXReg(gregLO3ofRM(modrm), mkexpr(dV));
#        undef SEL
         goto decode_success;
      }
      /* F2 0F 70 = PSHUFLW -- rearrange lower half 4x16 from E(xmm or
         mem) to G(xmm), and copy upper half */
      if (haveF2no66noF3(pfx) && sz == 4) {
         delta = dis_PSHUFxW_128( vbi, pfx, delta,
                                  False/*!isAvx*/, False/*!xIsH*/ );
         goto decode_success;
      }
      /* F3 0F 70 = PSHUFHW -- rearrange upper half 4x16 from E(xmm or
         mem) to G(xmm), and copy lower half */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_PSHUFxW_128( vbi, pfx, delta,
                                  False/*!isAvx*/, True/*xIsH*/ );
         goto decode_success;
      }
      break;
   13988 
   case 0x71:
      /* 16-bit-lane shifts by immediate; the /r reg field of the modrm
         byte selects which shift.  Register operand only. */
      /* 66 0F 71 /2 ib = PSRLW by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 2) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "psrlw", Iop_ShrN16x8 );
         goto decode_success;
      }
      /* 66 0F 71 /4 ib = PSRAW by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 4) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "psraw", Iop_SarN16x8 );
         goto decode_success;
      }
      /* 66 0F 71 /6 ib = PSLLW by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 6) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "psllw", Iop_ShlN16x8 );
         goto decode_success;
      }
      break;
   14012 
   case 0x72:
      /* 32-bit-lane shifts by immediate; the /r reg field of the modrm
         byte selects which shift.  Register operand only. */
      /* 66 0F 72 /2 ib = PSRLD by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 2) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "psrld", Iop_ShrN32x4 );
         goto decode_success;
      }
      /* 66 0F 72 /4 ib = PSRAD by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 4) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "psrad", Iop_SarN32x4 );
         goto decode_success;
      }
      /* 66 0F 72 /6 ib = PSLLD by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 6) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "pslld", Iop_ShlN32x4 );
         goto decode_success;
      }
      break;
   14036 
   14037    case 0x73:
   14038       /* 66 0F 73 /3 ib = PSRLDQ by immediate */
   14039       /* note, if mem case ever filled in, 1 byte after amode */
   14040       if (have66noF2noF3(pfx) && sz == 2
   14041           && epartIsReg(getUChar(delta))
   14042           && gregLO3ofRM(getUChar(delta)) == 3) {
   14043          Int imm = (Int)getUChar(delta+1);
   14044          Int reg = eregOfRexRM(pfx,getUChar(delta));
   14045          DIP("psrldq $%d,%s\n", imm, nameXMMReg(reg));
   14046          delta += 2;
   14047          IRTemp sV = newTemp(Ity_V128);
   14048          assign( sV, getXMMReg(reg) );
   14049          putXMMReg(reg, mkexpr(math_PSRLDQ( sV, imm )));
   14050          goto decode_success;
   14051       }
   14052       /* 66 0F 73 /7 ib = PSLLDQ by immediate */
   14053       /* note, if mem case ever filled in, 1 byte after amode */
   14054       if (have66noF2noF3(pfx) && sz == 2
   14055           && epartIsReg(getUChar(delta))
   14056           && gregLO3ofRM(getUChar(delta)) == 7) {
   14057          Int imm = (Int)getUChar(delta+1);
   14058          Int reg = eregOfRexRM(pfx,getUChar(delta));
   14059          DIP("pslldq $%d,%s\n", imm, nameXMMReg(reg));
   14060          vassert(imm >= 0 && imm <= 255);
   14061          delta += 2;
   14062          IRTemp sV = newTemp(Ity_V128);
   14063          assign( sV, getXMMReg(reg) );
   14064          putXMMReg(reg, mkexpr(math_PSLLDQ( sV, imm )));
   14065          goto decode_success;
   14066       }
   14067       /* 66 0F 73 /2 ib = PSRLQ by immediate */
   14068       if (have66noF2noF3(pfx) && sz == 2
   14069           && epartIsReg(getUChar(delta))
   14070           && gregLO3ofRM(getUChar(delta)) == 2) {
   14071          delta = dis_SSE_shiftE_imm( pfx, delta, "psrlq", Iop_ShrN64x2 );
   14072          goto decode_success;
   14073       }
   14074       /* 66 0F 73 /6 ib = PSLLQ by immediate */
   14075       if (have66noF2noF3(pfx) && sz == 2
   14076           && epartIsReg(getUChar(delta))
   14077           && gregLO3ofRM(getUChar(delta)) == 6) {
   14078          delta = dis_SSE_shiftE_imm( pfx, delta, "psllq", Iop_ShlN64x2 );
   14079          goto decode_success;
   14080       }
   14081       break;
   14082 
   /* 0x74 .. 0x76: SSE2 integer equality compares; each lowers to a
      single commutative IR compare (no operand swap needed). */
   case 0x74:
      /* 66 0F 74 = PCMPEQB */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpeqb", Iop_CmpEQ8x16, False );
         goto decode_success;
      }
      break;

   case 0x75:
      /* 66 0F 75 = PCMPEQW */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpeqw", Iop_CmpEQ16x8, False );
         goto decode_success;
      }
      break;

   case 0x76:
      /* 66 0F 76 = PCMPEQD */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpeqd", Iop_CmpEQ32x4, False );
         goto decode_success;
      }
      break;
   14109 
   14110    case 0x7E:
   14111       /* F3 0F 7E = MOVQ -- move 64 bits from E (mem or lo half xmm) to
   14112          G (lo half xmm).  Upper half of G is zeroed out. */
   14113       if (haveF3no66noF2(pfx)
   14114           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   14115          modrm = getUChar(delta);
   14116          if (epartIsReg(modrm)) {
   14117             putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
   14118                              getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
   14119                /* zero bits 127:64 */
   14120                putXMMRegLane64( gregOfRexRM(pfx,modrm), 1, mkU64(0) );
   14121             DIP("movsd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   14122                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   14123             delta += 1;
   14124          } else {
   14125             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14126             putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
   14127             putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
   14128                              loadLE(Ity_I64, mkexpr(addr)) );
   14129             DIP("movsd %s,%s\n", dis_buf,
   14130                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   14131             delta += alen;
   14132          }
   14133          goto decode_success;
   14134       }
   14135       /* 66 0F 7E = MOVD from xmm low 1/4 to ireg32 or m32. */
   14136       /*              or from xmm low 1/2 to ireg64 or m64. */
   14137          if (have66noF2noF3(pfx) && (sz == 2 || sz == 8)) {
   14138          if (sz == 2) sz = 4;
   14139          modrm = getUChar(delta);
   14140          if (epartIsReg(modrm)) {
   14141             delta += 1;
   14142             if (sz == 4) {
   14143                putIReg32( eregOfRexRM(pfx,modrm),
   14144                           getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) );
   14145                DIP("movd %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   14146                                     nameIReg32(eregOfRexRM(pfx,modrm)));
   14147             } else {
   14148                putIReg64( eregOfRexRM(pfx,modrm),
   14149                           getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) );
   14150                DIP("movq %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   14151                                     nameIReg64(eregOfRexRM(pfx,modrm)));
   14152             }
   14153          } else {
   14154             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   14155             delta += alen;
   14156             storeLE( mkexpr(addr),
   14157                      sz == 4
   14158                         ? getXMMRegLane32(gregOfRexRM(pfx,modrm),0)
   14159                         : getXMMRegLane64(gregOfRexRM(pfx,modrm),0) );
   14160             DIP("mov%c %s, %s\n", sz == 4 ? 'd' : 'q',
   14161                                   nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
   14162          }
   14163          goto decode_success;
   14164       }
   14165       break;
   14166 
   case 0x7F:
      /* F3 0F 7F = MOVDQU -- move from G (xmm) to E (mem or xmm). */
      if (haveF3no66noF2(pfx) && sz == 4) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            /* reg-reg form deliberately disabled until exercised by a
               real test case; the code below it is currently dead. */
            goto decode_failure; /* awaiting test case */
            delta += 1;
            putXMMReg( eregOfRexRM(pfx,modrm),
                       getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movdqu %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                   nameXMMReg(eregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movdqu %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
         }
         goto decode_success;
      }
      /* 66 0F 7F = MOVDQA -- move from G (xmm) to E (mem or xmm). */
      if (have66noF2noF3(pfx) && sz == 2) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta += 1;
            putXMMReg( eregOfRexRM(pfx,modrm),
                       getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movdqa %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                   nameXMMReg(eregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            /* MOVDQA store requires 16-byte alignment; fault if not. */
            gen_SEGV_if_not_16_aligned( addr );
            delta += alen;
            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movdqa %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
         }
         goto decode_success;
      }
      break;
   14205 
   case 0xAE:
      /* 0F AE is a group opcode: the reg (/digit) field of the modrm
         byte, together with whether E is a register or a memory
         operand, selects among fences, cache control, and
         x87/SSE/AVX state save/restore. */
      /* 0F AE /7 = SFENCE -- flush pending operations to memory */
      if (haveNo66noF2noF3(pfx)
          && epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 7
          && sz == 4) {
         delta += 1;
         /* Insert a memory fence.  It's sometimes important that these
            are carried through to the generated code. */
         stmt( IRStmt_MBE(Imbe_Fence) );
         DIP("sfence\n");
         goto decode_success;
      }
      /* mindless duplication follows .. */
      /* 0F AE /5 = LFENCE -- flush pending operations to memory */
      /* 0F AE /6 = MFENCE -- flush pending operations to memory */
      if (haveNo66noF2noF3(pfx)
          && epartIsReg(getUChar(delta))
          && (gregLO3ofRM(getUChar(delta)) == 5
              || gregLO3ofRM(getUChar(delta)) == 6)
          && sz == 4) {
         delta += 1;
         /* Insert a memory fence.  It's sometimes important that these
            are carried through to the generated code. */
         stmt( IRStmt_MBE(Imbe_Fence) );
         /* delta-1 re-reads the modrm byte consumed just above; its
            reg field distinguishes lfence (/5) from mfence (/6). */
         DIP("%sfence\n", gregLO3ofRM(getUChar(delta-1))==5 ? "l" : "m");
         goto decode_success;
      }

      /* 0F AE /7 = CLFLUSH -- flush cache line */
      /* Distinguished from SFENCE (also /7, above) by having a
         memory, rather than register, E operand. */
      if (haveNo66noF2noF3(pfx)
          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 7
          && sz == 4) {

         /* This is something of a hack.  We need to know the size of
            the cache line containing addr.  Since we don't (easily),
            assume 256 on the basis that no real cache would have a
            line that big.  It's safe to invalidate more stuff than we
            need, just inefficient. */
         ULong lineszB = 256ULL;

         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;

         /* Round addr down to the start of the containing block. */
         stmt( IRStmt_Put(
                  OFFB_CMSTART,
                  binop( Iop_And64,
                         mkexpr(addr),
                         mkU64( ~(lineszB-1) ))) );

         stmt( IRStmt_Put(OFFB_CMLEN, mkU64(lineszB) ) );

         /* End the block with an Ijk_InvalICache jump to the next
            insn; CMSTART/CMLEN set above describe the range
            affected. */
         jmp_lit(dres, Ijk_InvalICache, (Addr64)(guest_RIP_bbstart+delta));

         DIP("clflush %s\n", dis_buf);
         goto decode_success;
      }

      /* 0F AE /3 = STMXCSR m32 -- store %mxcsr */
      if (haveNo66noF2noF3(pfx)
          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 3
          && sz == 4) {
         delta = dis_STMXCSR(vbi, pfx, delta, False/*!isAvx*/);
         goto decode_success;
      }
      /* 0F AE /2 = LDMXCSR m32 -- load %mxcsr */
      if (haveNo66noF2noF3(pfx)
          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 2
          && sz == 4) {
         delta = dis_LDMXCSR(vbi, pfx, delta, False/*!isAvx*/);
         goto decode_success;
      }
      /* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory */
      /* NOTE(review): the cases from here on test gregOfRexRM rather
         than gregLO3ofRM as the fence cases above do -- presumably so
         a REX-extended reg field fails the decode; confirm intent. */
      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
          && !epartIsReg(getUChar(delta))
          && gregOfRexRM(pfx,getUChar(delta)) == 0) {
         delta = dis_FXSAVE(vbi, pfx, delta, sz);
         goto decode_success;
      }
      /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory */
      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
          && !epartIsReg(getUChar(delta))
          && gregOfRexRM(pfx,getUChar(delta)) == 1) {
         delta = dis_FXRSTOR(vbi, pfx, delta, sz);
         goto decode_success;
      }
      /* 0F AE /4 = XSAVE mem -- write x87, SSE, AVX state to memory */
      /* Only decoded when the host reports AVX support. */
      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
          && !epartIsReg(getUChar(delta))
          && gregOfRexRM(pfx,getUChar(delta)) == 4
          && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
         delta = dis_XSAVE(vbi, pfx, delta, sz);
         goto decode_success;
      }
      /* 0F AE /5 = XRSTOR mem -- read x87, SSE, AVX state from memory */
      /* /5 with a register E was already taken as LFENCE above; this
         one requires a memory E. */
      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
          && !epartIsReg(getUChar(delta))
          && gregOfRexRM(pfx,getUChar(delta)) == 5
          && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
         delta = dis_XRSTOR(vbi, pfx, delta, sz);
         goto decode_success;
      }
      break;
   14309 
   case 0xC2:
      /* 0F C2 group: SSE FP compares, all handled by
         dis_SSE_cmp_E_to_G.  The helper leaves delta unchanged when
         it cannot decode the insn, hence the delta > delta0 success
         tests.  Its Bool argument apparently selects all-lanes (True)
         vs lowest-lane-only (False) compare, and the final argument
         the lane size in bytes. */
      /* 0F C2 = CMPPS -- 32Fx4 comparison from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         Long delta0 = delta;
         delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpps", True, 4 );
         if (delta > delta0) goto decode_success;
      }
      /* F3 0F C2 = CMPSS -- 32F0x4 comparison from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         Long delta0 = delta;
         delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpss", False, 4 );
         if (delta > delta0) goto decode_success;
      }
      /* F2 0F C2 = CMPSD -- 64F0x2 comparison from R/M to R */
      if (haveF2no66noF3(pfx) && sz == 4) {
         Long delta0 = delta;
         delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpsd", False, 8 );
         if (delta > delta0) goto decode_success;
      }
      /* 66 0F C2 = CMPPD -- 64Fx2 comparison from R/M to R */
      if (have66noF2noF3(pfx) && sz == 2) {
         Long delta0 = delta;
         delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmppd", True, 8 );
         if (delta > delta0) goto decode_success;
      }
      break;
   14336 
   case 0xC3:
      /* 0F C3 = MOVNTI -- for us, just a plain ireg store. */
      /* The non-temporal hint is ignored.  A register destination is
         not decoded and falls through to the break. */
      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)) {
         modrm = getUChar(delta);
         if (!epartIsReg(modrm)) {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            storeLE( mkexpr(addr), getIRegG(sz, pfx, modrm) );
            DIP("movnti %s,%s\n", dis_buf,
                                  nameIRegG(sz, pfx, modrm));
            delta += alen;
            goto decode_success;
         }
         /* else fall through */
      }
      break;
   14352 
   case 0xC4:
      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
      /* 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
         put it into the specified lane of mmx(G). */
      if (haveNo66noF2noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         /* Use t0 .. t3 to hold the 4 original 16-bit lanes of the
            mmx reg.  t4 is the new lane value.  t5 is the original
            mmx value. t6 is the new mmx value. */
         Int lane;
         t4 = newTemp(Ity_I16);
         t5 = newTemp(Ity_I64);
         t6 = newTemp(Ity_I64);
         modrm = getUChar(delta);
         do_MMX_preamble();

         assign(t5, getMMXReg(gregLO3ofRM(modrm)));
         breakup64to16s( t5, &t3, &t2, &t1, &t0 );

         if (epartIsReg(modrm)) {
            assign(t4, getIReg16(eregOfRexRM(pfx,modrm)));
            /* Consume modrm byte + imm8; the lane selector is the
               byte just consumed (delta-1). */
            delta += 1+1;
            lane = getUChar(delta-1);
            DIP("pinsrw $%d,%s,%s\n", lane,
                                      nameIReg16(eregOfRexRM(pfx,modrm)),
                                      nameMMXReg(gregLO3ofRM(modrm)));
         } else {
            /* Trailing '1' tells disAMode one imm byte follows the
               amode. */
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += 1+alen;
            lane = getUChar(delta-1);
            assign(t4, loadLE(Ity_I16, mkexpr(addr)));
            DIP("pinsrw $%d,%s,%s\n", lane,
                                      dis_buf,
                                      nameMMXReg(gregLO3ofRM(modrm)));
         }

         /* Rebuild the 64-bit value with t4 replacing the selected
            lane; only 4 lanes, so only the imm's low 2 bits count. */
         switch (lane & 3) {
            case 0:  assign(t6, mk64from16s(t3,t2,t1,t4)); break;
            case 1:  assign(t6, mk64from16s(t3,t2,t4,t0)); break;
            case 2:  assign(t6, mk64from16s(t3,t4,t1,t0)); break;
            case 3:  assign(t6, mk64from16s(t4,t2,t1,t0)); break;
            default: vassert(0);
         }
         putMMXReg(gregLO3ofRM(modrm), mkexpr(t6));
         goto decode_success;
      }
      /* 66 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
         put it into the specified lane of xmm(G). */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         Int lane;
         t4 = newTemp(Ity_I16);
         modrm = getUChar(delta);
         UInt rG = gregOfRexRM(pfx,modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign(t4, getIReg16(rE));
            delta += 1+1;
            lane = getUChar(delta-1);
            DIP("pinsrw $%d,%s,%s\n",
                lane, nameIReg16(rE), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
                              1/*byte after the amode*/ );
            delta += 1+alen;
            lane = getUChar(delta-1);
            assign(t4, loadLE(Ity_I16, mkexpr(addr)));
            DIP("pinsrw $%d,%s,%s\n",
                lane, dis_buf, nameXMMReg(rG));
         }
         /* 8 lanes in an xmm reg, so 3 bits of the imm are used. */
         IRTemp src_vec = newTemp(Ity_V128);
         assign(src_vec, getXMMReg(rG));
         IRTemp res_vec = math_PINSRW_128( src_vec, t4, lane & 7);
         putXMMReg(rG, mkexpr(res_vec));
         goto decode_success;
      }
      break;
   14430 
   case 0xC5:
      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
      /* 0F C5 = PEXTRW -- extract 16-bit field from mmx(E) and put
         zero-extend of it in ireg(G). */
      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            IRTemp sV = newTemp(Ity_I64);
            t5 = newTemp(Ity_I16);
            do_MMX_preamble();
            assign(sV, getMMXReg(eregLO3ofRM(modrm)));
            breakup64to16s( sV, &t3, &t2, &t1, &t0 );
            /* imm8 at delta+1 selects the lane; only 4 lanes, so just
               its low 2 bits matter. */
            switch (getUChar(delta+1) & 3) {
               case 0:  assign(t5, mkexpr(t0)); break;
               case 1:  assign(t5, mkexpr(t1)); break;
               case 2:  assign(t5, mkexpr(t2)); break;
               case 3:  assign(t5, mkexpr(t3)); break;
               default: vassert(0);
            }
            /* Zero-extend into the 64- or 32-bit dest ireg, depending
               on the operand size. */
            if (sz == 8)
               putIReg64(gregOfRexRM(pfx,modrm), unop(Iop_16Uto64, mkexpr(t5)));
            else
               putIReg32(gregOfRexRM(pfx,modrm), unop(Iop_16Uto32, mkexpr(t5)));
            DIP("pextrw $%d,%s,%s\n",
                (Int)getUChar(delta+1),
                nameMMXReg(eregLO3ofRM(modrm)),
                sz==8 ? nameIReg64(gregOfRexRM(pfx,modrm))
                      : nameIReg32(gregOfRexRM(pfx,modrm))
            );
            /* Consume modrm byte + imm8. */
            delta += 2;
            goto decode_success;
         }
         /* else fall through */
         /* note, for anyone filling in the mem case: this insn has one
            byte after the amode and therefore you must pass 1 as the
            last arg to disAMode */
      }
      /* 66 0F C5 = PEXTRW -- extract 16-bit field from xmm(E) and put
         zero-extend of it in ireg(G). */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         Long delta0 = delta;
         /* Helper leaves delta unchanged on decode failure. */
         delta = dis_PEXTRW_128_EregOnly_toG( vbi, pfx, delta,
                                              False/*!isAvx*/ );
         if (delta > delta0) goto decode_success;
         /* else fall through -- decoding has failed */
      }
      break;
   14479 
   case 0xC6:
      /* 0F C6 /r ib = SHUFPS -- shuffle packed F32s */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         Int    imm8 = 0;
         IRTemp sV   = newTemp(Ity_V128);
         IRTemp dV   = newTemp(Ity_V128);
         modrm = getUChar(delta);
         UInt rG = gregOfRexRM(pfx,modrm);
         assign( dV, getXMMReg(rG) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( sV, getXMMReg(rE) );
            /* imm8 selector is the byte after the modrm byte. */
            imm8 = (Int)getUChar(delta+1);
            delta += 1+1;
            DIP("shufps $%d,%s,%s\n", imm8, nameXMMReg(rE), nameXMMReg(rG));
         } else {
            /* Trailing '1' reserves the imm8 after the amode. */
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            imm8 = (Int)getUChar(delta+alen);
            delta += 1+alen;
            DIP("shufps $%d,%s,%s\n", imm8, dis_buf, nameXMMReg(rG));
         }
         /* All the lane-selection work happens in the helper. */
         IRTemp res = math_SHUFPS_128( sV, dV, imm8 );
         putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
         goto decode_success;
      }
      /* 66 0F C6 /r ib = SHUFPD -- shuffle packed F64s */
      if (have66noF2noF3(pfx) && sz == 2) {
         Int    select;
         IRTemp sV = newTemp(Ity_V128);
         IRTemp dV = newTemp(Ity_V128);

         modrm = getUChar(delta);
         assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

         if (epartIsReg(modrm)) {
            assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
            select = (Int)getUChar(delta+1);
            delta += 1+1;
            DIP("shufpd $%d,%s,%s\n", select,
                                      nameXMMReg(eregOfRexRM(pfx,modrm)),
                                      nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            select = getUChar(delta+alen);
            delta += 1+alen;
            DIP("shufpd $%d,%s,%s\n", select,
                                      dis_buf,
                                      nameXMMReg(gregOfRexRM(pfx,modrm)));
         }

         IRTemp res = math_SHUFPD_128( sV, dV, select );
         putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
         goto decode_success;
      }
      break;
   14537 
   14538    case 0xD1:
   14539       /* 66 0F D1 = PSRLW by E */
   14540       if (have66noF2noF3(pfx) && sz == 2) {
   14541          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrlw", Iop_ShrN16x8 );
   14542          goto decode_success;
   14543       }
   14544       break;
   14545 
   14546    case 0xD2:
   14547       /* 66 0F D2 = PSRLD by E */
   14548       if (have66noF2noF3(pfx) && sz == 2) {
   14549          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrld", Iop_ShrN32x4 );
   14550          goto decode_success;
   14551       }
   14552       break;
   14553 
   14554    case 0xD3:
   14555       /* 66 0F D3 = PSRLQ by E */
   14556       if (have66noF2noF3(pfx) && sz == 2) {
   14557          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrlq", Iop_ShrN64x2 );
   14558          goto decode_success;
   14559       }
   14560       break;
   14561 
   14562    case 0xD4:
   14563       /* 66 0F D4 = PADDQ */
   14564       if (have66noF2noF3(pfx) && sz == 2) {
   14565          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14566                                     "paddq", Iop_Add64x2, False );
   14567          goto decode_success;
   14568       }
   14569       /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   14570       /* 0F D4 = PADDQ -- add 64x1 */
   14571       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14572          do_MMX_preamble();
   14573          delta = dis_MMXop_regmem_to_reg (
   14574                    vbi, pfx, delta, opc, "paddq", False );
   14575          goto decode_success;
   14576       }
   14577       break;
   14578 
   14579    case 0xD5:
   14580       /* 66 0F D5 = PMULLW -- 16x8 multiply */
   14581       if (have66noF2noF3(pfx) && sz == 2) {
   14582          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14583                                     "pmullw", Iop_Mul16x8, False );
   14584          goto decode_success;
   14585       }
   14586       break;
   14587 
   case 0xD6:
      /* F3 0F D6 = MOVQ2DQ -- move from E (mmx) to G (lo half xmm, zero
         hi half).  Register E only; there is no memory form. */
      if (haveF3no66noF2(pfx) && sz == 4) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            do_MMX_preamble();
            /* Iop_64UtoV128 zeroes the upper 64 bits of the dest. */
            putXMMReg( gregOfRexRM(pfx,modrm),
                       unop(Iop_64UtoV128, getMMXReg( eregLO3ofRM(modrm) )) );
            DIP("movq2dq %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += 1;
            goto decode_success;
         }
         /* apparently no mem case for this insn */
      }
      /* 66 0F D6 = MOVQ -- move 64 bits from G (lo half xmm) to E (mem
         or lo half xmm).  Only the memory-destination form is decoded
         here. */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            /* fall through, awaiting test case */
            /* dst: lo half copied, hi half zeroed */
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            storeLE( mkexpr(addr),
                     getXMMRegLane64( gregOfRexRM(pfx,modrm), 0 ));
            DIP("movq %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf );
            delta += alen;
            goto decode_success;
         }
      }
      /* F2 0F D6 = MOVDQ2Q -- move from E (lo half xmm, not mem) to G (mmx). */
      if (haveF2no66noF3(pfx) && sz == 4) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            do_MMX_preamble();
            putMMXReg( gregLO3ofRM(modrm),
                       getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
            DIP("movdq2q %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                   nameMMXReg(gregLO3ofRM(modrm)));
            delta += 1;
            goto decode_success;
         }
         /* apparently no mem case for this insn */
      }
      break;
   14636 
   case 0xD7:
      /* 66 0F D7 = PMOVMSKB -- extract sign bits from each of 16
         lanes in xmm(E), turn them into a byte, and put
         zero-extend of it in ireg(G).  Doing this directly is just
         too cumbersome; give up therefore and call a helper. */
      /* Register E only, for both variants in this case. */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
          && epartIsReg(getUChar(delta))) { /* no memory case, it seems */
         delta = dis_PMOVMSKB_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
      /* 0F D7 = PMOVMSKB -- extract sign bits from each of 8 lanes in
         mmx(E), turn them into a byte, and put zero-extend of it in
         ireg(G). */
      if (haveNo66noF2noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            do_MMX_preamble();
            t0 = newTemp(Ity_I64);
            t1 = newTemp(Ity_I32);
            assign(t0, getMMXReg(eregLO3ofRM(modrm)));
            /* Iop_GetMSBs8x8 gathers the 8 lane MSBs into one byte. */
            assign(t1, unop(Iop_8Uto32, unop(Iop_GetMSBs8x8, mkexpr(t0))));
            putIReg32(gregOfRexRM(pfx,modrm), mkexpr(t1));
            DIP("pmovmskb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                    nameIReg32(gregOfRexRM(pfx,modrm)));
            delta += 1;
            goto decode_success;
         }
         /* else fall through */
      }
      break;
   14670 
   14671    case 0xD8:
   14672       /* 66 0F D8 = PSUBUSB */
   14673       if (have66noF2noF3(pfx) && sz == 2) {
   14674          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14675                                     "psubusb", Iop_QSub8Ux16, False );
   14676          goto decode_success;
   14677       }
   14678       break;
   14679 
   14680    case 0xD9:
   14681       /* 66 0F D9 = PSUBUSW */
   14682       if (have66noF2noF3(pfx) && sz == 2) {
   14683          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14684                                     "psubusw", Iop_QSub16Ux8, False );
   14685          goto decode_success;
   14686       }
   14687       break;
   14688 
   14689    case 0xDA:
   14690       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14691       /* 0F DA = PMINUB -- 8x8 unsigned min */
   14692       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14693          do_MMX_preamble();
   14694          delta = dis_MMXop_regmem_to_reg (
   14695                     vbi, pfx, delta, opc, "pminub", False );
   14696          goto decode_success;
   14697       }
   14698       /* 66 0F DA = PMINUB -- 8x16 unsigned min */
   14699       if (have66noF2noF3(pfx) && sz == 2) {
   14700          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14701                                     "pminub", Iop_Min8Ux16, False );
   14702          goto decode_success;
   14703       }
   14704       break;
   14705 
   14706    case 0xDB:
   14707       /* 66 0F DB = PAND */
   14708       if (have66noF2noF3(pfx) && sz == 2) {
   14709          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "pand", Iop_AndV128 );
   14710          goto decode_success;
   14711       }
   14712       break;
   14713 
   14714    case 0xDC:
   14715       /* 66 0F DC = PADDUSB */
   14716       if (have66noF2noF3(pfx) && sz == 2) {
   14717          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14718                                     "paddusb", Iop_QAdd8Ux16, False );
   14719          goto decode_success;
   14720       }
   14721       break;
   14722 
   14723    case 0xDD:
   14724       /* 66 0F DD = PADDUSW */
   14725       if (have66noF2noF3(pfx) && sz == 2) {
   14726          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14727                                     "paddusw", Iop_QAdd16Ux8, False );
   14728          goto decode_success;
   14729       }
   14730       break;
   14731 
   14732    case 0xDE:
   14733       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14734       /* 0F DE = PMAXUB -- 8x8 unsigned max */
   14735       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14736          do_MMX_preamble();
   14737          delta = dis_MMXop_regmem_to_reg (
   14738                     vbi, pfx, delta, opc, "pmaxub", False );
   14739          goto decode_success;
   14740       }
   14741       /* 66 0F DE = PMAXUB -- 8x16 unsigned max */
   14742       if (have66noF2noF3(pfx) && sz == 2) {
   14743          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14744                                     "pmaxub", Iop_Max8Ux16, False );
   14745          goto decode_success;
   14746       }
   14747       break;
   14748 
   14749    case 0xDF:
   14750       /* 66 0F DF = PANDN */
   14751       if (have66noF2noF3(pfx) && sz == 2) {
   14752          delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "pandn", Iop_AndV128 );
   14753          goto decode_success;
   14754       }
   14755       break;
   14756 
   14757    case 0xE0:
   14758       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14759       /* 0F E0 = PAVGB -- 8x8 unsigned Packed Average, with rounding */
   14760       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14761          do_MMX_preamble();
   14762          delta = dis_MMXop_regmem_to_reg (
   14763                     vbi, pfx, delta, opc, "pavgb", False );
   14764          goto decode_success;
   14765       }
   14766       /* 66 0F E0 = PAVGB */
   14767       if (have66noF2noF3(pfx) && sz == 2) {
   14768          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14769                                     "pavgb", Iop_Avg8Ux16, False );
   14770          goto decode_success;
   14771       }
   14772       break;
   14773 
   14774    case 0xE1:
   14775       /* 66 0F E1 = PSRAW by E */
   14776       if (have66noF2noF3(pfx) && sz == 2) {
   14777          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psraw", Iop_SarN16x8 );
   14778          goto decode_success;
   14779       }
   14780       break;
   14781 
   14782    case 0xE2:
   14783       /* 66 0F E2 = PSRAD by E */
   14784       if (have66noF2noF3(pfx) && sz == 2) {
   14785          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrad", Iop_SarN32x4 );
   14786          goto decode_success;
   14787       }
   14788       break;
   14789 
   14790    case 0xE3:
   14791       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14792       /* 0F E3 = PAVGW -- 16x4 unsigned Packed Average, with rounding */
   14793       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14794          do_MMX_preamble();
   14795          delta = dis_MMXop_regmem_to_reg (
   14796                     vbi, pfx, delta, opc, "pavgw", False );
   14797          goto decode_success;
   14798       }
   14799       /* 66 0F E3 = PAVGW */
   14800       if (have66noF2noF3(pfx) && sz == 2) {
   14801          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14802                                     "pavgw", Iop_Avg16Ux8, False );
   14803          goto decode_success;
   14804       }
   14805       break;
   14806 
   14807    case 0xE4:
   14808       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14809       /* 0F E4 = PMULUH -- 16x4 hi-half of unsigned widening multiply */
   14810       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14811          do_MMX_preamble();
   14812          delta = dis_MMXop_regmem_to_reg (
   14813                     vbi, pfx, delta, opc, "pmuluh", False );
   14814          goto decode_success;
   14815       }
   14816       /* 66 0F E4 = PMULHUW -- 16x8 hi-half of unsigned widening multiply */
   14817       if (have66noF2noF3(pfx) && sz == 2) {
   14818          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14819                                     "pmulhuw", Iop_MulHi16Ux8, False );
   14820          goto decode_success;
   14821       }
   14822       break;
   14823 
   14824    case 0xE5:
   14825       /* 66 0F E5 = PMULHW -- 16x8 hi-half of signed widening multiply */
   14826       if (have66noF2noF3(pfx) && sz == 2) {
   14827          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14828                                     "pmulhw", Iop_MulHi16Sx8, False );
   14829          goto decode_success;
   14830       }
   14831       break;
   14832 
   case 0xE6:
      /* 66 0F E6 = CVTTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
         lo half xmm(G), and zero upper half, rounding towards zero */
      /* F2 0F E6 = CVTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
         lo half xmm(G), according to prevailing rounding mode, and zero
         upper half */
      if ( (haveF2no66noF3(pfx) && sz == 4)
           || (have66noF2noF3(pfx) && sz == 2) ) {
         /* sz == 2 means the 66 prefix was present, i.e. the
            truncating (round-to-zero) CVTTPD2DQ variant. */
         delta = dis_CVTxPD2DQ_128( vbi, pfx, delta, False/*!isAvx*/,
                                    toBool(sz == 2)/*r2zero*/);
         goto decode_success;
      }
      /* F3 0F E6 = CVTDQ2PD -- convert 2 x I32 in mem/lo half xmm to 2 x
         F64 in xmm(G) */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_CVTDQ2PD_128(vbi, pfx, delta, False/*!isAvx*/);
         goto decode_success;
      }
      break;
   14852 
   case 0xE7:
      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
      /* 0F E7 = MOVNTQ -- for us, just a plain MMX store.  Note, the
         Intel manual does not say anything about the usual business of
         the FP reg tags getting trashed whenever an MMX insn happens.
         So we just leave them alone.
      */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         modrm = getUChar(delta);
         /* Memory destination only; a register E is not decoded. */
         if (!epartIsReg(modrm)) {
            /* do_MMX_preamble(); Intel docs don't specify this */
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            storeLE( mkexpr(addr), getMMXReg(gregLO3ofRM(modrm)) );
            DIP("movntq %s,%s\n", dis_buf,
                                  nameMMXReg(gregLO3ofRM(modrm)));
            delta += alen;
            goto decode_success;
         }
         /* else fall through */
      }
      /* 66 0F E7 = MOVNTDQ -- for us, just a plain SSE store. */
      if (have66noF2noF3(pfx) && sz == 2) {
         modrm = getUChar(delta);
         if (!epartIsReg(modrm)) {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            /* MOVNTDQ requires a 16-aligned destination. */
            gen_SEGV_if_not_16_aligned( addr );
            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movntdq %s,%s\n", dis_buf,
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += alen;
            goto decode_success;
         }
         /* else fall through */
      }
      break;
   14888 
   14889    case 0xE8:
   14890       /* 66 0F E8 = PSUBSB */
   14891       if (have66noF2noF3(pfx) && sz == 2) {
   14892          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14893                                     "psubsb", Iop_QSub8Sx16, False );
   14894          goto decode_success;
   14895       }
   14896       break;
   14897 
   14898    case 0xE9:
   14899       /* 66 0F E9 = PSUBSW */
   14900       if (have66noF2noF3(pfx) && sz == 2) {
   14901          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14902                                     "psubsw", Iop_QSub16Sx8, False );
   14903          goto decode_success;
   14904       }
   14905       break;
   14906 
   14907    case 0xEA:
   14908       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14909       /* 0F EA = PMINSW -- 16x4 signed min */
   14910       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14911          do_MMX_preamble();
   14912          delta = dis_MMXop_regmem_to_reg (
   14913                     vbi, pfx, delta, opc, "pminsw", False );
   14914          goto decode_success;
   14915       }
   14916       /* 66 0F EA = PMINSW -- 16x8 signed min */
   14917       if (have66noF2noF3(pfx) && sz == 2) {
   14918          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14919                                     "pminsw", Iop_Min16Sx8, False );
   14920          goto decode_success;
   14921       }
   14922       break;
   14923 
   14924    case 0xEB:
   14925       /* 66 0F EB = POR */
   14926       if (have66noF2noF3(pfx) && sz == 2) {
   14927          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "por", Iop_OrV128 );
   14928          goto decode_success;
   14929       }
   14930       break;
   14931 
   14932    case 0xEC:
   14933       /* 66 0F EC = PADDSB */
   14934       if (have66noF2noF3(pfx) && sz == 2) {
   14935          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14936                                     "paddsb", Iop_QAdd8Sx16, False );
   14937          goto decode_success;
   14938       }
   14939       break;
   14940 
   14941    case 0xED:
   14942       /* 66 0F ED = PADDSW */
   14943       if (have66noF2noF3(pfx) && sz == 2) {
   14944          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14945                                     "paddsw", Iop_QAdd16Sx8, False );
   14946          goto decode_success;
   14947       }
   14948       break;
   14949 
   14950    case 0xEE:
   14951       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14952       /* 0F EE = PMAXSW -- 16x4 signed max */
   14953       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14954          do_MMX_preamble();
   14955          delta = dis_MMXop_regmem_to_reg (
   14956                     vbi, pfx, delta, opc, "pmaxsw", False );
   14957          goto decode_success;
   14958       }
   14959       /* 66 0F EE = PMAXSW -- 16x8 signed max */
   14960       if (have66noF2noF3(pfx) && sz == 2) {
   14961          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14962                                     "pmaxsw", Iop_Max16Sx8, False );
   14963          goto decode_success;
   14964       }
   14965       break;
   14966 
   14967    case 0xEF:
   14968       /* 66 0F EF = PXOR */
   14969       if (have66noF2noF3(pfx) && sz == 2) {
   14970          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "pxor", Iop_XorV128 );
   14971          goto decode_success;
   14972       }
   14973       break;
   14974 
   14975    case 0xF1:
   14976       /* 66 0F F1 = PSLLW by E */
   14977       if (have66noF2noF3(pfx) && sz == 2) {
   14978          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psllw", Iop_ShlN16x8 );
   14979          goto decode_success;
   14980       }
   14981       break;
   14982 
   14983    case 0xF2:
   14984       /* 66 0F F2 = PSLLD by E */
   14985       if (have66noF2noF3(pfx) && sz == 2) {
   14986          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "pslld", Iop_ShlN32x4 );
   14987          goto decode_success;
   14988       }
   14989       break;
   14990 
   14991    case 0xF3:
   14992       /* 66 0F F3 = PSLLQ by E */
   14993       if (have66noF2noF3(pfx) && sz == 2) {
   14994          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psllq", Iop_ShlN64x2 );
   14995          goto decode_success;
   14996       }
   14997       break;
   14998 
   14999    case 0xF4:
   15000       /* 66 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
   15001          0 to form lower 64-bit half and lanes 2 x 2 to form upper 64-bit
   15002          half */
   15003       if (have66noF2noF3(pfx) && sz == 2) {
   15004          IRTemp sV = newTemp(Ity_V128);
   15005          IRTemp dV = newTemp(Ity_V128);
   15006          modrm = getUChar(delta);
   15007          UInt rG = gregOfRexRM(pfx,modrm);
   15008          assign( dV, getXMMReg(rG) );
   15009          if (epartIsReg(modrm)) {
   15010             UInt rE = eregOfRexRM(pfx,modrm);
   15011             assign( sV, getXMMReg(rE) );
   15012             delta += 1;
   15013             DIP("pmuludq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   15014          } else {
   15015             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15016             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   15017             delta += alen;
   15018             DIP("pmuludq %s,%s\n", dis_buf, nameXMMReg(rG));
   15019          }
   15020          putXMMReg( rG, mkexpr(math_PMULUDQ_128( sV, dV )) );
   15021          goto decode_success;
   15022       }
   15023       /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   15024       /* 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
   15025          0 to form 64-bit result */
   15026       if (haveNo66noF2noF3(pfx) && sz == 4) {
   15027          IRTemp sV = newTemp(Ity_I64);
   15028          IRTemp dV = newTemp(Ity_I64);
   15029          t1 = newTemp(Ity_I32);
   15030          t0 = newTemp(Ity_I32);
   15031          modrm = getUChar(delta);
   15032 
   15033          do_MMX_preamble();
   15034          assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   15035 
   15036          if (epartIsReg(modrm)) {
   15037             assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   15038             delta += 1;
   15039             DIP("pmuludq %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
   15040                                    nameMMXReg(gregLO3ofRM(modrm)));
   15041          } else {
   15042             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15043             assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   15044             delta += alen;
   15045             DIP("pmuludq %s,%s\n", dis_buf,
   15046                                    nameMMXReg(gregLO3ofRM(modrm)));
   15047          }
   15048 
   15049          assign( t0, unop(Iop_64to32, mkexpr(dV)) );
   15050          assign( t1, unop(Iop_64to32, mkexpr(sV)) );
   15051          putMMXReg( gregLO3ofRM(modrm),
   15052                     binop( Iop_MullU32, mkexpr(t0), mkexpr(t1) ) );
   15053          goto decode_success;
   15054       }
   15055       break;
   15056 
   15057    case 0xF5:
   15058       /* 66 0F F5 = PMADDWD -- Multiply and add packed integers from
   15059          E(xmm or mem) to G(xmm) */
   15060       if (have66noF2noF3(pfx) && sz == 2) {
   15061          IRTemp sV = newTemp(Ity_V128);
   15062          IRTemp dV = newTemp(Ity_V128);
   15063          modrm     = getUChar(delta);
   15064          UInt   rG = gregOfRexRM(pfx,modrm);
   15065          if (epartIsReg(modrm)) {
   15066             UInt rE = eregOfRexRM(pfx,modrm);
   15067             assign( sV, getXMMReg(rE) );
   15068             delta += 1;
   15069             DIP("pmaddwd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   15070          } else {
   15071             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15072             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   15073             delta += alen;
   15074             DIP("pmaddwd %s,%s\n", dis_buf, nameXMMReg(rG));
   15075          }
   15076          assign( dV, getXMMReg(rG) );
   15077          putXMMReg( rG, mkexpr(math_PMADDWD_128(dV, sV)) );
   15078          goto decode_success;
   15079       }
   15080       break;
   15081 
   15082    case 0xF6:
   15083       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   15084       /* 0F F6 = PSADBW -- sum of 8Ux8 absolute differences */
   15085       if (haveNo66noF2noF3(pfx) && sz == 4) {
   15086          do_MMX_preamble();
   15087          delta = dis_MMXop_regmem_to_reg (
   15088                     vbi, pfx, delta, opc, "psadbw", False );
   15089          goto decode_success;
   15090       }
   15091       /* 66 0F F6 = PSADBW -- 2 x (8x8 -> 48 zeroes ++ u16) Sum Abs Diffs
   15092          from E(xmm or mem) to G(xmm) */
   15093       if (have66noF2noF3(pfx) && sz == 2) {
   15094          IRTemp sV  = newTemp(Ity_V128);
   15095          IRTemp dV  = newTemp(Ity_V128);
   15096          modrm = getUChar(delta);
   15097          UInt   rG   = gregOfRexRM(pfx,modrm);
   15098          if (epartIsReg(modrm)) {
   15099             UInt rE = eregOfRexRM(pfx,modrm);
   15100             assign( sV, getXMMReg(rE) );
   15101             delta += 1;
   15102             DIP("psadbw %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   15103          } else {
   15104             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15105             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   15106             delta += alen;
   15107             DIP("psadbw %s,%s\n", dis_buf, nameXMMReg(rG));
   15108          }
   15109          assign( dV, getXMMReg(rG) );
   15110          putXMMReg( rG, mkexpr( math_PSADBW_128 ( dV, sV ) ) );
   15111 
   15112          goto decode_success;
   15113       }
   15114       break;
   15115 
   15116    case 0xF7:
   15117       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   15118       /* 0F F7 = MASKMOVQ -- 8x8 masked store */
   15119       if (haveNo66noF2noF3(pfx) && sz == 4) {
   15120          Bool ok = False;
   15121          delta = dis_MMX( &ok, vbi, pfx, sz, delta-1 );
   15122          if (ok) goto decode_success;
   15123       }
   15124       /* 66 0F F7 = MASKMOVDQU -- store selected bytes of double quadword */
   15125       if (have66noF2noF3(pfx) && sz == 2 && epartIsReg(getUChar(delta))) {
   15126          delta = dis_MASKMOVDQU( vbi, pfx, delta, False/*!isAvx*/ );
   15127          goto decode_success;
   15128       }
   15129       break;
   15130 
   15131    case 0xF8:
   15132       /* 66 0F F8 = PSUBB */
   15133       if (have66noF2noF3(pfx) && sz == 2) {
   15134          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   15135                                     "psubb", Iop_Sub8x16, False );
   15136          goto decode_success;
   15137       }
   15138       break;
   15139 
   15140    case 0xF9:
   15141       /* 66 0F F9 = PSUBW */
   15142       if (have66noF2noF3(pfx) && sz == 2) {
   15143          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   15144                                     "psubw", Iop_Sub16x8, False );
   15145          goto decode_success;
   15146       }
   15147       break;
   15148 
   15149    case 0xFA:
   15150       /* 66 0F FA = PSUBD */
   15151       if (have66noF2noF3(pfx) && sz == 2) {
   15152          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   15153                                     "psubd", Iop_Sub32x4, False );
   15154          goto decode_success;
   15155       }
   15156       break;
   15157 
   15158    case 0xFB:
   15159       /* 66 0F FB = PSUBQ */
   15160       if (have66noF2noF3(pfx) && sz == 2) {
   15161          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   15162                                     "psubq", Iop_Sub64x2, False );
   15163          goto decode_success;
   15164       }
   15165       /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   15166       /* 0F FB = PSUBQ -- sub 64x1 */
   15167       if (haveNo66noF2noF3(pfx) && sz == 4) {
   15168          do_MMX_preamble();
   15169          delta = dis_MMXop_regmem_to_reg (
   15170                    vbi, pfx, delta, opc, "psubq", False );
   15171          goto decode_success;
   15172       }
   15173       break;
   15174 
   15175    case 0xFC:
   15176       /* 66 0F FC = PADDB */
   15177       if (have66noF2noF3(pfx) && sz == 2) {
   15178          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   15179                                     "paddb", Iop_Add8x16, False );
   15180          goto decode_success;
   15181       }
   15182       break;
   15183 
   15184    case 0xFD:
   15185       /* 66 0F FD = PADDW */
   15186       if (have66noF2noF3(pfx) && sz == 2) {
   15187          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   15188                                     "paddw", Iop_Add16x8, False );
   15189          goto decode_success;
   15190       }
   15191       break;
   15192 
   15193    case 0xFE:
   15194       /* 66 0F FE = PADDD */
   15195       if (have66noF2noF3(pfx) && sz == 2) {
   15196          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   15197                                     "paddd", Iop_Add32x4, False );
   15198          goto decode_success;
   15199       }
   15200       break;
   15201 
   15202    default:
   15203       goto decode_failure;
   15204 
   15205    }
   15206 
   15207   decode_failure:
   15208    *decode_OK = False;
   15209    return deltaIN;
   15210 
   15211   decode_success:
   15212    *decode_OK = True;
   15213    return delta;
   15214 }
   15215 
   15216 
   15217 /*------------------------------------------------------------*/
   15218 /*---                                                      ---*/
   15219 /*--- Top-level SSE3 (not SupSSE3): dis_ESC_0F__SSE3       ---*/
   15220 /*---                                                      ---*/
   15221 /*------------------------------------------------------------*/
   15222 
   15223 static Long dis_MOVDDUP_128 ( const VexAbiInfo* vbi, Prefix pfx,
   15224                               Long delta, Bool isAvx )
   15225 {
   15226    IRTemp addr   = IRTemp_INVALID;
   15227    Int    alen   = 0;
   15228    HChar  dis_buf[50];
   15229    IRTemp sV    = newTemp(Ity_V128);
   15230    IRTemp d0    = newTemp(Ity_I64);
   15231    UChar  modrm = getUChar(delta);
   15232    UInt   rG    = gregOfRexRM(pfx,modrm);
   15233    if (epartIsReg(modrm)) {
   15234       UInt rE = eregOfRexRM(pfx,modrm);
   15235       assign( sV, getXMMReg(rE) );
   15236       DIP("%smovddup %s,%s\n",
   15237           isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
   15238       delta += 1;
   15239       assign ( d0, unop(Iop_V128to64, mkexpr(sV)) );
   15240    } else {
   15241       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15242       assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
   15243       DIP("%smovddup %s,%s\n",
   15244           isAvx ? "v" : "", dis_buf, nameXMMReg(rG));
   15245       delta += alen;
   15246    }
   15247    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   15248       ( rG, binop(Iop_64HLtoV128,mkexpr(d0),mkexpr(d0)) );
   15249    return delta;
   15250 }
   15251 
   15252 
   15253 static Long dis_MOVDDUP_256 ( const VexAbiInfo* vbi, Prefix pfx,
   15254                               Long delta )
   15255 {
   15256    IRTemp addr   = IRTemp_INVALID;
   15257    Int    alen   = 0;
   15258    HChar  dis_buf[50];
   15259    IRTemp d0    = newTemp(Ity_I64);
   15260    IRTemp d1    = newTemp(Ity_I64);
   15261    UChar  modrm = getUChar(delta);
   15262    UInt   rG    = gregOfRexRM(pfx,modrm);
   15263    if (epartIsReg(modrm)) {
   15264       UInt rE = eregOfRexRM(pfx,modrm);
   15265       DIP("vmovddup %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
   15266       delta += 1;
   15267       assign ( d0, getYMMRegLane64(rE, 0) );
   15268       assign ( d1, getYMMRegLane64(rE, 2) );
   15269    } else {
   15270       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15271       assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
   15272       assign( d1, loadLE(Ity_I64, binop(Iop_Add64,
   15273                                         mkexpr(addr), mkU64(16))) );
   15274       DIP("vmovddup %s,%s\n", dis_buf, nameYMMReg(rG));
   15275       delta += alen;
   15276    }
   15277    putYMMRegLane64( rG, 0, mkexpr(d0) );
   15278    putYMMRegLane64( rG, 1, mkexpr(d0) );
   15279    putYMMRegLane64( rG, 2, mkexpr(d1) );
   15280    putYMMRegLane64( rG, 3, mkexpr(d1) );
   15281    return delta;
   15282 }
   15283 
   15284 
   15285 static Long dis_MOVSxDUP_128 ( const VexAbiInfo* vbi, Prefix pfx,
   15286                                Long delta, Bool isAvx, Bool isL )
   15287 {
   15288    IRTemp addr  = IRTemp_INVALID;
   15289    Int    alen  = 0;
   15290    HChar  dis_buf[50];
   15291    IRTemp sV    = newTemp(Ity_V128);
   15292    UChar  modrm = getUChar(delta);
   15293    UInt   rG    = gregOfRexRM(pfx,modrm);
   15294    IRTemp s3, s2, s1, s0;
   15295    s3 = s2 = s1 = s0 = IRTemp_INVALID;
   15296    if (epartIsReg(modrm)) {
   15297       UInt rE = eregOfRexRM(pfx,modrm);
   15298       assign( sV, getXMMReg(rE) );
   15299       DIP("%smovs%cdup %s,%s\n",
   15300           isAvx ? "v" : "", isL ? 'l' : 'h', nameXMMReg(rE), nameXMMReg(rG));
   15301       delta += 1;
   15302    } else {
   15303       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15304       if (!isAvx)
   15305          gen_SEGV_if_not_16_aligned( addr );
   15306       assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   15307       DIP("%smovs%cdup %s,%s\n",
   15308           isAvx ? "v" : "", isL ? 'l' : 'h', dis_buf, nameXMMReg(rG));
   15309       delta += alen;
   15310    }
   15311    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   15312    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   15313       ( rG, isL ? mkV128from32s( s2, s2, s0, s0 )
   15314                 : mkV128from32s( s3, s3, s1, s1 ) );
   15315    return delta;
   15316 }
   15317 
   15318 
   15319 static Long dis_MOVSxDUP_256 ( const VexAbiInfo* vbi, Prefix pfx,
   15320                                Long delta, Bool isL )
   15321 {
   15322    IRTemp addr  = IRTemp_INVALID;
   15323    Int    alen  = 0;
   15324    HChar  dis_buf[50];
   15325    IRTemp sV    = newTemp(Ity_V256);
   15326    UChar  modrm = getUChar(delta);
   15327    UInt   rG    = gregOfRexRM(pfx,modrm);
   15328    IRTemp s7, s6, s5, s4, s3, s2, s1, s0;
   15329    s7 = s6 = s5 = s4 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   15330    if (epartIsReg(modrm)) {
   15331       UInt rE = eregOfRexRM(pfx,modrm);
   15332       assign( sV, getYMMReg(rE) );
   15333       DIP("vmovs%cdup %s,%s\n",
   15334           isL ? 'l' : 'h', nameYMMReg(rE), nameYMMReg(rG));
   15335       delta += 1;
   15336    } else {
   15337       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15338       assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
   15339       DIP("vmovs%cdup %s,%s\n",
   15340           isL ? 'l' : 'h', dis_buf, nameYMMReg(rG));
   15341       delta += alen;
   15342    }
   15343    breakupV256to32s( sV, &s7, &s6, &s5, &s4, &s3, &s2, &s1, &s0 );
   15344    putYMMRegLane128( rG, 1, isL ? mkV128from32s( s6, s6, s4, s4 )
   15345                                 : mkV128from32s( s7, s7, s5, s5 ) );
   15346    putYMMRegLane128( rG, 0, isL ? mkV128from32s( s2, s2, s0, s0 )
   15347                                 : mkV128from32s( s3, s3, s1, s1 ) );
   15348    return delta;
   15349 }
   15350 
   15351 
   15352 static IRTemp math_HADDPS_128 ( IRTemp dV, IRTemp sV, Bool isAdd )
   15353 {
   15354    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   15355    IRTemp leftV  = newTemp(Ity_V128);
   15356    IRTemp rightV = newTemp(Ity_V128);
   15357    IRTemp rm     = newTemp(Ity_I32);
   15358    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   15359 
   15360    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   15361    breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   15362 
   15363    assign( leftV,  mkV128from32s( s2, s0, d2, d0 ) );
   15364    assign( rightV, mkV128from32s( s3, s1, d3, d1 ) );
   15365 
   15366    IRTemp res = newTemp(Ity_V128);
   15367    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   15368    assign( res, triop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4,
   15369                       mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) );
   15370    return res;
   15371 }
   15372 
   15373 
   15374 static IRTemp math_HADDPD_128 ( IRTemp dV, IRTemp sV, Bool isAdd )
   15375 {
   15376    IRTemp s1, s0, d1, d0;
   15377    IRTemp leftV  = newTemp(Ity_V128);
   15378    IRTemp rightV = newTemp(Ity_V128);
   15379    IRTemp rm     = newTemp(Ity_I32);
   15380    s1 = s0 = d1 = d0 = IRTemp_INVALID;
   15381 
   15382    breakupV128to64s( sV, &s1, &s0 );
   15383    breakupV128to64s( dV, &d1, &d0 );
   15384 
   15385    assign( leftV,  binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)) );
   15386    assign( rightV, binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1)) );
   15387 
   15388    IRTemp res = newTemp(Ity_V128);
   15389    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   15390    assign( res, triop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2,
   15391                       mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) );
   15392    return res;
   15393 }
   15394 
   15395 
   15396 __attribute__((noinline))
   15397 static
   15398 Long dis_ESC_0F__SSE3 ( Bool* decode_OK,
   15399                         const VexAbiInfo* vbi,
   15400                         Prefix pfx, Int sz, Long deltaIN )
   15401 {
   15402    IRTemp addr  = IRTemp_INVALID;
   15403    UChar  modrm = 0;
   15404    Int    alen  = 0;
   15405    HChar  dis_buf[50];
   15406 
   15407    *decode_OK = False;
   15408 
   15409    Long   delta = deltaIN;
   15410    UChar  opc   = getUChar(delta);
   15411    delta++;
   15412    switch (opc) {
   15413 
   15414    case 0x12:
   15415       /* F3 0F 12 = MOVSLDUP -- move from E (mem or xmm) to G (xmm),
   15416          duplicating some lanes (2:2:0:0). */
   15417       if (haveF3no66noF2(pfx) && sz == 4) {
   15418          delta = dis_MOVSxDUP_128( vbi, pfx, delta, False/*!isAvx*/,
   15419                                    True/*isL*/ );
   15420          goto decode_success;
   15421       }
   15422       /* F2 0F 12 = MOVDDUP -- move from E (mem or xmm) to G (xmm),
   15423          duplicating some lanes (0:1:0:1). */
   15424       if (haveF2no66noF3(pfx)
   15425           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   15426          delta = dis_MOVDDUP_128( vbi, pfx, delta, False/*!isAvx*/ );
   15427          goto decode_success;
   15428       }
   15429       break;
   15430 
   15431    case 0x16:
   15432       /* F3 0F 16 = MOVSHDUP -- move from E (mem or xmm) to G (xmm),
   15433          duplicating some lanes (3:3:1:1). */
   15434       if (haveF3no66noF2(pfx) && sz == 4) {
   15435          delta = dis_MOVSxDUP_128( vbi, pfx, delta, False/*!isAvx*/,
   15436                                    False/*!isL*/ );
   15437          goto decode_success;
   15438       }
   15439       break;
   15440 
   15441    case 0x7C:
   15442    case 0x7D:
   15443       /* F2 0F 7C = HADDPS -- 32x4 add across from E (mem or xmm) to G (xmm). */
   15444       /* F2 0F 7D = HSUBPS -- 32x4 sub across from E (mem or xmm) to G (xmm). */
   15445       if (haveF2no66noF3(pfx) && sz == 4) {
   15446          IRTemp eV     = newTemp(Ity_V128);
   15447          IRTemp gV     = newTemp(Ity_V128);
   15448          Bool   isAdd  = opc == 0x7C;
   15449          const HChar* str = isAdd ? "add" : "sub";
   15450          modrm         = getUChar(delta);
   15451          UInt   rG     = gregOfRexRM(pfx,modrm);
   15452          if (epartIsReg(modrm)) {
   15453             UInt rE = eregOfRexRM(pfx,modrm);
   15454             assign( eV, getXMMReg(rE) );
   15455             DIP("h%sps %s,%s\n", str, nameXMMReg(rE), nameXMMReg(rG));
   15456             delta += 1;
   15457          } else {
   15458             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15459             assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   15460             DIP("h%sps %s,%s\n", str, dis_buf, nameXMMReg(rG));
   15461             delta += alen;
   15462          }
   15463 
   15464          assign( gV, getXMMReg(rG) );
   15465          putXMMReg( rG, mkexpr( math_HADDPS_128 ( gV, eV, isAdd ) ) );
   15466          goto decode_success;
   15467       }
   15468       /* 66 0F 7C = HADDPD -- 64x2 add across from E (mem or xmm) to G (xmm). */
   15469       /* 66 0F 7D = HSUBPD -- 64x2 sub across from E (mem or xmm) to G (xmm). */
   15470       if (have66noF2noF3(pfx) && sz == 2) {
   15471          IRTemp eV     = newTemp(Ity_V128);
   15472          IRTemp gV     = newTemp(Ity_V128);
   15473          Bool   isAdd  = opc == 0x7C;
   15474          const HChar* str = isAdd ? "add" : "sub";
   15475          modrm         = getUChar(delta);
   15476          UInt   rG     = gregOfRexRM(pfx,modrm);
   15477          if (epartIsReg(modrm)) {
   15478             UInt rE = eregOfRexRM(pfx,modrm);
   15479             assign( eV, getXMMReg(rE) );
   15480             DIP("h%spd %s,%s\n", str, nameXMMReg(rE), nameXMMReg(rG));
   15481             delta += 1;
   15482          } else {
   15483             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15484             assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   15485             DIP("h%spd %s,%s\n", str, dis_buf, nameXMMReg(rG));
   15486             delta += alen;
   15487          }
   15488 
   15489          assign( gV, getXMMReg(rG) );
   15490          putXMMReg( rG, mkexpr( math_HADDPD_128 ( gV, eV, isAdd ) ) );
   15491          goto decode_success;
   15492       }
   15493       break;
   15494 
   15495    case 0xD0:
   15496       /* 66 0F D0 = ADDSUBPD -- 64x4 +/- from E (mem or xmm) to G (xmm). */
   15497       if (have66noF2noF3(pfx) && sz == 2) {
   15498          IRTemp eV   = newTemp(Ity_V128);
   15499          IRTemp gV   = newTemp(Ity_V128);
   15500          modrm       = getUChar(delta);
   15501          UInt   rG   = gregOfRexRM(pfx,modrm);
   15502          if (epartIsReg(modrm)) {
   15503             UInt rE = eregOfRexRM(pfx,modrm);
   15504             assign( eV, getXMMReg(rE) );
   15505             DIP("addsubpd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   15506             delta += 1;
   15507          } else {
   15508             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15509             assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   15510             DIP("addsubpd %s,%s\n", dis_buf, nameXMMReg(rG));
   15511             delta += alen;
   15512          }
   15513 
   15514          assign( gV, getXMMReg(rG) );
   15515          putXMMReg( rG, mkexpr( math_ADDSUBPD_128 ( gV, eV ) ) );
   15516          goto decode_success;
   15517       }
   15518       /* F2 0F D0 = ADDSUBPS -- 32x4 +/-/+/- from E (mem or xmm) to G (xmm). */
   15519       if (haveF2no66noF3(pfx) && sz == 4) {
   15520          IRTemp eV   = newTemp(Ity_V128);
   15521          IRTemp gV   = newTemp(Ity_V128);
   15522          modrm       = getUChar(delta);
   15523          UInt   rG   = gregOfRexRM(pfx,modrm);
   15524 
   15525          modrm = getUChar(delta);
   15526          if (epartIsReg(modrm)) {
   15527             UInt rE = eregOfRexRM(pfx,modrm);
   15528             assign( eV, getXMMReg(rE) );
   15529             DIP("addsubps %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   15530             delta += 1;
   15531          } else {
   15532             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15533             assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   15534             DIP("addsubps %s,%s\n", dis_buf, nameXMMReg(rG));
   15535             delta += alen;
   15536          }
   15537 
   15538          assign( gV, getXMMReg(rG) );
   15539          putXMMReg( rG, mkexpr( math_ADDSUBPS_128 ( gV, eV ) ) );
   15540          goto decode_success;
   15541       }
   15542       break;
   15543 
   15544    case 0xF0:
   15545       /* F2 0F F0 = LDDQU -- move from E (mem or xmm) to G (xmm). */
   15546       if (haveF2no66noF3(pfx) && sz == 4) {
   15547          modrm = getUChar(delta);
   15548          if (epartIsReg(modrm)) {
   15549             goto decode_failure;
   15550          } else {
   15551             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15552             putXMMReg( gregOfRexRM(pfx,modrm),
   15553                        loadLE(Ity_V128, mkexpr(addr)) );
   15554             DIP("lddqu %s,%s\n", dis_buf,
   15555                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   15556             delta += alen;
   15557          }
   15558          goto decode_success;
   15559       }
   15560       break;
   15561 
   15562    default:
   15563       goto decode_failure;
   15564 
   15565    }
   15566 
   15567   decode_failure:
   15568    *decode_OK = False;
   15569    return deltaIN;
   15570 
   15571   decode_success:
   15572    *decode_OK = True;
   15573    return delta;
   15574 }
   15575 
   15576 
   15577 /*------------------------------------------------------------*/
   15578 /*---                                                      ---*/
   15579 /*--- Top-level SSSE3: dis_ESC_0F38__SupSSE3               ---*/
   15580 /*---                                                      ---*/
   15581 /*------------------------------------------------------------*/
   15582 
/* Build IR for PSHUFB on a 128-bit value, using only 64-bit (MMX-ish)
   IR ops: each byte of sV selects a byte of dV.  Per byte of the
   selector: if bit 7 is set the result byte is 0; otherwise the low
   3 bits index a byte within the 64-bit half of dV chosen by bit 3.
   The computation is done separately on the high and low 64-bit
   halves, each via two Perm8x8 ops (one against dHi, one against dLo)
   merged under a bit-3-derived mask.  Returns a temp holding the
   V128 result. */
static
IRTemp math_PSHUFB_XMM ( IRTemp dV/*data to perm*/, IRTemp sV/*perm*/ )
{
   IRTemp sHi        = newTemp(Ity_I64);
   IRTemp sLo        = newTemp(Ity_I64);
   IRTemp dHi        = newTemp(Ity_I64);
   IRTemp dLo        = newTemp(Ity_I64);
   IRTemp rHi        = newTemp(Ity_I64);
   IRTemp rLo        = newTemp(Ity_I64);
   IRTemp sevens     = newTemp(Ity_I64);
   IRTemp mask0x80hi = newTemp(Ity_I64);
   IRTemp mask0x80lo = newTemp(Ity_I64);
   IRTemp maskBit3hi = newTemp(Ity_I64);
   IRTemp maskBit3lo = newTemp(Ity_I64);
   IRTemp sAnd7hi    = newTemp(Ity_I64);
   IRTemp sAnd7lo    = newTemp(Ity_I64);
   IRTemp permdHi    = newTemp(Ity_I64);
   IRTemp permdLo    = newTemp(Ity_I64);
   IRTemp res        = newTemp(Ity_V128);

   /* Split both operands into 64-bit halves. */
   assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

   /* Per-byte 0x07 mask, to extract the within-half byte index. */
   assign( sevens, mkU64(0x0707070707070707ULL) );

   /* mask0x80hi = Not(SarN8x8(sHi,7))
      maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7)
      sAnd7hi    = And(sHi,sevens)
      permdHi    = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi),
      And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) )
      rHi        = And(permdHi,mask0x80hi)
   */
   /* 0x00 where selector bit 7 is set (force result byte to zero),
      0xFF elsewhere. */
   assign(
      mask0x80hi,
      unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7))));

   /* 0xFF where selector bit 3 is set (pick from dHi), else 0x00
      (pick from dLo): shift bit 3 up to bit 7, then sign-replicate. */
   assign(
      maskBit3hi,
      binop(Iop_SarN8x8,
            binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)),
            mkU8(7)));

   assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens)));

   /* Permute both halves of dV by the low-3-bit indices, then choose
      per byte between the two results using maskBit3hi. */
   assign(
      permdHi,
      binop(
         Iop_Or64,
         binop(Iop_And64,
               binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)),
               mkexpr(maskBit3hi)),
         binop(Iop_And64,
               binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)),
               unop(Iop_Not64,mkexpr(maskBit3hi))) ));

   /* Finally zero out the bytes whose selector had bit 7 set. */
   assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) );

   /* And the same for the lower half of the result.  What fun. */

   assign(
      mask0x80lo,
      unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7))));

   assign(
      maskBit3lo,
      binop(Iop_SarN8x8,
            binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)),
            mkU8(7)));

   assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens)));

   assign(
      permdLo,
      binop(
         Iop_Or64,
         binop(Iop_And64,
               binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)),
               mkexpr(maskBit3lo)),
         binop(Iop_And64,
               binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)),
               unop(Iop_Not64,mkexpr(maskBit3lo))) ));

   assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) );

   assign(res, binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo)));
   return res;
}
   15672 
   15673 
   15674 static
   15675 IRTemp math_PSHUFB_YMM ( IRTemp dV/*data to perm*/, IRTemp sV/*perm*/ )
   15676 {
   15677    IRTemp sHi, sLo, dHi, dLo;
   15678    sHi = sLo = dHi = dLo = IRTemp_INVALID;
   15679    breakupV256toV128s( dV, &dHi, &dLo);
   15680    breakupV256toV128s( sV, &sHi, &sLo);
   15681    IRTemp res = newTemp(Ity_V256);
   15682    assign(res, binop(Iop_V128HLtoV256,
   15683                      mkexpr(math_PSHUFB_XMM(dHi, sHi)),
   15684                      mkexpr(math_PSHUFB_XMM(dLo, sLo))));
   15685    return res;
   15686 }
   15687 
   15688 
/* Decode and translate the SSSE3/AVX horizontal add/sub family for
   128-bit operands: PHADDW/PHADDD/PHADDSW/PHSUBW/PHSUBD/PHSUBSW and
   their VEX-encoded (v-prefixed) forms.  |opc| is the low opcode byte
   selecting the operation; |isAvx| selects VEX behaviour (separate
   first source register, no memory-alignment trap, upper YMM lanes
   zeroed).  Returns |delta| advanced past the decoded instruction. */
static Long dis_PHADD_128 ( const VexAbiInfo* vbi, Prefix pfx, Long delta,
                            Bool isAvx, UChar opc )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   const HChar* str = "???";
   IROp   opV64  = Iop_INVALID;
   IROp   opCatO = Iop_CatOddLanes16x4;
   IROp   opCatE = Iop_CatEvenLanes16x4;
   IRTemp sV     = newTemp(Ity_V128);
   IRTemp dV     = newTemp(Ity_V128);
   IRTemp sHi    = newTemp(Ity_I64);
   IRTemp sLo    = newTemp(Ity_I64);
   IRTemp dHi    = newTemp(Ity_I64);
   IRTemp dLo    = newTemp(Ity_I64);
   UChar  modrm  = getUChar(delta);
   UInt   rG     = gregOfRexRM(pfx,modrm);
   UInt   rV     = isAvx ? getVexNvvvv(pfx) : rG;

   /* Map the opcode byte to the 64-bit lanewise operation and the
      mnemonic suffix used for disassembly printing. */
   switch (opc) {
      case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
      case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
      case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
      case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
      case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
      case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
      default: vassert(0);
   }
   /* The 32-bit variants (PHADDD/PHSUBD) pair up lanes with 32x2
      interleaves rather than 16x4 odd/even concatenation. */
   if (opc == 0x02 || opc == 0x06) {
      opCatO = Iop_InterleaveHI32x2;
      opCatE = Iop_InterleaveLO32x2;
   }

   assign( dV, getXMMReg(rV) );

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( sV, getXMMReg(rE) );
      DIP("%sph%s %s,%s\n", isAvx ? "v" : "", str,
          nameXMMReg(rE), nameXMMReg(rG));
      delta += 1;
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      /* Only the legacy-SSE encoding requires 16-byte alignment. */
      if (!isAvx)
         gen_SEGV_if_not_16_aligned( addr );
      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
      DIP("%sph%s %s,%s\n", isAvx ? "v" : "", str,
          dis_buf, nameXMMReg(rG));
      delta += alen;
   }

   assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

   /* This isn't a particularly efficient way to compute the
      result, but at least it avoids a proliferation of IROps,
      hence avoids complicating all the backends. */

   (isAvx ? putYMMRegLoAndZU : putXMMReg)
      ( rG,
        binop(Iop_64HLtoV128,
              binop(opV64,
                    binop(opCatE,mkexpr(sHi),mkexpr(sLo)),
                    binop(opCatO,mkexpr(sHi),mkexpr(sLo)) ),
              binop(opV64,
                    binop(opCatE,mkexpr(dHi),mkexpr(dLo)),
                    binop(opCatO,mkexpr(dHi),mkexpr(dLo)) ) ) );
   return delta;
}
   15761 
   15762 
/* Decode and translate the 256-bit (AVX2) horizontal add/sub family:
   VPHADDW/VPHADDD/VPHADDSW/VPHSUBW/VPHSUBD/VPHSUBSW.  |opc| is the
   low opcode byte selecting the operation.  Note that AVX2 horizontal
   ops operate within each 128-bit lane, which the 64-bit breakup below
   reflects.  Returns |delta| advanced past the decoded instruction. */
static Long dis_PHADD_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta,
                            UChar opc )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   const HChar* str = "???";
   IROp   opV64  = Iop_INVALID;
   IROp   opCatO = Iop_CatOddLanes16x4;
   IROp   opCatE = Iop_CatEvenLanes16x4;
   IRTemp sV     = newTemp(Ity_V256);
   IRTemp dV     = newTemp(Ity_V256);
   IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   UChar  modrm  = getUChar(delta);
   UInt   rG     = gregOfRexRM(pfx,modrm);
   UInt   rV     = getVexNvvvv(pfx);

   /* Map the opcode byte to the 64-bit lanewise operation and the
      mnemonic suffix used for disassembly printing. */
   switch (opc) {
      case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
      case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
      case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
      case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
      case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
      case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
      default: vassert(0);
   }
   /* The 32-bit variants pair up lanes with 32x2 interleaves rather
      than 16x4 odd/even concatenation. */
   if (opc == 0x02 || opc == 0x06) {
      opCatO = Iop_InterleaveHI32x2;
      opCatE = Iop_InterleaveLO32x2;
   }

   assign( dV, getYMMReg(rV) );

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( sV, getYMMReg(rE) );
      DIP("vph%s %s,%s\n", str, nameYMMReg(rE), nameYMMReg(rG));
      delta += 1;
   } else {
      /* VEX-encoded loads have no alignment requirement. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
      DIP("vph%s %s,%s\n", str, dis_buf, nameYMMReg(rG));
      delta += alen;
   }

   breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
   breakupV256to64s( sV, &s3, &s2, &s1, &s0 );

   /* This isn't a particularly efficient way to compute the
      result, but at least it avoids a proliferation of IROps,
      hence avoids complicating all the backends. */

   putYMMReg( rG,
              binop(Iop_V128HLtoV256,
                    binop(Iop_64HLtoV128,
                          binop(opV64,
                                binop(opCatE,mkexpr(s3),mkexpr(s2)),
                                binop(opCatO,mkexpr(s3),mkexpr(s2)) ),
                          binop(opV64,
                                binop(opCatE,mkexpr(d3),mkexpr(d2)),
                                binop(opCatO,mkexpr(d3),mkexpr(d2)) ) ),
                    binop(Iop_64HLtoV128,
                          binop(opV64,
                                binop(opCatE,mkexpr(s1),mkexpr(s0)),
                                binop(opCatO,mkexpr(s1),mkexpr(s0)) ),
                          binop(opV64,
                                binop(opCatE,mkexpr(d1),mkexpr(d0)),
                                binop(opCatO,mkexpr(d1),mkexpr(d0)) ) ) ) );
   return delta;
}
   15834 
   15835 
   15836 static IRTemp math_PMADDUBSW_128 ( IRTemp dV, IRTemp sV )
   15837 {
   15838    IRTemp sVoddsSX  = newTemp(Ity_V128);
   15839    IRTemp sVevensSX = newTemp(Ity_V128);
   15840    IRTemp dVoddsZX  = newTemp(Ity_V128);
   15841    IRTemp dVevensZX = newTemp(Ity_V128);
   15842    /* compute dV unsigned x sV signed */
   15843    assign( sVoddsSX, binop(Iop_SarN16x8, mkexpr(sV), mkU8(8)) );
   15844    assign( sVevensSX, binop(Iop_SarN16x8,
   15845                             binop(Iop_ShlN16x8, mkexpr(sV), mkU8(8)),
   15846                             mkU8(8)) );
   15847    assign( dVoddsZX, binop(Iop_ShrN16x8, mkexpr(dV), mkU8(8)) );
   15848    assign( dVevensZX, binop(Iop_ShrN16x8,
   15849                             binop(Iop_ShlN16x8, mkexpr(dV), mkU8(8)),
   15850                             mkU8(8)) );
   15851 
   15852    IRTemp res = newTemp(Ity_V128);
   15853    assign( res, binop(Iop_QAdd16Sx8,
   15854                       binop(Iop_Mul16x8, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
   15855                       binop(Iop_Mul16x8, mkexpr(sVevensSX), mkexpr(dVevensZX))
   15856                      )
   15857          );
   15858    return res;
   15859 }
   15860 
   15861 
   15862 static
   15863 IRTemp math_PMADDUBSW_256 ( IRTemp dV, IRTemp sV )
   15864 {
   15865    IRTemp sHi, sLo, dHi, dLo;
   15866    sHi = sLo = dHi = dLo = IRTemp_INVALID;
   15867    breakupV256toV128s( dV, &dHi, &dLo);
   15868    breakupV256toV128s( sV, &sHi, &sLo);
   15869    IRTemp res = newTemp(Ity_V256);
   15870    assign(res, binop(Iop_V128HLtoV256,
   15871                      mkexpr(math_PMADDUBSW_128(dHi, sHi)),
   15872                      mkexpr(math_PMADDUBSW_128(dLo, sLo))));
   15873    return res;
   15874 }
   15875 
   15876 
/* Top-level decoder for the SSSE3 ("Supplemental SSE3") subset of the
   0F 38 escape space: PSHUFB, PHADDx/PHSUBx, PMADDUBSW, PSIGNx,
   PMULHRSW and PABSx, in both their 66-prefixed XMM and unprefixed
   MMX encodings.  On a successful decode, sets *decode_OK to True and
   returns |delta| advanced past the instruction; otherwise sets
   *decode_OK to False and returns |deltaIN| unchanged so the caller
   can try other decoders. */
__attribute__((noinline))
static
Long dis_ESC_0F38__SupSSE3 ( Bool* decode_OK,
                             const VexAbiInfo* vbi,
                             Prefix pfx, Int sz, Long deltaIN )
{
   IRTemp addr  = IRTemp_INVALID;
   UChar  modrm = 0;
   Int    alen  = 0;
   HChar  dis_buf[50];

   *decode_OK = False;

   Long   delta = deltaIN;
   UChar  opc   = getUChar(delta);
   delta++;
   switch (opc) {

   case 0x00:
      /* 66 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x16 (XMM) */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
         IRTemp sV = newTemp(Ity_V128);
         IRTemp dV = newTemp(Ity_V128);

         modrm = getUChar(delta);
         assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

         if (epartIsReg(modrm)) {
            assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
            delta += 1;
            DIP("pshufb %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            delta += alen;
            DIP("pshufb %s,%s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
         }

         IRTemp res = math_PSHUFB_XMM( dV, sV );
         putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(res));
         goto decode_success;
      }
      /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         IRTemp sV      = newTemp(Ity_I64);
         IRTemp dV      = newTemp(Ity_I64);

         modrm = getUChar(delta);
         do_MMX_preamble();
         assign( dV, getMMXReg(gregLO3ofRM(modrm)) );

         if (epartIsReg(modrm)) {
            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
            delta += 1;
            DIP("pshufb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                  nameMMXReg(gregLO3ofRM(modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
            delta += alen;
            DIP("pshufb %s,%s\n", dis_buf,
                                  nameMMXReg(gregLO3ofRM(modrm)));
         }

         /* Only the low 3 index bits select a lane in the 8x8 case;
            lanes whose index byte has bit 7 set are zeroed. */
         putMMXReg(
            gregLO3ofRM(modrm),
            binop(
               Iop_And64,
               /* permute the lanes */
               binop(
                  Iop_Perm8x8,
                  mkexpr(dV),
                  binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL))
               ),
               /* mask off lanes which have (index & 0x80) == 0x80 */
               unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7)))
            )
         );
         goto decode_success;
      }
      break;

   case 0x01:
   case 0x02:
   case 0x03:
   case 0x05:
   case 0x06:
   case 0x07:
      /* 66 0F 38 01 = PHADDW -- 16x8 add across from E (mem or xmm) and
         G to G (xmm). */
      /* 66 0F 38 02 = PHADDD -- 32x4 add across from E (mem or xmm) and
         G to G (xmm). */
      /* 66 0F 38 03 = PHADDSW -- 16x8 signed qadd across from E (mem or
         xmm) and G to G (xmm). */
      /* 66 0F 38 05 = PHSUBW -- 16x8 sub across from E (mem or xmm) and
         G to G (xmm). */
      /* 66 0F 38 06 = PHSUBD -- 32x4 sub across from E (mem or xmm) and
         G to G (xmm). */
      /* 66 0F 38 07 = PHSUBSW -- 16x8 signed qsub across from E (mem or
         xmm) and G to G (xmm). */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
         delta = dis_PHADD_128( vbi, pfx, delta, False/*isAvx*/, opc );
         goto decode_success;
      }
      /* ***--- these are MMX class insns introduced in SSSE3 ---*** */
      /* 0F 38 01 = PHADDW -- 16x4 add across from E (mem or mmx) and G
         to G (mmx). */
      /* 0F 38 02 = PHADDD -- 32x2 add across from E (mem or mmx) and G
         to G (mmx). */
      /* 0F 38 03 = PHADDSW -- 16x4 signed qadd across from E (mem or
         mmx) and G to G (mmx). */
      /* 0F 38 05 = PHSUBW -- 16x4 sub across from E (mem or mmx) and G
         to G (mmx). */
      /* 0F 38 06 = PHSUBD -- 32x2 sub across from E (mem or mmx) and G
         to G (mmx). */
      /* 0F 38 07 = PHSUBSW -- 16x4 signed qsub across from E (mem or
         mmx) and G to G (mmx). */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         const HChar* str = "???";
         IROp   opV64  = Iop_INVALID;
         IROp   opCatO = Iop_CatOddLanes16x4;
         IROp   opCatE = Iop_CatEvenLanes16x4;
         IRTemp sV     = newTemp(Ity_I64);
         IRTemp dV     = newTemp(Ity_I64);

         modrm = getUChar(delta);

         /* Map the opcode byte to the lanewise op and mnemonic
            suffix, as in dis_PHADD_128. */
         switch (opc) {
            case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
            case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
            case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
            case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
            case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
            case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
            default: vassert(0);
         }
         if (opc == 0x02 || opc == 0x06) {
            opCatO = Iop_InterleaveHI32x2;
            opCatE = Iop_InterleaveLO32x2;
         }

         do_MMX_preamble();
         assign( dV, getMMXReg(gregLO3ofRM(modrm)) );

         if (epartIsReg(modrm)) {
            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
            delta += 1;
            DIP("ph%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
                                     nameMMXReg(gregLO3ofRM(modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
            delta += alen;
            DIP("ph%s %s,%s\n", str, dis_buf,
                                     nameMMXReg(gregLO3ofRM(modrm)));
         }

         putMMXReg(
            gregLO3ofRM(modrm),
            binop(opV64,
                  binop(opCatE,mkexpr(sV),mkexpr(dV)),
                  binop(opCatO,mkexpr(sV),mkexpr(dV))
            )
         );
         goto decode_success;
      }
      break;

   case 0x04:
      /* 66 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
         Unsigned Bytes (XMM) */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
         IRTemp sV = newTemp(Ity_V128);
         IRTemp dV = newTemp(Ity_V128);
         modrm     = getUChar(delta);
         UInt   rG = gregOfRexRM(pfx,modrm);

         assign( dV, getXMMReg(rG) );

         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( sV, getXMMReg(rE) );
            delta += 1;
            DIP("pmaddubsw %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            delta += alen;
            DIP("pmaddubsw %s,%s\n", dis_buf, nameXMMReg(rG));
         }

         putXMMReg( rG, mkexpr( math_PMADDUBSW_128( dV, sV ) ) );
         goto decode_success;
      }
      /* 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
         Unsigned Bytes (MMX) */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         IRTemp sV        = newTemp(Ity_I64);
         IRTemp dV        = newTemp(Ity_I64);
         IRTemp sVoddsSX  = newTemp(Ity_I64);
         IRTemp sVevensSX = newTemp(Ity_I64);
         IRTemp dVoddsZX  = newTemp(Ity_I64);
         IRTemp dVevensZX = newTemp(Ity_I64);

         modrm = getUChar(delta);
         do_MMX_preamble();
         assign( dV, getMMXReg(gregLO3ofRM(modrm)) );

         if (epartIsReg(modrm)) {
            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
            delta += 1;
            DIP("pmaddubsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                     nameMMXReg(gregLO3ofRM(modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
            delta += alen;
            DIP("pmaddubsw %s,%s\n", dis_buf,
                                     nameMMXReg(gregLO3ofRM(modrm)));
         }

         /* compute dV unsigned x sV signed; odd/even byte lanes are
            widened to 16 bits via shifts (Sar sign-extends, Shr
            zero-extends), mirroring math_PMADDUBSW_128. */
         assign( sVoddsSX,
                 binop(Iop_SarN16x4, mkexpr(sV), mkU8(8)) );
         assign( sVevensSX,
                 binop(Iop_SarN16x4,
                       binop(Iop_ShlN16x4, mkexpr(sV), mkU8(8)),
                       mkU8(8)) );
         assign( dVoddsZX,
                 binop(Iop_ShrN16x4, mkexpr(dV), mkU8(8)) );
         assign( dVevensZX,
                 binop(Iop_ShrN16x4,
                       binop(Iop_ShlN16x4, mkexpr(dV), mkU8(8)),
                       mkU8(8)) );

         putMMXReg(
            gregLO3ofRM(modrm),
            binop(Iop_QAdd16Sx4,
                  binop(Iop_Mul16x4, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
                  binop(Iop_Mul16x4, mkexpr(sVevensSX), mkexpr(dVevensZX))
            )
         );
         goto decode_success;
      }
      break;

   case 0x08:
   case 0x09:
   case 0x0A:
      /* 66 0F 38 08 = PSIGNB -- Packed Sign 8x16 (XMM) */
      /* 66 0F 38 09 = PSIGNW -- Packed Sign 16x8 (XMM) */
      /* 66 0F 38 0A = PSIGND -- Packed Sign 32x4 (XMM) */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
         IRTemp sV      = newTemp(Ity_V128);
         IRTemp dV      = newTemp(Ity_V128);
         IRTemp sHi     = newTemp(Ity_I64);
         IRTemp sLo     = newTemp(Ity_I64);
         IRTemp dHi     = newTemp(Ity_I64);
         IRTemp dLo     = newTemp(Ity_I64);
         const HChar* str = "???";
         Int    laneszB = 0;

         /* Lane size in bytes, plus mnemonic suffix. */
         switch (opc) {
            case 0x08: laneszB = 1; str = "b"; break;
            case 0x09: laneszB = 2; str = "w"; break;
            case 0x0A: laneszB = 4; str = "d"; break;
            default: vassert(0);
         }

         modrm = getUChar(delta);
         assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

         if (epartIsReg(modrm)) {
            assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
            delta += 1;
            DIP("psign%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            delta += alen;
            DIP("psign%s %s,%s\n", str, dis_buf,
                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
         }

         /* Process the 128-bit operands as two 64-bit halves. */
         assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
         assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
         assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
         assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

         putXMMReg(
            gregOfRexRM(pfx,modrm),
            binop(Iop_64HLtoV128,
                  dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
                  dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
            )
         );
         goto decode_success;
      }
      /* 0F 38 08 = PSIGNB -- Packed Sign 8x8  (MMX) */
      /* 0F 38 09 = PSIGNW -- Packed Sign 16x4 (MMX) */
      /* 0F 38 0A = PSIGND -- Packed Sign 32x2 (MMX) */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         IRTemp sV      = newTemp(Ity_I64);
         IRTemp dV      = newTemp(Ity_I64);
         const HChar* str = "???";
         Int    laneszB = 0;

         /* Lane size in bytes, plus mnemonic suffix. */
         switch (opc) {
            case 0x08: laneszB = 1; str = "b"; break;
            case 0x09: laneszB = 2; str = "w"; break;
            case 0x0A: laneszB = 4; str = "d"; break;
            default: vassert(0);
         }

         modrm = getUChar(delta);
         do_MMX_preamble();
         assign( dV, getMMXReg(gregLO3ofRM(modrm)) );

         if (epartIsReg(modrm)) {
            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
            delta += 1;
            DIP("psign%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
                                        nameMMXReg(gregLO3ofRM(modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
            delta += alen;
            DIP("psign%s %s,%s\n", str, dis_buf,
                                        nameMMXReg(gregLO3ofRM(modrm)));
         }

         putMMXReg(
            gregLO3ofRM(modrm),
            dis_PSIGN_helper( mkexpr(sV), mkexpr(dV), laneszB )
         );
         goto decode_success;
      }
      break;

   case 0x0B:
      /* 66 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and
         Scale (XMM) */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
         IRTemp sV  = newTemp(Ity_V128);
         IRTemp dV  = newTemp(Ity_V128);
         IRTemp sHi = newTemp(Ity_I64);
         IRTemp sLo = newTemp(Ity_I64);
         IRTemp dHi = newTemp(Ity_I64);
         IRTemp dLo = newTemp(Ity_I64);

         modrm = getUChar(delta);
         assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

         if (epartIsReg(modrm)) {
            assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
            delta += 1;
            DIP("pmulhrsw %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            delta += alen;
            DIP("pmulhrsw %s,%s\n", dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         }

         /* Process the 128-bit operands as two 64-bit halves. */
         assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
         assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
         assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
         assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

         putXMMReg(
            gregOfRexRM(pfx,modrm),
            binop(Iop_64HLtoV128,
                  dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
                  dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
            )
         );
         goto decode_success;
      }
      /* 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and Scale
         (MMX) */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         IRTemp sV = newTemp(Ity_I64);
         IRTemp dV = newTemp(Ity_I64);

         modrm = getUChar(delta);
         do_MMX_preamble();
         assign( dV, getMMXReg(gregLO3ofRM(modrm)) );

         if (epartIsReg(modrm)) {
            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
            delta += 1;
            DIP("pmulhrsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                    nameMMXReg(gregLO3ofRM(modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
            delta += alen;
            DIP("pmulhrsw %s,%s\n", dis_buf,
                                    nameMMXReg(gregLO3ofRM(modrm)));
         }

         putMMXReg(
            gregLO3ofRM(modrm),
            dis_PMULHRSW_helper( mkexpr(sV), mkexpr(dV) )
         );
         goto decode_success;
      }
      break;

   case 0x1C:
   case 0x1D:
   case 0x1E:
      /* 66 0F 38 1C = PABSB -- Packed Absolute Value 8x16 (XMM) */
      /* 66 0F 38 1D = PABSW -- Packed Absolute Value 16x8 (XMM) */
      /* 66 0F 38 1E = PABSD -- Packed Absolute Value 32x4 (XMM) */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
         IRTemp sV  = newTemp(Ity_V128);
         const HChar* str = "???";
         Int    laneszB = 0;

         /* Lane size in bytes, plus mnemonic suffix. */
         switch (opc) {
            case 0x1C: laneszB = 1; str = "b"; break;
            case 0x1D: laneszB = 2; str = "w"; break;
            case 0x1E: laneszB = 4; str = "d"; break;
            default: vassert(0);
         }

         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
            delta += 1;
            DIP("pabs%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
                                       nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            delta += alen;
            DIP("pabs%s %s,%s\n", str, dis_buf,
                                       nameXMMReg(gregOfRexRM(pfx,modrm)));
         }

         putXMMReg( gregOfRexRM(pfx,modrm),
                    mkexpr(math_PABS_XMM(sV, laneszB)) );
         goto decode_success;
      }
      /* 0F 38 1C = PABSB -- Packed Absolute Value 8x8  (MMX) */
      /* 0F 38 1D = PABSW -- Packed Absolute Value 16x4 (MMX) */
      /* 0F 38 1E = PABSD -- Packed Absolute Value 32x2 (MMX) */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         IRTemp sV      = newTemp(Ity_I64);
         const HChar* str = "???";
         Int    laneszB = 0;

         /* Lane size in bytes, plus mnemonic suffix. */
         switch (opc) {
            case 0x1C: laneszB = 1; str = "b"; break;
            case 0x1D: laneszB = 2; str = "w"; break;
            case 0x1E: laneszB = 4; str = "d"; break;
            default: vassert(0);
         }

         modrm = getUChar(delta);
         do_MMX_preamble();

         if (epartIsReg(modrm)) {
            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
            delta += 1;
            DIP("pabs%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
                                       nameMMXReg(gregLO3ofRM(modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
            delta += alen;
            DIP("pabs%s %s,%s\n", str, dis_buf,
                                       nameMMXReg(gregLO3ofRM(modrm)));
         }

         putMMXReg( gregLO3ofRM(modrm),
                    mkexpr(math_PABS_MMX( sV, laneszB )) );
         goto decode_success;
      }
      break;

   default:
      break;

   }

   /* Not an SSSE3 0F 38 instruction we recognise. */
  //decode_failure:
   *decode_OK = False;
   return deltaIN;

  decode_success:
   *decode_OK = True;
   return delta;
}
   16388 
   16389 
   16390 /*------------------------------------------------------------*/
   16391 /*---                                                      ---*/
   16392 /*--- Top-level SSSE3: dis_ESC_0F3A__SupSSE3               ---*/
   16393 /*---                                                      ---*/
   16394 /*------------------------------------------------------------*/
   16395 
   16396 __attribute__((noinline))
   16397 static
   16398 Long dis_ESC_0F3A__SupSSE3 ( Bool* decode_OK,
   16399                              const VexAbiInfo* vbi,
   16400                              Prefix pfx, Int sz, Long deltaIN )
   16401 {
   16402    Long   d64   = 0;
   16403    IRTemp addr  = IRTemp_INVALID;
   16404    UChar  modrm = 0;
   16405    Int    alen  = 0;
   16406    HChar  dis_buf[50];
   16407 
   16408    *decode_OK = False;
   16409 
   16410    Long   delta = deltaIN;
   16411    UChar  opc   = getUChar(delta);
   16412    delta++;
   16413    switch (opc) {
   16414 
   16415    case 0x0F:
   16416       /* 66 0F 3A 0F = PALIGNR -- Packed Align Right (XMM) */
   16417       if (have66noF2noF3(pfx)
   16418           && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
   16419          IRTemp sV  = newTemp(Ity_V128);
   16420          IRTemp dV  = newTemp(Ity_V128);
   16421 
   16422          modrm = getUChar(delta);
   16423          assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   16424 
   16425          if (epartIsReg(modrm)) {
   16426             assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   16427             d64 = (Long)getUChar(delta+1);
   16428             delta += 1+1;
   16429             DIP("palignr $%lld,%s,%s\n", d64,
   16430                                        nameXMMReg(eregOfRexRM(pfx,modrm)),
   16431                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
   16432          } else {
   16433             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
   16434             gen_SEGV_if_not_16_aligned( addr );
   16435             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   16436             d64 = (Long)getUChar(delta+alen);
   16437             delta += alen+1;
   16438             DIP("palignr $%lld,%s,%s\n", d64,
   16439                                        dis_buf,
   16440                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
   16441          }
   16442 
   16443          IRTemp res = math_PALIGNR_XMM( sV, dV, d64 );
   16444          putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
   16445          goto decode_success;
   16446       }
   16447       /* 0F 3A 0F = PALIGNR -- Packed Align Right (MMX) */
   16448       if (haveNo66noF2noF3(pfx) && sz == 4) {
   16449          IRTemp sV  = newTemp(Ity_I64);
   16450          IRTemp dV  = newTemp(Ity_I64);
   16451          IRTemp res = newTemp(Ity_I64);
   16452 
   16453          modrm = getUChar(delta);
   16454          do_MMX_preamble();
   16455          assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   16456 
   16457          if (epartIsReg(modrm)) {
   16458             assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   16459             d64 = (Long)getUChar(delta+1);
   16460             delta += 1+1;
   16461             DIP("palignr $%lld,%s,%s\n",  d64,
   16462                                         nameMMXReg(eregLO3ofRM(modrm)),
   16463                                         nameMMXReg(gregLO3ofRM(modrm)));
   16464          } else {
   16465             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
   16466             assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   16467             d64 = (Long)getUChar(delta+alen);
   16468             delta += alen+1;
   16469             DIP("palignr $%lld%s,%s\n", d64,
   16470                                       dis_buf,
   16471                                       nameMMXReg(gregLO3ofRM(modrm)));
   16472          }
   16473 
   16474          if (d64 == 0) {
   16475             assign( res, mkexpr(sV) );
   16476          }
   16477          else if (d64 >= 1 && d64 <= 7) {
   16478             assign(res,
   16479                    binop(Iop_Or64,
   16480                          binop(Iop_Shr64, mkexpr(sV), mkU8(8*d64)),
   16481                          binop(Iop_Shl64, mkexpr(dV), mkU8(8*(8-d64))
   16482                         )));
   16483          }
   16484          else if (d64 == 8) {
   16485            assign( res, mkexpr(dV) );
   16486          }
   16487          else if (d64 >= 9 && d64 <= 15) {
   16488             assign( res, binop(Iop_Shr64, mkexpr(dV), mkU8(8*(d64-8))) );
   16489          }
   16490          else if (d64 >= 16 && d64 <= 255) {
   16491             assign( res, mkU64(0) );
   16492          }
   16493          else
   16494             vassert(0);
   16495 
   16496          putMMXReg( gregLO3ofRM(modrm), mkexpr(res) );
   16497          goto decode_success;
   16498       }
   16499       break;
   16500 
   16501    default:
   16502       break;
   16503 
   16504    }
   16505 
   16506   //decode_failure:
   16507    *decode_OK = False;
   16508    return deltaIN;
   16509 
   16510   decode_success:
   16511    *decode_OK = True;
   16512    return delta;
   16513 }
   16514 
   16515 
   16516 /*------------------------------------------------------------*/
   16517 /*---                                                      ---*/
   16518 /*--- Top-level SSE4: dis_ESC_0F__SSE4                     ---*/
   16519 /*---                                                      ---*/
   16520 /*------------------------------------------------------------*/
   16521 
/* Decode SSE4/ABM-era instructions in the plain 0F opcode space:
   POPCNT (F3 0F B8), TZCNT (F3 0F BC, requires BMI capability) and
   LZCNT (F3 0F BD, requires LZCNT capability).  TZCNT/LZCNT are only
   accepted when the relevant hwcaps bit is set, since otherwise the
   same encodings decode as BSF/BSR with different zero-source
   behaviour.  On success, sets *decode_OK to True and returns the
   updated decode offset; on failure, sets it to False and returns
   deltaIN unchanged. */
__attribute__((noinline))
static
Long dis_ESC_0F__SSE4 ( Bool* decode_OK,
                        const VexArchInfo* archinfo,
                        const VexAbiInfo* vbi,
                        Prefix pfx, Int sz, Long deltaIN )
{
   IRTemp addr  = IRTemp_INVALID;
   IRType ty    = Ity_INVALID;
   UChar  modrm = 0;
   Int    alen  = 0;
   HChar  dis_buf[50];

   *decode_OK = False;

   Long   delta = deltaIN;
   UChar  opc   = getUChar(delta);
   delta++;
   switch (opc) {

   case 0xB8:
      /* F3 0F B8  = POPCNT{W,L,Q}
         Count the number of 1 bits in a register
      */
      if (haveF3noF2(pfx) /* so both 66 and REX.W are possibilities */
          && (sz == 2 || sz == 4 || sz == 8)) {
         /*IRType*/ ty  = szToITy(sz);
         IRTemp     src = newTemp(ty);
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            assign(src, getIRegE(sz, pfx, modrm));
            delta += 1;
            DIP("popcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
                nameIRegG(sz, pfx, modrm));
         } else {
            /* Source operand is in memory. */
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0);
            assign(src, loadLE(ty, mkexpr(addr)));
            delta += alen;
            DIP("popcnt%c %s, %s\n", nameISize(sz), dis_buf,
                nameIRegG(sz, pfx, modrm));
         }

         IRTemp result = gen_POPCOUNT(ty, src);
         putIRegG(sz, pfx, modrm, mkexpr(result));

         // Update flags.  This is pretty lame .. perhaps can do better
         // if this turns out to be performance critical.
         // O S A C P are cleared.  Z is set if SRC == 0.
         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
         stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
         stmt( IRStmt_Put( OFFB_CC_DEP1,
               binop(Iop_Shl64,
                     unop(Iop_1Uto64,
                          binop(Iop_CmpEQ64,
                                widenUto64(mkexpr(src)),
                                mkU64(0))),
                     mkU8(AMD64G_CC_SHIFT_Z))));

         goto decode_success;
      }
      break;

   case 0xBC:
      /* F3 0F BC -- TZCNT (count trailing zeroes.  A BMI extension,
         which we can only decode if we're sure this is a BMI1 capable cpu
         that supports TZCNT, since otherwise it's BSF, which behaves
         differently on zero source.  */
      if (haveF3noF2(pfx) /* so both 66 and 48 are possibilities */
          && (sz == 2 || sz == 4 || sz == 8)
          && 0 != (archinfo->hwcaps & VEX_HWCAPS_AMD64_BMI)) {
         /*IRType*/ ty  = szToITy(sz);
         IRTemp     src = newTemp(ty);
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            assign(src, getIRegE(sz, pfx, modrm));
            delta += 1;
            DIP("tzcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
                nameIRegG(sz, pfx, modrm));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0);
            assign(src, loadLE(ty, mkexpr(addr)));
            delta += alen;
            DIP("tzcnt%c %s, %s\n", nameISize(sz), dis_buf,
                nameIRegG(sz, pfx, modrm));
         }

         IRTemp res = gen_TZCNT(ty, src);
         putIRegG(sz, pfx, modrm, mkexpr(res));

         // Update flags.  This is pretty lame .. perhaps can do better
         // if this turns out to be performance critical.
         // O S A P are cleared.  Z is set if RESULT == 0.
         // C is set if SRC is zero.
         IRTemp src64 = newTemp(Ity_I64);
         IRTemp res64 = newTemp(Ity_I64);
         assign(src64, widenUto64(mkexpr(src)));
         assign(res64, widenUto64(mkexpr(res)));

         /* Build the new OSZACP flag image: Z and C bits only. */
         IRTemp oszacp = newTemp(Ity_I64);
         assign(
            oszacp,
            binop(Iop_Or64,
                  binop(Iop_Shl64,
                        unop(Iop_1Uto64,
                             binop(Iop_CmpEQ64, mkexpr(res64), mkU64(0))),
                        mkU8(AMD64G_CC_SHIFT_Z)),
                  binop(Iop_Shl64,
                        unop(Iop_1Uto64,
                             binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0))),
                        mkU8(AMD64G_CC_SHIFT_C))
            )
         );

         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
         stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
         stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));

         goto decode_success;
      }
      break;

   case 0xBD:
      /* F3 0F BD -- LZCNT (count leading zeroes.  An AMD extension,
         which we can only decode if we're sure this is an AMD cpu
         that supports LZCNT, since otherwise it's BSR, which behaves
         differently.  Bizarrely, my Sandy Bridge also accepts these
         instructions but produces different results. */
      if (haveF3noF2(pfx) /* so both 66 and 48 are possibilities */
          && (sz == 2 || sz == 4 || sz == 8)
          && 0 != (archinfo->hwcaps & VEX_HWCAPS_AMD64_LZCNT)) {
         /*IRType*/ ty  = szToITy(sz);
         IRTemp     src = newTemp(ty);
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            assign(src, getIRegE(sz, pfx, modrm));
            delta += 1;
            DIP("lzcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
                nameIRegG(sz, pfx, modrm));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0);
            assign(src, loadLE(ty, mkexpr(addr)));
            delta += alen;
            DIP("lzcnt%c %s, %s\n", nameISize(sz), dis_buf,
                nameIRegG(sz, pfx, modrm));
         }

         IRTemp res = gen_LZCNT(ty, src);
         putIRegG(sz, pfx, modrm, mkexpr(res));

         // Update flags.  This is pretty lame .. perhaps can do better
         // if this turns out to be performance critical.
         // O S A P are cleared.  Z is set if RESULT == 0.
         // C is set if SRC is zero.
         IRTemp src64 = newTemp(Ity_I64);
         IRTemp res64 = newTemp(Ity_I64);
         assign(src64, widenUto64(mkexpr(src)));
         assign(res64, widenUto64(mkexpr(res)));

         /* Build the new OSZACP flag image: Z and C bits only. */
         IRTemp oszacp = newTemp(Ity_I64);
         assign(
            oszacp,
            binop(Iop_Or64,
                  binop(Iop_Shl64,
                        unop(Iop_1Uto64,
                             binop(Iop_CmpEQ64, mkexpr(res64), mkU64(0))),
                        mkU8(AMD64G_CC_SHIFT_Z)),
                  binop(Iop_Shl64,
                        unop(Iop_1Uto64,
                             binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0))),
                        mkU8(AMD64G_CC_SHIFT_C))
            )
         );

         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
         stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
         stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));

         goto decode_success;
      }
      break;

   default:
      break;

   }

  //decode_failure:
   *decode_OK = False;
   return deltaIN;

  decode_success:
   *decode_OK = True;
   return delta;
}
   16719 
   16720 
   16721 /*------------------------------------------------------------*/
   16722 /*---                                                      ---*/
   16723 /*--- Top-level SSE4: dis_ESC_0F38__SSE4                   ---*/
   16724 /*---                                                      ---*/
   16725 /*------------------------------------------------------------*/
   16726 
   16727 static IRTemp math_PBLENDVB_128 ( IRTemp vecE, IRTemp vecG,
   16728                                   IRTemp vec0/*controlling mask*/,
   16729                                   UInt gran, IROp opSAR )
   16730 {
   16731    /* The tricky bit is to convert vec0 into a suitable mask, by
   16732       copying the most significant bit of each lane into all positions
   16733       in the lane. */
   16734    IRTemp sh = newTemp(Ity_I8);
   16735    assign(sh, mkU8(8 * gran - 1));
   16736 
   16737    IRTemp mask = newTemp(Ity_V128);
   16738    assign(mask, binop(opSAR, mkexpr(vec0), mkexpr(sh)));
   16739 
   16740    IRTemp notmask = newTemp(Ity_V128);
   16741    assign(notmask, unop(Iop_NotV128, mkexpr(mask)));
   16742 
   16743    IRTemp res = newTemp(Ity_V128);
   16744    assign(res,  binop(Iop_OrV128,
   16745                       binop(Iop_AndV128, mkexpr(vecE), mkexpr(mask)),
   16746                       binop(Iop_AndV128, mkexpr(vecG), mkexpr(notmask))));
   16747    return res;
   16748 }
   16749 
   16750 static IRTemp math_PBLENDVB_256 ( IRTemp vecE, IRTemp vecG,
   16751                                   IRTemp vec0/*controlling mask*/,
   16752                                   UInt gran, IROp opSAR128 )
   16753 {
   16754    /* The tricky bit is to convert vec0 into a suitable mask, by
   16755       copying the most significant bit of each lane into all positions
   16756       in the lane. */
   16757    IRTemp sh = newTemp(Ity_I8);
   16758    assign(sh, mkU8(8 * gran - 1));
   16759 
   16760    IRTemp vec0Hi = IRTemp_INVALID;
   16761    IRTemp vec0Lo = IRTemp_INVALID;
   16762    breakupV256toV128s( vec0, &vec0Hi, &vec0Lo );
   16763 
   16764    IRTemp mask = newTemp(Ity_V256);
   16765    assign(mask, binop(Iop_V128HLtoV256,
   16766                       binop(opSAR128, mkexpr(vec0Hi), mkexpr(sh)),
   16767                       binop(opSAR128, mkexpr(vec0Lo), mkexpr(sh))));
   16768 
   16769    IRTemp notmask = newTemp(Ity_V256);
   16770    assign(notmask, unop(Iop_NotV256, mkexpr(mask)));
   16771 
   16772    IRTemp res = newTemp(Ity_V256);
   16773    assign(res,  binop(Iop_OrV256,
   16774                       binop(Iop_AndV256, mkexpr(vecE), mkexpr(mask)),
   16775                       binop(Iop_AndV256, mkexpr(vecG), mkexpr(notmask))));
   16776    return res;
   16777 }
   16778 
/* Decode a 4-operand 128-bit VBLENDV-style instruction ("is4"
   encoding: the controlling register is held in the top 4 bits of a
   trailing immediate byte).  gran is the lane width in bytes and
   opSAR the matching 128-bit arithmetic-shift op, both forwarded to
   math_PBLENDVB_128.  Writes the blend of E (reg/mem) and V (vvvv)
   under control of IS4 into the low 128 bits of rG, zeroing the
   upper half.  Returns the updated decode offset. */
static Long dis_VBLENDV_128 ( const VexAbiInfo* vbi, Prefix pfx, Long delta,
                              const HChar *name, UInt gran, IROp opSAR )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   UChar  modrm  = getUChar(delta);
   UInt   rG     = gregOfRexRM(pfx, modrm);   /* destination register */
   UInt   rV     = getVexNvvvv(pfx);          /* second source, from VEX.vvvv */
   UInt   rIS4   = 0xFF; /* invalid */
   IRTemp vecE   = newTemp(Ity_V128);
   IRTemp vecV   = newTemp(Ity_V128);
   IRTemp vecIS4 = newTemp(Ity_V128);
   if (epartIsReg(modrm)) {
      delta++;
      UInt rE = eregOfRexRM(pfx, modrm);
      assign(vecE, getXMMReg(rE));
      /* The is4 byte immediately follows the modrm byte. */
      UChar ib = getUChar(delta);
      rIS4 = (ib >> 4) & 0xF;
      DIP("%s %s,%s,%s,%s\n",
          name, nameXMMReg(rIS4), nameXMMReg(rE),
          nameXMMReg(rV), nameXMMReg(rG));
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
      delta += alen;
      assign(vecE, loadLE(Ity_V128, mkexpr(addr)));
      /* The is4 byte immediately follows the addressing-mode bytes. */
      UChar ib = getUChar(delta);
      rIS4 = (ib >> 4) & 0xF;
      DIP("%s %s,%s,%s,%s\n",
          name, nameXMMReg(rIS4), dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   }
   delta++;  /* skip the is4 byte */
   assign(vecV,   getXMMReg(rV));
   assign(vecIS4, getXMMReg(rIS4));
   IRTemp res = math_PBLENDVB_128( vecE, vecV, vecIS4, gran, opSAR );
   putYMMRegLoAndZU( rG, mkexpr(res) );
   return delta;
}
   16817 
/* Decode a 4-operand 256-bit VBLENDV-style instruction ("is4"
   encoding: the controlling register is held in the top 4 bits of a
   trailing immediate byte).  gran is the lane width in bytes and
   opSAR128 the matching 128-bit arithmetic-shift op, both forwarded
   to math_PBLENDVB_256.  Writes the blend of E (reg/mem) and V (vvvv)
   under control of IS4 into the full YMM register rG.  Returns the
   updated decode offset. */
static Long dis_VBLENDV_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta,
                              const HChar *name, UInt gran, IROp opSAR128 )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   UChar  modrm  = getUChar(delta);
   UInt   rG     = gregOfRexRM(pfx, modrm);   /* destination register */
   UInt   rV     = getVexNvvvv(pfx);          /* second source, from VEX.vvvv */
   UInt   rIS4   = 0xFF; /* invalid */
   IRTemp vecE   = newTemp(Ity_V256);
   IRTemp vecV   = newTemp(Ity_V256);
   IRTemp vecIS4 = newTemp(Ity_V256);
   if (epartIsReg(modrm)) {
      delta++;
      UInt rE = eregOfRexRM(pfx, modrm);
      assign(vecE, getYMMReg(rE));
      /* The is4 byte immediately follows the modrm byte. */
      UChar ib = getUChar(delta);
      rIS4 = (ib >> 4) & 0xF;
      DIP("%s %s,%s,%s,%s\n",
          name, nameYMMReg(rIS4), nameYMMReg(rE),
          nameYMMReg(rV), nameYMMReg(rG));
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
      delta += alen;
      assign(vecE, loadLE(Ity_V256, mkexpr(addr)));
      /* The is4 byte immediately follows the addressing-mode bytes. */
      UChar ib = getUChar(delta);
      rIS4 = (ib >> 4) & 0xF;
      DIP("%s %s,%s,%s,%s\n",
          name, nameYMMReg(rIS4), dis_buf, nameYMMReg(rV), nameYMMReg(rG));
   }
   delta++;  /* skip the is4 byte */
   assign(vecV,   getYMMReg(rV));
   assign(vecIS4, getYMMReg(rIS4));
   IRTemp res = math_PBLENDVB_256( vecE, vecV, vecIS4, gran, opSAR128 );
   putYMMReg( rG, mkexpr(res) );
   return delta;
}
   16856 
/* Shared flag-setting tail for PTEST / VTESTPS / VTESTPD.  andV holds
   vecE & vecG, andnV holds vecE & ~vecG.  sign selects which bits are
   inspected: 0 means all 128 bits (PTEST); 32 keeps only bits 31/63
   of the folded 64-bit value (VTESTPS sign bits); 64 keeps only bit
   63 (VTESTPD sign bits).  Sets Z and C via the COPY flags thunk and
   clears O, S, A, P. */
static void finish_xTESTy ( IRTemp andV, IRTemp andnV, Int sign )
{
   /* Set Z=1 iff (vecE & vecG) == 0
      Set C=1 iff (vecE & not vecG) == 0
   */

   /* andV, andnV:  vecE & vecG,  vecE and not(vecG) */

   /* andV resp. andnV, reduced to 64-bit values, by or-ing the top
      and bottom 64-bits together.  It relies on this trick:

      InterleaveLO64x2([a,b],[c,d]) == [b,d]    hence

      InterleaveLO64x2([a,b],[a,b]) == [b,b]    and similarly
      InterleaveHI64x2([a,b],[a,b]) == [a,a]

      and so the OR of the above 2 exprs produces
      [a OR b, a OR b], from which we simply take the lower half.
   */
   IRTemp and64  = newTemp(Ity_I64);
   IRTemp andn64 = newTemp(Ity_I64);

   assign(and64,
          unop(Iop_V128to64,
               binop(Iop_OrV128,
                     binop(Iop_InterleaveLO64x2,
                           mkexpr(andV), mkexpr(andV)),
                     binop(Iop_InterleaveHI64x2,
                           mkexpr(andV), mkexpr(andV)))));

   assign(andn64,
          unop(Iop_V128to64,
               binop(Iop_OrV128,
                     binop(Iop_InterleaveLO64x2,
                           mkexpr(andnV), mkexpr(andnV)),
                     binop(Iop_InterleaveHI64x2,
                           mkexpr(andnV), mkexpr(andnV)))));

   /* z64/c64 end up all-zeroes (flag set) or all-ones (flag clear),
      inverted relative to the tested value; the final masking below
      extracts the single flag bit from each. */
   IRTemp z64 = newTemp(Ity_I64);
   IRTemp c64 = newTemp(Ity_I64);
   if (sign == 64) {
      /* When only interested in the most significant bit, just shift
         arithmetically right and negate.  */
      assign(z64,
             unop(Iop_Not64,
                  binop(Iop_Sar64, mkexpr(and64), mkU8(63))));

      assign(c64,
             unop(Iop_Not64,
                  binop(Iop_Sar64, mkexpr(andn64), mkU8(63))));
   } else {
      if (sign == 32) {
         /* When interested in bit 31 and bit 63, mask those bits and
            fallthrough into the PTEST handling.  */
         IRTemp t0 = newTemp(Ity_I64);
         IRTemp t1 = newTemp(Ity_I64);
         IRTemp t2 = newTemp(Ity_I64);
         assign(t0, mkU64(0x8000000080000000ULL));
         assign(t1, binop(Iop_And64, mkexpr(and64), mkexpr(t0)));
         assign(t2, binop(Iop_And64, mkexpr(andn64), mkexpr(t0)));
         and64 = t1;
         andn64 = t2;
      }
      /* Now convert and64, andn64 to all-zeroes or all-1s, so we can
         slice out the Z and C bits conveniently.  We use the standard
         trick all-zeroes -> all-zeroes, anything-else -> all-ones
         done by "(x | -x) >>s (word-size - 1)".
      */
      assign(z64,
             unop(Iop_Not64,
                  binop(Iop_Sar64,
                        binop(Iop_Or64,
                              binop(Iop_Sub64, mkU64(0), mkexpr(and64)),
                                    mkexpr(and64)), mkU8(63))));

      assign(c64,
             unop(Iop_Not64,
                  binop(Iop_Sar64,
                        binop(Iop_Or64,
                              binop(Iop_Sub64, mkU64(0), mkexpr(andn64)),
                                    mkexpr(andn64)), mkU8(63))));
   }

   /* And finally, slice out the Z and C flags and set the flags
      thunk to COPY for them.  OSAP are set to zero. */
   IRTemp newOSZACP = newTemp(Ity_I64);
   assign(newOSZACP,
          binop(Iop_Or64,
                binop(Iop_And64, mkexpr(z64), mkU64(AMD64G_CC_MASK_Z)),
                binop(Iop_And64, mkexpr(c64), mkU64(AMD64G_CC_MASK_C))));

   stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(newOSZACP)));
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
}
   16953 
   16954 
   16955 /* Handles 128 bit versions of PTEST, VTESTPS or VTESTPD.
   16956    sign is 0 for PTEST insn, 32 for VTESTPS and 64 for VTESTPD. */
static Long dis_xTESTy_128 ( const VexAbiInfo* vbi, Prefix pfx,
                             Long delta, Bool isAvx, Int sign )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   UChar  modrm  = getUChar(delta);
   UInt   rG     = gregOfRexRM(pfx, modrm);
   IRTemp vecE = newTemp(Ity_V128);   /* reg/mem source operand */
   IRTemp vecG = newTemp(Ity_V128);   /* register source operand */

   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign(vecE, getXMMReg(rE));
      delta += 1;
      DIP( "%s%stest%s %s,%s\n",
           isAvx ? "v" : "", sign == 0 ? "p" : "",
           sign == 0 ? "" : sign == 32 ? "ps" : "pd",
           nameXMMReg(rE), nameXMMReg(rG) );
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      /* Only the non-AVX (SSE) form requires 16-byte alignment of
         the memory operand. */
      if (!isAvx)
         gen_SEGV_if_not_16_aligned( addr );
      assign(vecE, loadLE( Ity_V128, mkexpr(addr) ));
      delta += alen;
      DIP( "%s%stest%s %s,%s\n",
           isAvx ? "v" : "", sign == 0 ? "p" : "",
           sign == 0 ? "" : sign == 32 ? "ps" : "pd",
           dis_buf, nameXMMReg(rG) );
   }

   assign(vecG, getXMMReg(rG));

   /* Set Z=1 iff (vecE & vecG) == 0
      Set C=1 iff (vecE & not vecG) == 0
   */

   /* andV, andnV:  vecE & vecG,  vecE and not(vecG) */
   IRTemp andV  = newTemp(Ity_V128);
   IRTemp andnV = newTemp(Ity_V128);
   assign(andV,  binop(Iop_AndV128, mkexpr(vecE), mkexpr(vecG)));
   /* not(vecG) is expressed as vecG XOR all-ones. */
   assign(andnV, binop(Iop_AndV128,
                       mkexpr(vecE),
                       binop(Iop_XorV128, mkexpr(vecG),
                                          mkV128(0xFFFF))));

   /* Flag computation is shared with the 256-bit variant. */
   finish_xTESTy ( andV, andnV, sign );
   return delta;
}
   17006 
   17007 
   17008 /* Handles 256 bit versions of PTEST, VTESTPS or VTESTPD.
   17009    sign is 0 for PTEST insn, 32 for VTESTPS and 64 for VTESTPD. */
static Long dis_xTESTy_256 ( const VexAbiInfo* vbi, Prefix pfx,
                             Long delta, Int sign )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   UChar  modrm  = getUChar(delta);
   UInt   rG     = gregOfRexRM(pfx, modrm);
   IRTemp vecE   = newTemp(Ity_V256);   /* reg/mem source operand */
   IRTemp vecG   = newTemp(Ity_V256);   /* register source operand */

   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign(vecE, getYMMReg(rE));
      delta += 1;
      DIP( "v%stest%s %s,%s\n", sign == 0 ? "p" : "",
           sign == 0 ? "" : sign == 32 ? "ps" : "pd",
           nameYMMReg(rE), nameYMMReg(rG) );
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign(vecE, loadLE( Ity_V256, mkexpr(addr) ));
      delta += alen;
      DIP( "v%stest%s %s,%s\n", sign == 0 ? "p" : "",
           sign == 0 ? "" : sign == 32 ? "ps" : "pd",
           dis_buf, nameYMMReg(rG) );
   }

   assign(vecG, getYMMReg(rG));

   /* Set Z=1 iff (vecE & vecG) == 0
      Set C=1 iff (vecE & not vecG) == 0
   */

   /* andV, andnV:  vecE & vecG,  vecE and not(vecG) */
   IRTemp andV  = newTemp(Ity_V256);
   IRTemp andnV = newTemp(Ity_V256);
   assign(andV,  binop(Iop_AndV256, mkexpr(vecE), mkexpr(vecG)));
   assign(andnV, binop(Iop_AndV256,
                       mkexpr(vecE), unop(Iop_NotV256, mkexpr(vecG))));

   /* Fold each 256-bit intermediate down to 128 bits by or-ing its
      halves together, so the flag computation can be shared with the
      128-bit variant (the zero/non-zero property is preserved). */
   IRTemp andVhi  = IRTemp_INVALID;
   IRTemp andVlo  = IRTemp_INVALID;
   IRTemp andnVhi = IRTemp_INVALID;
   IRTemp andnVlo = IRTemp_INVALID;
   breakupV256toV128s( andV, &andVhi, &andVlo );
   breakupV256toV128s( andnV, &andnVhi, &andnVlo );

   IRTemp andV128  = newTemp(Ity_V128);
   IRTemp andnV128 = newTemp(Ity_V128);
   assign( andV128, binop( Iop_OrV128, mkexpr(andVhi), mkexpr(andVlo) ) );
   assign( andnV128, binop( Iop_OrV128, mkexpr(andnVhi), mkexpr(andnVlo) ) );

   finish_xTESTy ( andV128, andnV128, sign );
   return delta;
}
   17065 
   17066 
   17067 /* Handles 128 bit versions of PMOVZXBW and PMOVSXBW. */
static Long dis_PMOVxXBW_128 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx, Bool xIsZ )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   IRTemp srcVec = newTemp(Ity_V128);
   UChar  modrm  = getUChar(delta);
   const HChar* mbV    = isAvx ? "v" : "";   /* AVX mnemonic prefix */
   const HChar  how    = xIsZ ? 'z' : 's';   /* 'z'ero- or 's'ign-extend */
   UInt   rG     = gregOfRexRM(pfx, modrm);
   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcVec, getXMMReg(rE) );
      delta += 1;
      DIP( "%spmov%cxbw %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
   } else {
      /* Memory form: only 64 bits (8 source bytes) are loaded, into
         the low half of a 128-bit temporary. */
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcVec,
              unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
      delta += alen;
      DIP( "%spmov%cxbw %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
   }

   /* Zero-extension: interleave the low 8 source bytes with zero
      bytes, widening each byte to a 16-bit lane.  Sign-extension:
      do the same, then shift each 16-bit lane left by 8 and
      arithmetically right by 8 to replicate the sign bit. */
   IRExpr* res
      = xIsZ /* do math for either zero or sign extend */
        ? binop( Iop_InterleaveLO8x16,
                 IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) )
        : binop( Iop_SarN16x8,
                 binop( Iop_ShlN16x8,
                        binop( Iop_InterleaveLO8x16,
                               IRExpr_Const( IRConst_V128(0) ),
                               mkexpr(srcVec) ),
                        mkU8(8) ),
                 mkU8(8) );

   /* AVX form zeroes the upper YMM lane; SSE form leaves it alone. */
   (isAvx ? putYMMRegLoAndZU : putXMMReg) ( rG, res );

   return delta;
}
   17108 
   17109 
   17110 /* Handles 256 bit versions of PMOVZXBW and PMOVSXBW. */
   17111 static Long dis_PMOVxXBW_256 ( const VexAbiInfo* vbi, Prefix pfx,
   17112                                Long delta, Bool xIsZ )
   17113 {
   17114    IRTemp addr   = IRTemp_INVALID;
   17115    Int    alen   = 0;
   17116    HChar  dis_buf[50];
   17117    IRTemp srcVec = newTemp(Ity_V128);
   17118    UChar  modrm  = getUChar(delta);
   17119    UChar  how    = xIsZ ? 'z' : 's';
   17120    UInt   rG     = gregOfRexRM(pfx, modrm);
   17121    if ( epartIsReg(modrm) ) {
   17122       UInt rE = eregOfRexRM(pfx, modrm);
   17123       assign( srcVec, getXMMReg(rE) );
   17124       delta += 1;
   17125       DIP( "vpmov%cxbw %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) );
   17126    } else {
   17127       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   17128       assign( srcVec, loadLE( Ity_V128, mkexpr(addr) ) );
   17129       delta += alen;
   17130       DIP( "vpmov%cxbw %s,%s\n", how, dis_buf, nameYMMReg(rG) );
   17131    }
   17132 
   17133    /* First do zero extend.  */
   17134    IRExpr* res
   17135       = binop( Iop_V128HLtoV256,
   17136                binop( Iop_InterleaveHI8x16,
   17137                       IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ),
   17138                binop( Iop_InterleaveLO8x16,
   17139                       IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ) );
   17140    /* And if needed sign extension as well.  */
   17141    if (!xIsZ)
   17142       res = binop( Iop_SarN16x16,
   17143                    binop( Iop_ShlN16x16, res, mkU8(8) ), mkU8(8) );
   17144 
   17145    putYMMReg ( rG, res );
   17146 
   17147    return delta;
   17148 }
   17149 
   17150 
   17151 static Long dis_PMOVxXWD_128 ( const VexAbiInfo* vbi, Prefix pfx,
   17152                                Long delta, Bool isAvx, Bool xIsZ )
   17153 {
   17154    IRTemp addr   = IRTemp_INVALID;
   17155    Int    alen   = 0;
   17156    HChar  dis_buf[50];
   17157    IRTemp srcVec = newTemp(Ity_V128);
   17158    UChar  modrm  = getUChar(delta);
   17159    const HChar* mbV    = isAvx ? "v" : "";
   17160    const HChar  how    = xIsZ ? 'z' : 's';
   17161    UInt   rG     = gregOfRexRM(pfx, modrm);
   17162 
   17163    if ( epartIsReg(modrm) ) {
   17164       UInt rE = eregOfRexRM(pfx, modrm);
   17165       assign( srcVec, getXMMReg(rE) );
   17166       delta += 1;
   17167       DIP( "%spmov%cxwd %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
   17168    } else {
   17169       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   17170       assign( srcVec,
   17171               unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
   17172       delta += alen;
   17173       DIP( "%spmov%cxwd %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
   17174    }
   17175 
   17176    IRExpr* res
   17177       = binop( Iop_InterleaveLO16x8,
   17178                IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) );
   17179    if (!xIsZ)
   17180       res = binop(Iop_SarN32x4,
   17181                   binop(Iop_ShlN32x4, res, mkU8(16)), mkU8(16));
   17182 
   17183    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   17184       ( gregOfRexRM(pfx, modrm), res );
   17185 
   17186    return delta;
   17187 }
   17188 
   17189 
   17190 static Long dis_PMOVxXWD_256 ( const VexAbiInfo* vbi, Prefix pfx,
   17191                                Long delta, Bool xIsZ )
   17192 {
   17193    IRTemp addr   = IRTemp_INVALID;
   17194    Int    alen   = 0;
   17195    HChar  dis_buf[50];
   17196    IRTemp srcVec = newTemp(Ity_V128);
   17197    UChar  modrm  = getUChar(delta);
   17198    UChar  how    = xIsZ ? 'z' : 's';
   17199    UInt   rG     = gregOfRexRM(pfx, modrm);
   17200 
   17201    if ( epartIsReg(modrm) ) {
   17202       UInt rE = eregOfRexRM(pfx, modrm);
   17203       assign( srcVec, getXMMReg(rE) );
   17204       delta += 1;
   17205       DIP( "vpmov%cxwd %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) );
   17206    } else {
   17207       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   17208       assign( srcVec, loadLE( Ity_V128, mkexpr(addr) ) );
   17209       delta += alen;
   17210       DIP( "vpmov%cxwd %s,%s\n", how, dis_buf, nameYMMReg(rG) );
   17211    }
   17212 
   17213    IRExpr* res
   17214       = binop( Iop_V128HLtoV256,
   17215                binop( Iop_InterleaveHI16x8,
   17216                       IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ),
   17217                binop( Iop_InterleaveLO16x8,
   17218                       IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ) );
   17219    if (!xIsZ)
   17220       res = binop(Iop_SarN32x8,
   17221                   binop(Iop_ShlN32x8, res, mkU8(16)), mkU8(16));
   17222 
   17223    putYMMReg ( rG, res );
   17224 
   17225    return delta;
   17226 }
   17227 
   17228 
   17229 static Long dis_PMOVSXWQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
   17230                                Long delta, Bool isAvx )
   17231 {
   17232    IRTemp addr     = IRTemp_INVALID;
   17233    Int    alen     = 0;
   17234    HChar  dis_buf[50];
   17235    IRTemp srcBytes = newTemp(Ity_I32);
   17236    UChar  modrm    = getUChar(delta);
   17237    const HChar* mbV = isAvx ? "v" : "";
   17238    UInt   rG       = gregOfRexRM(pfx, modrm);
   17239 
   17240    if ( epartIsReg( modrm ) ) {
   17241       UInt rE = eregOfRexRM(pfx, modrm);
   17242       assign( srcBytes, getXMMRegLane32( rE, 0 ) );
   17243       delta += 1;
   17244       DIP( "%spmovsxwq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
   17245    } else {
   17246       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   17247       assign( srcBytes, loadLE( Ity_I32, mkexpr(addr) ) );
   17248       delta += alen;
   17249       DIP( "%spmovsxwq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
   17250    }
   17251 
   17252    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   17253       ( rG, binop( Iop_64HLtoV128,
   17254                    unop( Iop_16Sto64,
   17255                          unop( Iop_32HIto16, mkexpr(srcBytes) ) ),
   17256                    unop( Iop_16Sto64,
   17257                          unop( Iop_32to16, mkexpr(srcBytes) ) ) ) );
   17258    return delta;
   17259 }
   17260 
   17261 
   17262 static Long dis_PMOVSXWQ_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta )
   17263 {
   17264    IRTemp addr     = IRTemp_INVALID;
   17265    Int    alen     = 0;
   17266    HChar  dis_buf[50];
   17267    IRTemp srcBytes = newTemp(Ity_I64);
   17268    UChar  modrm    = getUChar(delta);
   17269    UInt   rG       = gregOfRexRM(pfx, modrm);
   17270    IRTemp s3, s2, s1, s0;
   17271    s3 = s2 = s1 = s0 = IRTemp_INVALID;
   17272 
   17273    if ( epartIsReg( modrm ) ) {
   17274       UInt rE = eregOfRexRM(pfx, modrm);
   17275       assign( srcBytes, getXMMRegLane64( rE, 0 ) );
   17276       delta += 1;
   17277       DIP( "vpmovsxwq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) );
   17278    } else {
   17279       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   17280       assign( srcBytes, loadLE( Ity_I64, mkexpr(addr) ) );
   17281       delta += alen;
   17282       DIP( "vpmovsxwq %s,%s\n", dis_buf, nameYMMReg(rG) );
   17283    }
   17284 
   17285    breakup64to16s( srcBytes, &s3, &s2, &s1, &s0 );
   17286    putYMMReg( rG, binop( Iop_V128HLtoV256,
   17287                          binop( Iop_64HLtoV128,
   17288                                 unop( Iop_16Sto64, mkexpr(s3) ),
   17289                                 unop( Iop_16Sto64, mkexpr(s2) ) ),
   17290                          binop( Iop_64HLtoV128,
   17291                                 unop( Iop_16Sto64, mkexpr(s1) ),
   17292                                 unop( Iop_16Sto64, mkexpr(s0) ) ) ) );
   17293    return delta;
   17294 }
   17295 
   17296 
   17297 static Long dis_PMOVZXWQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
   17298                                Long delta, Bool isAvx )
   17299 {
   17300    IRTemp addr     = IRTemp_INVALID;
   17301    Int    alen     = 0;
   17302    HChar  dis_buf[50];
   17303    IRTemp srcVec = newTemp(Ity_V128);
   17304    UChar  modrm    = getUChar(delta);
   17305    const HChar* mbV = isAvx ? "v" : "";
   17306    UInt   rG       = gregOfRexRM(pfx, modrm);
   17307 
   17308    if ( epartIsReg( modrm ) ) {
   17309       UInt rE = eregOfRexRM(pfx, modrm);
   17310       assign( srcVec, getXMMReg(rE) );
   17311       delta += 1;
   17312       DIP( "%spmovzxwq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
   17313    } else {
   17314       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   17315       assign( srcVec,
   17316               unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) ) ) );
   17317       delta += alen;
   17318       DIP( "%spmovzxwq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
   17319    }
   17320 
   17321    IRTemp zeroVec = newTemp( Ity_V128 );
   17322    assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );
   17323 
   17324    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   17325       ( rG, binop( Iop_InterleaveLO16x8,
   17326                    mkexpr(zeroVec),
   17327                    binop( Iop_InterleaveLO16x8,
   17328                           mkexpr(zeroVec), mkexpr(srcVec) ) ) );
   17329    return delta;
   17330 }
   17331 
   17332 
   17333 static Long dis_PMOVZXWQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
   17334                                Long delta )
   17335 {
   17336    IRTemp addr     = IRTemp_INVALID;
   17337    Int    alen     = 0;
   17338    HChar  dis_buf[50];
   17339    IRTemp srcVec = newTemp(Ity_V128);
   17340    UChar  modrm    = getUChar(delta);
   17341    UInt   rG       = gregOfRexRM(pfx, modrm);
   17342 
   17343    if ( epartIsReg( modrm ) ) {
   17344       UInt rE = eregOfRexRM(pfx, modrm);
   17345       assign( srcVec, getXMMReg(rE) );
   17346       delta += 1;
   17347       DIP( "vpmovzxwq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) );
   17348    } else {
   17349       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   17350       assign( srcVec,
   17351               unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
   17352       delta += alen;
   17353       DIP( "vpmovzxwq %s,%s\n", dis_buf, nameYMMReg(rG) );
   17354    }
   17355 
   17356    IRTemp zeroVec = newTemp( Ity_V128 );
   17357    assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );
   17358 
   17359    putYMMReg( rG, binop( Iop_V128HLtoV256,
   17360                          binop( Iop_InterleaveHI16x8,
   17361                                 mkexpr(zeroVec),
   17362                                 binop( Iop_InterleaveLO16x8,
   17363                                        mkexpr(zeroVec), mkexpr(srcVec) ) ),
   17364                          binop( Iop_InterleaveLO16x8,
   17365                                 mkexpr(zeroVec),
   17366                                 binop( Iop_InterleaveLO16x8,
   17367                                        mkexpr(zeroVec), mkexpr(srcVec) ) ) ) );
   17368    return delta;
   17369 }
   17370 
   17371 
   17372 /* Handles 128 bit versions of PMOVZXDQ and PMOVSXDQ. */
   17373 static Long dis_PMOVxXDQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
   17374                                Long delta, Bool isAvx, Bool xIsZ )
   17375 {
   17376    IRTemp addr   = IRTemp_INVALID;
   17377    Int    alen   = 0;
   17378    HChar  dis_buf[50];
   17379    IRTemp srcI64 = newTemp(Ity_I64);
   17380    IRTemp srcVec = newTemp(Ity_V128);
   17381    UChar  modrm  = getUChar(delta);
   17382    const HChar* mbV = isAvx ? "v" : "";
   17383    const HChar  how = xIsZ ? 'z' : 's';
   17384    UInt   rG     = gregOfRexRM(pfx, modrm);
   17385    /* Compute both srcI64 -- the value to expand -- and srcVec -- same
   17386       thing in a V128, with arbitrary junk in the top 64 bits.  Use
   17387       one or both of them and let iropt clean up afterwards (as
   17388       usual). */
   17389    if ( epartIsReg(modrm) ) {
   17390       UInt rE = eregOfRexRM(pfx, modrm);
   17391       assign( srcVec, getXMMReg(rE) );
   17392       assign( srcI64, unop(Iop_V128to64, mkexpr(srcVec)) );
   17393       delta += 1;
   17394       DIP( "%spmov%cxdq %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
   17395    } else {
   17396       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   17397       assign( srcI64, loadLE(Ity_I64, mkexpr(addr)) );
   17398       assign( srcVec, unop( Iop_64UtoV128, mkexpr(srcI64)) );
   17399       delta += alen;
   17400       DIP( "%spmov%cxdq %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
   17401    }
   17402 
   17403    IRExpr* res
   17404       = xIsZ /* do math for either zero or sign extend */
   17405         ? binop( Iop_InterleaveLO32x4,
   17406                  IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) )
   17407         : binop( Iop_64HLtoV128,
   17408                  unop( Iop_32Sto64,
   17409                        unop( Iop_64HIto32, mkexpr(srcI64) ) ),
   17410                  unop( Iop_32Sto64,
   17411                        unop( Iop_64to32, mkexpr(srcI64) ) ) );
   17412 
   17413    (isAvx ? putYMMRegLoAndZU : putXMMReg) ( rG, res );
   17414 
   17415    return delta;
   17416 }
   17417 
   17418 
/* Handles 256 bit versions of PMOVZXDQ and PMOVSXDQ: widen the four
   32-bit lanes of the 128-bit E operand to 64 bits each, with zero
   (xIsZ) or sign (!xIsZ) extension, into YMM register rG. */
static Long dis_PMOVxXDQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool xIsZ )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   IRTemp srcVec = newTemp(Ity_V128);
   UChar  modrm  = getUChar(delta);
   UChar  how    = xIsZ ? 'z' : 's';
   UInt   rG     = gregOfRexRM(pfx, modrm);
   /* Fetch the whole 128-bit source into srcVec, from register or
      memory; unlike the 128-bit variant there is no separate I64
      view, since all four 32-bit lanes are consumed. */
   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcVec, getXMMReg(rE) );
      delta += 1;
      DIP( "vpmov%cxdq %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) );
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcVec, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      DIP( "vpmov%cxdq %s,%s\n", how, dis_buf, nameYMMReg(rG) );
   }

   IRExpr* res;
   if (xIsZ)
      /* Zero extension: interleave each half with zero lanes. */
      res = binop( Iop_V128HLtoV256,
                   binop( Iop_InterleaveHI32x4,
                          IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ),
                   binop( Iop_InterleaveLO32x4,
                          IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ) );
   else {
      /* Sign extension: split into four I32s and widen each to I64. */
      IRTemp s3, s2, s1, s0;
      s3 = s2 = s1 = s0 = IRTemp_INVALID;
      breakupV128to32s( srcVec, &s3, &s2, &s1, &s0 );
      res = binop( Iop_V128HLtoV256,
                   binop( Iop_64HLtoV128,
                          unop( Iop_32Sto64, mkexpr(s3) ),
                          unop( Iop_32Sto64, mkexpr(s2) ) ),
                   binop( Iop_64HLtoV128,
                          unop( Iop_32Sto64, mkexpr(s1) ),
                          unop( Iop_32Sto64, mkexpr(s0) ) ) );
   }

   putYMMReg ( rG, res );

   return delta;
}
   17470 
   17471 
   17472 /* Handles 128 bit versions of PMOVZXBD and PMOVSXBD. */
   17473 static Long dis_PMOVxXBD_128 ( const VexAbiInfo* vbi, Prefix pfx,
   17474                                Long delta, Bool isAvx, Bool xIsZ )
   17475 {
   17476    IRTemp addr   = IRTemp_INVALID;
   17477    Int    alen   = 0;
   17478    HChar  dis_buf[50];
   17479    IRTemp srcVec = newTemp(Ity_V128);
   17480    UChar  modrm  = getUChar(delta);
   17481    const HChar* mbV = isAvx ? "v" : "";
   17482    const HChar  how = xIsZ ? 'z' : 's';
   17483    UInt   rG     = gregOfRexRM(pfx, modrm);
   17484    if ( epartIsReg(modrm) ) {
   17485       UInt rE = eregOfRexRM(pfx, modrm);
   17486       assign( srcVec, getXMMReg(rE) );
   17487       delta += 1;
   17488       DIP( "%spmov%cxbd %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
   17489    } else {
   17490       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   17491       assign( srcVec,
   17492               unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) ) ) );
   17493       delta += alen;
   17494       DIP( "%spmov%cxbd %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
   17495    }
   17496 
   17497    IRTemp zeroVec = newTemp(Ity_V128);
   17498    assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );
   17499 
   17500    IRExpr* res
   17501       = binop(Iop_InterleaveLO8x16,
   17502               mkexpr(zeroVec),
   17503               binop(Iop_InterleaveLO8x16,
   17504                     mkexpr(zeroVec), mkexpr(srcVec)));
   17505    if (!xIsZ)
   17506       res = binop(Iop_SarN32x4,
   17507                   binop(Iop_ShlN32x4, res, mkU8(24)), mkU8(24));
   17508 
   17509    (isAvx ? putYMMRegLoAndZU : putXMMReg) ( rG, res );
   17510 
   17511    return delta;
   17512 }
   17513 
   17514 
   17515 /* Handles 256 bit versions of PMOVZXBD and PMOVSXBD. */
   17516 static Long dis_PMOVxXBD_256 ( const VexAbiInfo* vbi, Prefix pfx,
   17517                                Long delta, Bool xIsZ )
   17518 {
   17519    IRTemp addr   = IRTemp_INVALID;
   17520    Int    alen   = 0;
   17521    HChar  dis_buf[50];
   17522    IRTemp srcVec = newTemp(Ity_V128);
   17523    UChar  modrm  = getUChar(delta);
   17524    UChar  how    = xIsZ ? 'z' : 's';
   17525    UInt   rG     = gregOfRexRM(pfx, modrm);
   17526    if ( epartIsReg(modrm) ) {
   17527       UInt rE = eregOfRexRM(pfx, modrm);
   17528       assign( srcVec, getXMMReg(rE) );
   17529       delta += 1;
   17530       DIP( "vpmov%cxbd %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) );
   17531    } else {
   17532       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   17533       assign( srcVec,
   17534               unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
   17535       delta += alen;
   17536       DIP( "vpmov%cxbd %s,%s\n", how, dis_buf, nameYMMReg(rG) );
   17537    }
   17538 
   17539    IRTemp zeroVec = newTemp(Ity_V128);
   17540    assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );
   17541 
   17542    IRExpr* res
   17543       = binop( Iop_V128HLtoV256,
   17544                binop(Iop_InterleaveHI8x16,
   17545                      mkexpr(zeroVec),
   17546                      binop(Iop_InterleaveLO8x16,
   17547                            mkexpr(zeroVec), mkexpr(srcVec)) ),
   17548                binop(Iop_InterleaveLO8x16,
   17549                      mkexpr(zeroVec),
   17550                      binop(Iop_InterleaveLO8x16,
   17551                            mkexpr(zeroVec), mkexpr(srcVec)) ) );
   17552    if (!xIsZ)
   17553       res = binop(Iop_SarN32x8,
   17554                   binop(Iop_ShlN32x8, res, mkU8(24)), mkU8(24));
   17555 
   17556    putYMMReg ( rG, res );
   17557 
   17558    return delta;
   17559 }
   17560 
   17561 
   17562 /* Handles 128 bit versions of PMOVSXBQ. */
   17563 static Long dis_PMOVSXBQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
   17564                                Long delta, Bool isAvx )
   17565 {
   17566    IRTemp addr     = IRTemp_INVALID;
   17567    Int    alen     = 0;
   17568    HChar  dis_buf[50];
   17569    IRTemp srcBytes = newTemp(Ity_I16);
   17570    UChar  modrm    = getUChar(delta);
   17571    const HChar* mbV = isAvx ? "v" : "";
   17572    UInt   rG       = gregOfRexRM(pfx, modrm);
   17573    if ( epartIsReg(modrm) ) {
   17574       UInt rE = eregOfRexRM(pfx, modrm);
   17575       assign( srcBytes, getXMMRegLane16( rE, 0 ) );
   17576       delta += 1;
   17577       DIP( "%spmovsxbq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
   17578    } else {
   17579       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   17580       assign( srcBytes, loadLE( Ity_I16, mkexpr(addr) ) );
   17581       delta += alen;
   17582       DIP( "%spmovsxbq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
   17583    }
   17584 
   17585    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   17586       ( rG, binop( Iop_64HLtoV128,
   17587                    unop( Iop_8Sto64,
   17588                          unop( Iop_16HIto8, mkexpr(srcBytes) ) ),
   17589                    unop( Iop_8Sto64,
   17590                          unop( Iop_16to8, mkexpr(srcBytes) ) ) ) );
   17591    return delta;
   17592 }
   17593 
   17594 
   17595 /* Handles 256 bit versions of PMOVSXBQ. */
   17596 static Long dis_PMOVSXBQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
   17597                                Long delta )
   17598 {
   17599    IRTemp addr     = IRTemp_INVALID;
   17600    Int    alen     = 0;
   17601    HChar  dis_buf[50];
   17602    IRTemp srcBytes = newTemp(Ity_I32);
   17603    UChar  modrm    = getUChar(delta);
   17604    UInt   rG       = gregOfRexRM(pfx, modrm);
   17605    if ( epartIsReg(modrm) ) {
   17606       UInt rE = eregOfRexRM(pfx, modrm);
   17607       assign( srcBytes, getXMMRegLane32( rE, 0 ) );
   17608       delta += 1;
   17609       DIP( "vpmovsxbq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) );
   17610    } else {
   17611       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   17612       assign( srcBytes, loadLE( Ity_I32, mkexpr(addr) ) );
   17613       delta += alen;
   17614       DIP( "vpmovsxbq %s,%s\n", dis_buf, nameYMMReg(rG) );
   17615    }
   17616 
   17617    putYMMReg
   17618       ( rG, binop( Iop_V128HLtoV256,
   17619                    binop( Iop_64HLtoV128,
   17620                           unop( Iop_8Sto64,
   17621                                 unop( Iop_16HIto8,
   17622                                       unop( Iop_32HIto16,
   17623                                             mkexpr(srcBytes) ) ) ),
   17624                           unop( Iop_8Sto64,
   17625                                 unop( Iop_16to8,
   17626                                       unop( Iop_32HIto16,
   17627                                             mkexpr(srcBytes) ) ) ) ),
   17628                    binop( Iop_64HLtoV128,
   17629                           unop( Iop_8Sto64,
   17630                                 unop( Iop_16HIto8,
   17631                                       unop( Iop_32to16,
   17632                                             mkexpr(srcBytes) ) ) ),
   17633                           unop( Iop_8Sto64,
   17634                                 unop( Iop_16to8,
   17635                                       unop( Iop_32to16,
   17636                                             mkexpr(srcBytes) ) ) ) ) ) );
   17637    return delta;
   17638 }
   17639 
   17640 
   17641 /* Handles 128 bit versions of PMOVZXBQ. */
   17642 static Long dis_PMOVZXBQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
   17643                                Long delta, Bool isAvx )
   17644 {
   17645    IRTemp addr     = IRTemp_INVALID;
   17646    Int    alen     = 0;
   17647    HChar  dis_buf[50];
   17648    IRTemp srcVec   = newTemp(Ity_V128);
   17649    UChar  modrm    = getUChar(delta);
   17650    const HChar* mbV = isAvx ? "v" : "";
   17651    UInt   rG       = gregOfRexRM(pfx, modrm);
   17652    if ( epartIsReg(modrm) ) {
   17653       UInt rE = eregOfRexRM(pfx, modrm);
   17654       assign( srcVec, getXMMReg(rE) );
   17655       delta += 1;
   17656       DIP( "%spmovzxbq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
   17657    } else {
   17658       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   17659       assign( srcVec,
   17660               unop( Iop_32UtoV128,
   17661                     unop( Iop_16Uto32, loadLE( Ity_I16, mkexpr(addr) ))));
   17662       delta += alen;
   17663       DIP( "%spmovzxbq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
   17664    }
   17665 
   17666    IRTemp zeroVec = newTemp(Ity_V128);
   17667    assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );
   17668 
   17669    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   17670       ( rG, binop( Iop_InterleaveLO8x16,
   17671                    mkexpr(zeroVec),
   17672                    binop( Iop_InterleaveLO8x16,
   17673                           mkexpr(zeroVec),
   17674                           binop( Iop_InterleaveLO8x16,
   17675                                  mkexpr(zeroVec), mkexpr(srcVec) ) ) ) );
   17676    return delta;
   17677 }
   17678 
   17679 
   17680 /* Handles 256 bit versions of PMOVZXBQ. */
   17681 static Long dis_PMOVZXBQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
   17682                                Long delta )
   17683 {
   17684    IRTemp addr     = IRTemp_INVALID;
   17685    Int    alen     = 0;
   17686    HChar  dis_buf[50];
   17687    IRTemp srcVec   = newTemp(Ity_V128);
   17688    UChar  modrm    = getUChar(delta);
   17689    UInt   rG       = gregOfRexRM(pfx, modrm);
   17690    if ( epartIsReg(modrm) ) {
   17691       UInt rE = eregOfRexRM(pfx, modrm);
   17692       assign( srcVec, getXMMReg(rE) );
   17693       delta += 1;
   17694       DIP( "vpmovzxbq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) );
   17695    } else {
   17696       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   17697       assign( srcVec,
   17698               unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) )));
   17699       delta += alen;
   17700       DIP( "vpmovzxbq %s,%s\n", dis_buf, nameYMMReg(rG) );
   17701    }
   17702 
   17703    IRTemp zeroVec = newTemp(Ity_V128);
   17704    assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );
   17705 
   17706    putYMMReg
   17707       ( rG, binop( Iop_V128HLtoV256,
   17708                    binop( Iop_InterleaveHI8x16,
   17709                           mkexpr(zeroVec),
   17710                           binop( Iop_InterleaveLO8x16,
   17711                                  mkexpr(zeroVec),
   17712                                  binop( Iop_InterleaveLO8x16,
   17713                                         mkexpr(zeroVec), mkexpr(srcVec) ) ) ),
   17714                    binop( Iop_InterleaveLO8x16,
   17715                           mkexpr(zeroVec),
   17716                           binop( Iop_InterleaveLO8x16,
   17717                                  mkexpr(zeroVec),
   17718                                  binop( Iop_InterleaveLO8x16,
   17719                                         mkexpr(zeroVec), mkexpr(srcVec) ) ) )
   17720                  ) );
   17721    return delta;
   17722 }
   17723 
   17724 
   17725 static Long dis_PHMINPOSUW_128 ( const VexAbiInfo* vbi, Prefix pfx,
   17726                                  Long delta, Bool isAvx )
   17727 {
   17728    IRTemp addr   = IRTemp_INVALID;
   17729    Int    alen   = 0;
   17730    HChar  dis_buf[50];
   17731    UChar  modrm  = getUChar(delta);
   17732    const HChar* mbV = isAvx ? "v" : "";
   17733    IRTemp sV     = newTemp(Ity_V128);
   17734    IRTemp sHi    = newTemp(Ity_I64);
   17735    IRTemp sLo    = newTemp(Ity_I64);
   17736    IRTemp dLo    = newTemp(Ity_I64);
   17737    UInt   rG     = gregOfRexRM(pfx,modrm);
   17738    if (epartIsReg(modrm)) {
   17739       UInt rE = eregOfRexRM(pfx,modrm);
   17740       assign( sV, getXMMReg(rE) );
   17741       delta += 1;
   17742       DIP("%sphminposuw %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG));
   17743    } else {
   17744       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   17745       if (!isAvx)
   17746          gen_SEGV_if_not_16_aligned(addr);
   17747       assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   17748       delta += alen;
   17749       DIP("%sphminposuw %s,%s\n", mbV, dis_buf, nameXMMReg(rG));
   17750    }
   17751    assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   17752    assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   17753    assign( dLo, mkIRExprCCall(
   17754                    Ity_I64, 0/*regparms*/,
   17755                    "amd64g_calculate_sse_phminposuw",
   17756                    &amd64g_calculate_sse_phminposuw,
   17757                    mkIRExprVec_2( mkexpr(sLo), mkexpr(sHi) )
   17758          ));
   17759    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   17760       (rG, unop(Iop_64UtoV128, mkexpr(dLo)));
   17761    return delta;
   17762 }
   17763 
   17764 
   17765 static Long dis_AESx ( const VexAbiInfo* vbi, Prefix pfx,
   17766                        Long delta, Bool isAvx, UChar opc )
   17767 {
   17768    IRTemp addr   = IRTemp_INVALID;
   17769    Int    alen   = 0;
   17770    HChar  dis_buf[50];
   17771    UChar  modrm  = getUChar(delta);
   17772    UInt   rG     = gregOfRexRM(pfx, modrm);
   17773    UInt   regNoL = 0;
   17774    UInt   regNoR = (isAvx && opc != 0xDB) ? getVexNvvvv(pfx) : rG;
   17775 
   17776    /* This is a nasty kludge.  We need to pass 2 x V128 to the
   17777       helper.  Since we can't do that, use a dirty
   17778       helper to compute the results directly from the XMM regs in
   17779       the guest state.  That means for the memory case, we need to
   17780       move the left operand into a pseudo-register (XMM16, let's
   17781       call it). */
   17782    if (epartIsReg(modrm)) {
   17783       regNoL = eregOfRexRM(pfx, modrm);
   17784       delta += 1;
   17785    } else {
   17786       regNoL = 16; /* use XMM16 as an intermediary */
   17787       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   17788       /* alignment check needed ???? */
   17789       stmt( IRStmt_Put( OFFB_YMM16, loadLE(Ity_V128, mkexpr(addr)) ));
   17790       delta += alen;
   17791    }
   17792 
   17793    void*  fn = &amd64g_dirtyhelper_AES;
   17794    const HChar* nm = "amd64g_dirtyhelper_AES";
   17795 
   17796    /* Round up the arguments.  Note that this is a kludge -- the
   17797       use of mkU64 rather than mkIRExpr_HWord implies the
   17798       assumption that the host's word size is 64-bit. */
   17799    UInt gstOffD = ymmGuestRegOffset(rG);
   17800    UInt gstOffL = regNoL == 16 ? OFFB_YMM16 : ymmGuestRegOffset(regNoL);
   17801    UInt gstOffR = ymmGuestRegOffset(regNoR);
   17802    IRExpr*  opc4         = mkU64(opc);
   17803    IRExpr*  gstOffDe     = mkU64(gstOffD);
   17804    IRExpr*  gstOffLe     = mkU64(gstOffL);
   17805    IRExpr*  gstOffRe     = mkU64(gstOffR);
   17806    IRExpr** args
   17807       = mkIRExprVec_5( IRExpr_GSPTR(), opc4, gstOffDe, gstOffLe, gstOffRe );
   17808 
   17809    IRDirty* d    = unsafeIRDirty_0_N( 0/*regparms*/, nm, fn, args );
   17810    /* It's not really a dirty call, but we can't use the clean helper
   17811       mechanism here for the very lame reason that we can't pass 2 x
   17812       V128s by value to a helper.  Hence this roundabout scheme. */
   17813    d->nFxState = 2;
   17814    vex_bzero(&d->fxState, sizeof(d->fxState));
   17815    /* AES{ENC,ENCLAST,DEC,DECLAST} read both registers, and writes
   17816       the second for !isAvx or the third for isAvx.
   17817       AESIMC (0xDB) reads the first register, and writes the second. */
   17818    d->fxState[0].fx     = Ifx_Read;
   17819    d->fxState[0].offset = gstOffL;
   17820    d->fxState[0].size   = sizeof(U128);
   17821    d->fxState[1].offset = gstOffR;
   17822    d->fxState[1].size   = sizeof(U128);
   17823    if (opc == 0xDB)
   17824       d->fxState[1].fx   = Ifx_Write;
   17825    else if (!isAvx || rG == regNoR)
   17826       d->fxState[1].fx   = Ifx_Modify;
   17827    else {
   17828       d->fxState[1].fx     = Ifx_Read;
   17829       d->nFxState++;
   17830       d->fxState[2].fx     = Ifx_Write;
   17831       d->fxState[2].offset = gstOffD;
   17832       d->fxState[2].size   = sizeof(U128);
   17833    }
   17834 
   17835    stmt( IRStmt_Dirty(d) );
   17836    {
   17837       const HChar* opsuf;
   17838       switch (opc) {
   17839          case 0xDC: opsuf = "enc"; break;
   17840          case 0XDD: opsuf = "enclast"; break;
   17841          case 0xDE: opsuf = "dec"; break;
   17842          case 0xDF: opsuf = "declast"; break;
   17843          case 0xDB: opsuf = "imc"; break;
   17844          default: vassert(0);
   17845       }
   17846       DIP("%saes%s %s,%s%s%s\n", isAvx ? "v" : "", opsuf,
   17847           (regNoL == 16 ? dis_buf : nameXMMReg(regNoL)),
   17848           nameXMMReg(regNoR),
   17849           (isAvx && opc != 0xDB) ? "," : "",
   17850           (isAvx && opc != 0xDB) ? nameXMMReg(rG) : "");
   17851    }
   17852    if (isAvx)
   17853       putYMMRegLane128( rG, 1, mkV128(0) );
   17854    return delta;
   17855 }
   17856 
   17857 static Long dis_AESKEYGENASSIST ( const VexAbiInfo* vbi, Prefix pfx,
   17858                                   Long delta, Bool isAvx )
   17859 {
   17860    IRTemp addr   = IRTemp_INVALID;
   17861    Int    alen   = 0;
   17862    HChar  dis_buf[50];
   17863    UChar  modrm  = getUChar(delta);
   17864    UInt   regNoL = 0;
   17865    UInt   regNoR = gregOfRexRM(pfx, modrm);
   17866    UChar  imm    = 0;
   17867 
   17868    /* This is a nasty kludge.  See AESENC et al. instructions. */
   17869    modrm = getUChar(delta);
   17870    if (epartIsReg(modrm)) {
   17871       regNoL = eregOfRexRM(pfx, modrm);
   17872       imm = getUChar(delta+1);
   17873       delta += 1+1;
   17874    } else {
   17875       regNoL = 16; /* use XMM16 as an intermediary */
   17876       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   17877       /* alignment check ???? . */
   17878       stmt( IRStmt_Put( OFFB_YMM16, loadLE(Ity_V128, mkexpr(addr)) ));
   17879       imm = getUChar(delta+alen);
   17880       delta += alen+1;
   17881    }
   17882 
   17883    /* Who ya gonna call?  Presumably not Ghostbusters. */
   17884    void*  fn = &amd64g_dirtyhelper_AESKEYGENASSIST;
   17885    const HChar* nm = "amd64g_dirtyhelper_AESKEYGENASSIST";
   17886 
   17887    /* Round up the arguments.  Note that this is a kludge -- the
   17888       use of mkU64 rather than mkIRExpr_HWord implies the
   17889       assumption that the host's word size is 64-bit. */
   17890    UInt gstOffL = regNoL == 16 ? OFFB_YMM16 : ymmGuestRegOffset(regNoL);
   17891    UInt gstOffR = ymmGuestRegOffset(regNoR);
   17892 
   17893    IRExpr*  imme          = mkU64(imm & 0xFF);
   17894    IRExpr*  gstOffLe     = mkU64(gstOffL);
   17895    IRExpr*  gstOffRe     = mkU64(gstOffR);
   17896    IRExpr** args
   17897       = mkIRExprVec_4( IRExpr_GSPTR(), imme, gstOffLe, gstOffRe );
   17898 
   17899    IRDirty* d    = unsafeIRDirty_0_N( 0/*regparms*/, nm, fn, args );
   17900    /* It's not really a dirty call, but we can't use the clean helper
   17901       mechanism here for the very lame reason that we can't pass 2 x
   17902       V128s by value to a helper.  Hence this roundabout scheme. */
   17903    d->nFxState = 2;
   17904    vex_bzero(&d->fxState, sizeof(d->fxState));
   17905    d->fxState[0].fx     = Ifx_Read;
   17906    d->fxState[0].offset = gstOffL;
   17907    d->fxState[0].size   = sizeof(U128);
   17908    d->fxState[1].fx     = Ifx_Write;
   17909    d->fxState[1].offset = gstOffR;
   17910    d->fxState[1].size   = sizeof(U128);
   17911    stmt( IRStmt_Dirty(d) );
   17912 
   17913    DIP("%saeskeygenassist $%x,%s,%s\n", isAvx ? "v" : "", (UInt)imm,
   17914        (regNoL == 16 ? dis_buf : nameXMMReg(regNoL)),
   17915        nameXMMReg(regNoR));
   17916    if (isAvx)
   17917       putYMMRegLane128( regNoR, 1, mkV128(0) );
   17918    return delta;
   17919 }
   17920 
   17921 
   17922 __attribute__((noinline))
   17923 static
   17924 Long dis_ESC_0F38__SSE4 ( Bool* decode_OK,
   17925                           const VexAbiInfo* vbi,
   17926                           Prefix pfx, Int sz, Long deltaIN )
   17927 {
   17928    IRTemp addr  = IRTemp_INVALID;
   17929    UChar  modrm = 0;
   17930    Int    alen  = 0;
   17931    HChar  dis_buf[50];
   17932 
   17933    *decode_OK = False;
   17934 
   17935    Long   delta = deltaIN;
   17936    UChar  opc   = getUChar(delta);
   17937    delta++;
   17938    switch (opc) {
   17939 
   17940    case 0x10:
   17941    case 0x14:
   17942    case 0x15:
   17943       /* 66 0F 38 10 /r = PBLENDVB xmm1, xmm2/m128  (byte gran)
   17944          66 0F 38 14 /r = BLENDVPS xmm1, xmm2/m128  (float gran)
   17945          66 0F 38 15 /r = BLENDVPD xmm1, xmm2/m128  (double gran)
   17946          Blend at various granularities, with XMM0 (implicit operand)
   17947          providing the controlling mask.
   17948       */
   17949       if (have66noF2noF3(pfx) && sz == 2) {
   17950          modrm = getUChar(delta);
   17951 
   17952          const HChar* nm    = NULL;
   17953          UInt   gran  = 0;
   17954          IROp   opSAR = Iop_INVALID;
   17955          switch (opc) {
   17956             case 0x10:
   17957                nm = "pblendvb"; gran = 1; opSAR = Iop_SarN8x16;
   17958                break;
   17959             case 0x14:
   17960                nm = "blendvps"; gran = 4; opSAR = Iop_SarN32x4;
   17961                break;
   17962             case 0x15:
   17963                nm = "blendvpd"; gran = 8; opSAR = Iop_SarN64x2;
   17964                break;
   17965          }
   17966          vassert(nm);
   17967 
   17968          IRTemp vecE = newTemp(Ity_V128);
   17969          IRTemp vecG = newTemp(Ity_V128);
   17970          IRTemp vec0 = newTemp(Ity_V128);
   17971 
   17972          if ( epartIsReg(modrm) ) {
   17973             assign(vecE, getXMMReg(eregOfRexRM(pfx, modrm)));
   17974             delta += 1;
   17975             DIP( "%s %s,%s\n", nm,
   17976                  nameXMMReg( eregOfRexRM(pfx, modrm) ),
   17977                  nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   17978          } else {
   17979             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   17980             gen_SEGV_if_not_16_aligned( addr );
   17981             assign(vecE, loadLE( Ity_V128, mkexpr(addr) ));
   17982             delta += alen;
   17983             DIP( "%s %s,%s\n", nm,
   17984                  dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   17985          }
   17986 
   17987          assign(vecG, getXMMReg(gregOfRexRM(pfx, modrm)));
   17988          assign(vec0, getXMMReg(0));
   17989 
   17990          IRTemp res = math_PBLENDVB_128( vecE, vecG, vec0, gran, opSAR );
   17991          putXMMReg(gregOfRexRM(pfx, modrm), mkexpr(res));
   17992 
   17993          goto decode_success;
   17994       }
   17995       break;
   17996 
   17997    case 0x17:
   17998       /* 66 0F 38 17 /r = PTEST xmm1, xmm2/m128
   17999          Logical compare (set ZF and CF from AND/ANDN of the operands) */
   18000       if (have66noF2noF3(pfx)
   18001           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   18002          delta = dis_xTESTy_128( vbi, pfx, delta, False/*!isAvx*/, 0 );
   18003          goto decode_success;
   18004       }
   18005       break;
   18006 
   18007    case 0x20:
   18008       /* 66 0F 38 20 /r = PMOVSXBW xmm1, xmm2/m64
   18009          Packed Move with Sign Extend from Byte to Word (XMM) */
   18010       if (have66noF2noF3(pfx) && sz == 2) {
   18011          delta = dis_PMOVxXBW_128( vbi, pfx, delta,
   18012                                    False/*!isAvx*/, False/*!xIsZ*/ );
   18013          goto decode_success;
   18014       }
   18015       break;
   18016 
   18017    case 0x21:
   18018       /* 66 0F 38 21 /r = PMOVSXBD xmm1, xmm2/m32
   18019          Packed Move with Sign Extend from Byte to DWord (XMM) */
   18020       if (have66noF2noF3(pfx) && sz == 2) {
   18021          delta = dis_PMOVxXBD_128( vbi, pfx, delta,
   18022                                    False/*!isAvx*/, False/*!xIsZ*/ );
   18023          goto decode_success;
   18024       }
   18025       break;
   18026 
   18027    case 0x22:
   18028       /* 66 0F 38 22 /r = PMOVSXBQ xmm1, xmm2/m16
   18029          Packed Move with Sign Extend from Byte to QWord (XMM) */
   18030       if (have66noF2noF3(pfx) && sz == 2) {
   18031          delta = dis_PMOVSXBQ_128( vbi, pfx, delta, False/*!isAvx*/ );
   18032          goto decode_success;
   18033       }
   18034       break;
   18035 
   18036    case 0x23:
   18037       /* 66 0F 38 23 /r = PMOVSXWD xmm1, xmm2/m64
   18038          Packed Move with Sign Extend from Word to DWord (XMM) */
   18039       if (have66noF2noF3(pfx) && sz == 2) {
   18040          delta = dis_PMOVxXWD_128(vbi, pfx, delta,
   18041                                   False/*!isAvx*/, False/*!xIsZ*/);
   18042          goto decode_success;
   18043       }
   18044       break;
   18045 
   18046    case 0x24:
   18047       /* 66 0F 38 24 /r = PMOVSXWQ xmm1, xmm2/m32
   18048          Packed Move with Sign Extend from Word to QWord (XMM) */
   18049       if (have66noF2noF3(pfx) && sz == 2) {
   18050          delta = dis_PMOVSXWQ_128( vbi, pfx, delta, False/*!isAvx*/ );
   18051          goto decode_success;
   18052       }
   18053       break;
   18054 
   18055    case 0x25:
   18056       /* 66 0F 38 25 /r = PMOVSXDQ xmm1, xmm2/m64
   18057          Packed Move with Sign Extend from Double Word to Quad Word (XMM) */
   18058       if (have66noF2noF3(pfx) && sz == 2) {
   18059          delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
   18060                                    False/*!isAvx*/, False/*!xIsZ*/ );
   18061          goto decode_success;
   18062       }
   18063       break;
   18064 
   18065    case 0x28:
   18066       /* 66 0F 38 28 = PMULDQ -- signed widening multiply of 32-lanes
   18067          0 x 0 to form lower 64-bit half and lanes 2 x 2 to form upper
   18068          64-bit half */
   18069       /* This is a really poor translation -- could be improved if
   18070          performance critical.  It's a copy-paste of PMULUDQ, too. */
   18071       if (have66noF2noF3(pfx) && sz == 2) {
   18072          IRTemp sV = newTemp(Ity_V128);
   18073          IRTemp dV = newTemp(Ity_V128);
   18074          modrm = getUChar(delta);
   18075          UInt rG = gregOfRexRM(pfx,modrm);
   18076          assign( dV, getXMMReg(rG) );
   18077          if (epartIsReg(modrm)) {
   18078             UInt rE = eregOfRexRM(pfx,modrm);
   18079             assign( sV, getXMMReg(rE) );
   18080             delta += 1;
   18081             DIP("pmuldq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   18082          } else {
   18083             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   18084             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   18085             delta += alen;
   18086             DIP("pmuldq %s,%s\n", dis_buf, nameXMMReg(rG));
   18087          }
   18088 
   18089          putXMMReg( rG, mkexpr(math_PMULDQ_128( dV, sV )) );
   18090          goto decode_success;
   18091       }
   18092       break;
   18093 
   18094    case 0x29:
   18095       /* 66 0F 38 29 = PCMPEQQ
   18096          64x2 equality comparison */
   18097       if (have66noF2noF3(pfx) && sz == 2) {
   18098          /* FIXME: this needs an alignment check */
   18099          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   18100                                     "pcmpeqq", Iop_CmpEQ64x2, False );
   18101          goto decode_success;
   18102       }
   18103       break;
   18104 
   18105    case 0x2A:
   18106       /* 66 0F 38 2A = MOVNTDQA
   18107          "non-temporal" "streaming" load
   18108          Handle like MOVDQA but only memory operand is allowed */
   18109       if (have66noF2noF3(pfx) && sz == 2) {
   18110          modrm = getUChar(delta);
   18111          if (!epartIsReg(modrm)) {
   18112             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   18113             gen_SEGV_if_not_16_aligned( addr );
   18114             putXMMReg( gregOfRexRM(pfx,modrm),
   18115                        loadLE(Ity_V128, mkexpr(addr)) );
   18116             DIP("movntdqa %s,%s\n", dis_buf,
   18117                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   18118             delta += alen;
   18119             goto decode_success;
   18120          }
   18121       }
   18122       break;
   18123 
   18124    case 0x2B:
   18125       /* 66 0f 38 2B /r = PACKUSDW xmm1, xmm2/m128
   18126          2x 32x4 S->U saturating narrow from xmm2/m128 to xmm1 */
   18127       if (have66noF2noF3(pfx) && sz == 2) {
   18128 
   18129          modrm = getUChar(delta);
   18130 
   18131          IRTemp argL = newTemp(Ity_V128);
   18132          IRTemp argR = newTemp(Ity_V128);
   18133 
   18134          if ( epartIsReg(modrm) ) {
   18135             assign( argL, getXMMReg( eregOfRexRM(pfx, modrm) ) );
   18136             delta += 1;
   18137             DIP( "packusdw %s,%s\n",
   18138                  nameXMMReg( eregOfRexRM(pfx, modrm) ),
   18139                  nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   18140          } else {
   18141             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   18142             gen_SEGV_if_not_16_aligned( addr );
   18143             assign( argL, loadLE( Ity_V128, mkexpr(addr) ));
   18144             delta += alen;
   18145             DIP( "packusdw %s,%s\n",
   18146                  dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   18147          }
   18148 
   18149          assign(argR, getXMMReg( gregOfRexRM(pfx, modrm) ));
   18150 
   18151          putXMMReg( gregOfRexRM(pfx, modrm),
   18152                     binop( Iop_QNarrowBin32Sto16Ux8,
   18153                            mkexpr(argL), mkexpr(argR)) );
   18154 
   18155          goto decode_success;
   18156       }
   18157       break;
   18158 
   18159    case 0x30:
   18160       /* 66 0F 38 30 /r = PMOVZXBW xmm1, xmm2/m64
   18161          Packed Move with Zero Extend from Byte to Word (XMM) */
   18162       if (have66noF2noF3(pfx) && sz == 2) {
   18163          delta = dis_PMOVxXBW_128( vbi, pfx, delta,
   18164                                    False/*!isAvx*/, True/*xIsZ*/ );
   18165          goto decode_success;
   18166       }
   18167       break;
   18168 
   18169    case 0x31:
   18170       /* 66 0F 38 31 /r = PMOVZXBD xmm1, xmm2/m32
   18171          Packed Move with Zero Extend from Byte to DWord (XMM) */
   18172       if (have66noF2noF3(pfx) && sz == 2) {
   18173          delta = dis_PMOVxXBD_128( vbi, pfx, delta,
   18174                                    False/*!isAvx*/, True/*xIsZ*/ );
   18175          goto decode_success;
   18176       }
   18177       break;
   18178 
   18179    case 0x32:
   18180       /* 66 0F 38 32 /r = PMOVZXBQ xmm1, xmm2/m16
   18181          Packed Move with Zero Extend from Byte to QWord (XMM) */
   18182       if (have66noF2noF3(pfx) && sz == 2) {
   18183          delta = dis_PMOVZXBQ_128( vbi, pfx, delta, False/*!isAvx*/ );
   18184          goto decode_success;
   18185       }
   18186       break;
   18187 
   18188    case 0x33:
   18189       /* 66 0F 38 33 /r = PMOVZXWD xmm1, xmm2/m64
   18190          Packed Move with Zero Extend from Word to DWord (XMM) */
   18191       if (have66noF2noF3(pfx) && sz == 2) {
   18192          delta = dis_PMOVxXWD_128( vbi, pfx, delta,
   18193                                    False/*!isAvx*/, True/*xIsZ*/ );
   18194          goto decode_success;
   18195       }
   18196       break;
   18197 
   18198    case 0x34:
   18199       /* 66 0F 38 34 /r = PMOVZXWQ xmm1, xmm2/m32
   18200          Packed Move with Zero Extend from Word to QWord (XMM) */
   18201       if (have66noF2noF3(pfx) && sz == 2) {
   18202          delta = dis_PMOVZXWQ_128( vbi, pfx, delta, False/*!isAvx*/ );
   18203          goto decode_success;
   18204       }
   18205       break;
   18206 
   18207    case 0x35:
   18208       /* 66 0F 38 35 /r = PMOVZXDQ xmm1, xmm2/m64
   18209          Packed Move with Zero Extend from DWord to QWord (XMM) */
   18210       if (have66noF2noF3(pfx) && sz == 2) {
   18211          delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
   18212                                    False/*!isAvx*/, True/*xIsZ*/ );
   18213          goto decode_success;
   18214       }
   18215       break;
   18216 
   18217    case 0x37:
   18218       /* 66 0F 38 37 = PCMPGTQ
   18219          64x2 comparison (signed, presumably; the Intel docs don't say :-)
   18220       */
   18221       if (have66noF2noF3(pfx) && sz == 2) {
   18222          /* FIXME: this needs an alignment check */
   18223          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   18224                                     "pcmpgtq", Iop_CmpGT64Sx2, False );
   18225          goto decode_success;
   18226       }
   18227       break;
   18228 
   18229    case 0x38:
   18230    case 0x3C:
   18231       /* 66 0F 38 38 /r = PMINSB xmm1, xmm2/m128    8Sx16 (signed) min
   18232          66 0F 38 3C /r = PMAXSB xmm1, xmm2/m128    8Sx16 (signed) max
   18233       */
   18234       if (have66noF2noF3(pfx) && sz == 2) {
   18235          /* FIXME: this needs an alignment check */
   18236          Bool isMAX = opc == 0x3C;
   18237          delta = dis_SSEint_E_to_G(
   18238                     vbi, pfx, delta,
   18239                     isMAX ? "pmaxsb" : "pminsb",
   18240                     isMAX ? Iop_Max8Sx16 : Iop_Min8Sx16,
   18241                     False
   18242                  );
   18243          goto decode_success;
   18244       }
   18245       break;
   18246 
   18247    case 0x39:
   18248    case 0x3D:
   18249       /* 66 0F 38 39 /r = PMINSD xmm1, xmm2/m128
   18250          Minimum of Packed Signed Double Word Integers (XMM)
   18251          66 0F 38 3D /r = PMAXSD xmm1, xmm2/m128
   18252          Maximum of Packed Signed Double Word Integers (XMM)
   18253       */
   18254       if (have66noF2noF3(pfx) && sz == 2) {
   18255          /* FIXME: this needs an alignment check */
   18256          Bool isMAX = opc == 0x3D;
   18257          delta = dis_SSEint_E_to_G(
   18258                     vbi, pfx, delta,
   18259                     isMAX ? "pmaxsd" : "pminsd",
   18260                     isMAX ? Iop_Max32Sx4 : Iop_Min32Sx4,
   18261                     False
   18262                  );
   18263          goto decode_success;
   18264       }
   18265       break;
   18266 
   18267    case 0x3A:
   18268    case 0x3E:
   18269       /* 66 0F 38 3A /r = PMINUW xmm1, xmm2/m128
   18270          Minimum of Packed Unsigned Word Integers (XMM)
   18271          66 0F 38 3E /r = PMAXUW xmm1, xmm2/m128
   18272          Maximum of Packed Unsigned Word Integers (XMM)
   18273       */
   18274       if (have66noF2noF3(pfx) && sz == 2) {
   18275          /* FIXME: this needs an alignment check */
   18276          Bool isMAX = opc == 0x3E;
   18277          delta = dis_SSEint_E_to_G(
   18278                     vbi, pfx, delta,
   18279                     isMAX ? "pmaxuw" : "pminuw",
   18280                     isMAX ? Iop_Max16Ux8 : Iop_Min16Ux8,
   18281                     False
   18282                  );
   18283          goto decode_success;
   18284       }
   18285       break;
   18286 
   18287    case 0x3B:
   18288    case 0x3F:
   18289       /* 66 0F 38 3B /r = PMINUD xmm1, xmm2/m128
   18290          Minimum of Packed Unsigned Doubleword Integers (XMM)
   18291          66 0F 38 3F /r = PMAXUD xmm1, xmm2/m128
   18292          Maximum of Packed Unsigned Doubleword Integers (XMM)
   18293       */
   18294       if (have66noF2noF3(pfx) && sz == 2) {
   18295          /* FIXME: this needs an alignment check */
   18296          Bool isMAX = opc == 0x3F;
   18297          delta = dis_SSEint_E_to_G(
   18298                     vbi, pfx, delta,
   18299                     isMAX ? "pmaxud" : "pminud",
   18300                     isMAX ? Iop_Max32Ux4 : Iop_Min32Ux4,
   18301                     False
   18302                  );
   18303          goto decode_success;
   18304       }
   18305       break;
   18306 
   18307    case 0x40:
   18308       /* 66 0F 38 40 /r = PMULLD xmm1, xmm2/m128
   18309          32x4 integer multiply from xmm2/m128 to xmm1 */
   18310       if (have66noF2noF3(pfx) && sz == 2) {
   18311 
   18312          modrm = getUChar(delta);
   18313 
   18314          IRTemp argL = newTemp(Ity_V128);
   18315          IRTemp argR = newTemp(Ity_V128);
   18316 
   18317          if ( epartIsReg(modrm) ) {
   18318             assign( argL, getXMMReg( eregOfRexRM(pfx, modrm) ) );
   18319             delta += 1;
   18320             DIP( "pmulld %s,%s\n",
   18321                  nameXMMReg( eregOfRexRM(pfx, modrm) ),
   18322                  nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   18323          } else {
   18324             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   18325             gen_SEGV_if_not_16_aligned( addr );
   18326             assign( argL, loadLE( Ity_V128, mkexpr(addr) ));
   18327             delta += alen;
   18328             DIP( "pmulld %s,%s\n",
   18329                  dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   18330          }
   18331 
   18332          assign(argR, getXMMReg( gregOfRexRM(pfx, modrm) ));
   18333 
   18334          putXMMReg( gregOfRexRM(pfx, modrm),
   18335                     binop( Iop_Mul32x4, mkexpr(argL), mkexpr(argR)) );
   18336 
   18337          goto decode_success;
   18338       }
   18339       break;
   18340 
   18341    case 0x41:
   18342       /* 66 0F 38 41 /r = PHMINPOSUW xmm1, xmm2/m128
   18343          Packed Horizontal Word Minimum from xmm2/m128 to xmm1 */
   18344       if (have66noF2noF3(pfx) && sz == 2) {
   18345          delta = dis_PHMINPOSUW_128( vbi, pfx, delta, False/*!isAvx*/ );
   18346          goto decode_success;
   18347       }
   18348       break;
   18349 
   18350    case 0xDC:
   18351    case 0xDD:
   18352    case 0xDE:
   18353    case 0xDF:
   18354    case 0xDB:
   18355       /* 66 0F 38 DC /r = AESENC xmm1, xmm2/m128
   18356                   DD /r = AESENCLAST xmm1, xmm2/m128
   18357                   DE /r = AESDEC xmm1, xmm2/m128
   18358                   DF /r = AESDECLAST xmm1, xmm2/m128
   18359 
   18360                   DB /r = AESIMC xmm1, xmm2/m128 */
   18361       if (have66noF2noF3(pfx) && sz == 2) {
   18362          delta = dis_AESx( vbi, pfx, delta, False/*!isAvx*/, opc );
   18363          goto decode_success;
   18364       }
   18365       break;
   18366 
   18367    case 0xF0:
   18368    case 0xF1:
   18369       /* F2 0F 38 F0 /r = CRC32 r/m8, r32 (REX.W ok, 66 not ok)
   18370          F2 0F 38 F1 /r = CRC32 r/m{16,32,64}, r32
   18371          The decoding on this is a bit unusual.
   18372       */
   18373       if (haveF2noF3(pfx)
   18374           && (opc == 0xF1 || (opc == 0xF0 && !have66(pfx)))) {
   18375          modrm = getUChar(delta);
   18376 
   18377          if (opc == 0xF0)
   18378             sz = 1;
   18379          else
   18380             vassert(sz == 2 || sz == 4 || sz == 8);
   18381 
   18382          IRType tyE = szToITy(sz);
   18383          IRTemp valE = newTemp(tyE);
   18384 
   18385          if (epartIsReg(modrm)) {
   18386             assign(valE, getIRegE(sz, pfx, modrm));
   18387             delta += 1;
   18388             DIP("crc32b %s,%s\n", nameIRegE(sz, pfx, modrm),
   18389                 nameIRegG(1==getRexW(pfx) ? 8 : 4, pfx, modrm));
   18390          } else {
   18391             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   18392             assign(valE, loadLE(tyE, mkexpr(addr)));
   18393             delta += alen;
   18394             DIP("crc32b %s,%s\n", dis_buf,
   18395                 nameIRegG(1==getRexW(pfx) ? 8 : 4, pfx, modrm));
   18396          }
   18397 
   18398          /* Somewhat funny getting/putting of the crc32 value, in order
   18399             to ensure that it turns into 64-bit gets and puts.  However,
   18400             mask off the upper 32 bits so as to not get memcheck false
   18401             +ves around the helper call. */
   18402          IRTemp valG0 = newTemp(Ity_I64);
   18403          assign(valG0, binop(Iop_And64, getIRegG(8, pfx, modrm),
   18404                              mkU64(0xFFFFFFFF)));
   18405 
   18406          const HChar* nm = NULL;
   18407          void*  fn = NULL;
   18408          switch (sz) {
   18409             case 1: nm = "amd64g_calc_crc32b";
   18410                     fn = &amd64g_calc_crc32b; break;
   18411             case 2: nm = "amd64g_calc_crc32w";
   18412                     fn = &amd64g_calc_crc32w; break;
   18413             case 4: nm = "amd64g_calc_crc32l";
   18414                     fn = &amd64g_calc_crc32l; break;
   18415             case 8: nm = "amd64g_calc_crc32q";
   18416                     fn = &amd64g_calc_crc32q; break;
   18417          }
   18418          vassert(nm && fn);
   18419          IRTemp valG1 = newTemp(Ity_I64);
   18420          assign(valG1,
   18421                 mkIRExprCCall(Ity_I64, 0/*regparm*/, nm, fn,
   18422                               mkIRExprVec_2(mkexpr(valG0),
   18423                                             widenUto64(mkexpr(valE)))));
   18424 
   18425          putIRegG(4, pfx, modrm, unop(Iop_64to32, mkexpr(valG1)));
   18426          goto decode_success;
   18427       }
   18428       break;
   18429 
   18430    default:
   18431       break;
   18432 
   18433    }
   18434 
   18435   //decode_failure:
   18436    *decode_OK = False;
   18437    return deltaIN;
   18438 
   18439   decode_success:
   18440    *decode_OK = True;
   18441    return delta;
   18442 }
   18443 
   18444 
   18445 /*------------------------------------------------------------*/
   18446 /*---                                                      ---*/
   18447 /*--- Top-level SSE4: dis_ESC_0F3A__SSE4                   ---*/
   18448 /*---                                                      ---*/
   18449 /*------------------------------------------------------------*/
   18450 
   18451 static Long dis_PEXTRW ( const VexAbiInfo* vbi, Prefix pfx,
   18452                          Long delta, Bool isAvx )
   18453 {
   18454    IRTemp addr  = IRTemp_INVALID;
   18455    IRTemp t0    = IRTemp_INVALID;
   18456    IRTemp t1    = IRTemp_INVALID;
   18457    IRTemp t2    = IRTemp_INVALID;
   18458    IRTemp t3    = IRTemp_INVALID;
   18459    UChar  modrm = getUChar(delta);
   18460    Int    alen  = 0;
   18461    HChar  dis_buf[50];
   18462    UInt   rG    = gregOfRexRM(pfx,modrm);
   18463    Int    imm8_20;
   18464    IRTemp xmm_vec = newTemp(Ity_V128);
   18465    IRTemp d16   = newTemp(Ity_I16);
   18466    const HChar* mbV = isAvx ? "v" : "";
   18467 
   18468    vassert(0==getRexW(pfx)); /* ensured by caller */
   18469    assign( xmm_vec, getXMMReg(rG) );
   18470    breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
   18471 
   18472    if ( epartIsReg( modrm ) ) {
   18473       imm8_20 = (Int)(getUChar(delta+1) & 7);
   18474    } else {
   18475       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   18476       imm8_20 = (Int)(getUChar(delta+alen) & 7);
   18477    }
   18478 
   18479    switch (imm8_20) {
   18480       case 0:  assign(d16, unop(Iop_32to16,   mkexpr(t0))); break;
   18481       case 1:  assign(d16, unop(Iop_32HIto16, mkexpr(t0))); break;
   18482       case 2:  assign(d16, unop(Iop_32to16,   mkexpr(t1))); break;
   18483       case 3:  assign(d16, unop(Iop_32HIto16, mkexpr(t1))); break;
   18484       case 4:  assign(d16, unop(Iop_32to16,   mkexpr(t2))); break;
   18485       case 5:  assign(d16, unop(Iop_32HIto16, mkexpr(t2))); break;
   18486       case 6:  assign(d16, unop(Iop_32to16,   mkexpr(t3))); break;
   18487       case 7:  assign(d16, unop(Iop_32HIto16, mkexpr(t3))); break;
   18488       default: vassert(0);
   18489    }
   18490 
   18491    if ( epartIsReg( modrm ) ) {
   18492       UInt rE = eregOfRexRM(pfx,modrm);
   18493       putIReg32( rE, unop(Iop_16Uto32, mkexpr(d16)) );
   18494       delta += 1+1;
   18495       DIP( "%spextrw $%d, %s,%s\n", mbV, imm8_20,
   18496            nameXMMReg( rG ), nameIReg32( rE ) );
   18497    } else {
   18498       storeLE( mkexpr(addr), mkexpr(d16) );
   18499       delta += alen+1;
   18500       DIP( "%spextrw $%d, %s,%s\n", mbV, imm8_20, nameXMMReg( rG ), dis_buf );
   18501    }
   18502    return delta;
   18503 }
   18504 
   18505 
   18506 static Long dis_PEXTRD ( const VexAbiInfo* vbi, Prefix pfx,
   18507                          Long delta, Bool isAvx )
   18508 {
   18509    IRTemp addr  = IRTemp_INVALID;
   18510    IRTemp t0    = IRTemp_INVALID;
   18511    IRTemp t1    = IRTemp_INVALID;
   18512    IRTemp t2    = IRTemp_INVALID;
   18513    IRTemp t3    = IRTemp_INVALID;
   18514    UChar  modrm = 0;
   18515    Int    alen  = 0;
   18516    HChar  dis_buf[50];
   18517 
   18518    Int    imm8_10;
   18519    IRTemp xmm_vec   = newTemp(Ity_V128);
   18520    IRTemp src_dword = newTemp(Ity_I32);
   18521    const HChar* mbV = isAvx ? "v" : "";
   18522 
   18523    vassert(0==getRexW(pfx)); /* ensured by caller */
   18524    modrm = getUChar(delta);
   18525    assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
   18526    breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
   18527 
   18528    if ( epartIsReg( modrm ) ) {
   18529       imm8_10 = (Int)(getUChar(delta+1) & 3);
   18530    } else {
   18531       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   18532       imm8_10 = (Int)(getUChar(delta+alen) & 3);
   18533    }
   18534 
   18535    switch ( imm8_10 ) {
   18536       case 0:  assign( src_dword, mkexpr(t0) ); break;
   18537       case 1:  assign( src_dword, mkexpr(t1) ); break;
   18538       case 2:  assign( src_dword, mkexpr(t2) ); break;
   18539       case 3:  assign( src_dword, mkexpr(t3) ); break;
   18540       default: vassert(0);
   18541    }
   18542 
   18543    if ( epartIsReg( modrm ) ) {
   18544       putIReg32( eregOfRexRM(pfx,modrm), mkexpr(src_dword) );
   18545       delta += 1+1;
   18546       DIP( "%spextrd $%d, %s,%s\n", mbV, imm8_10,
   18547            nameXMMReg( gregOfRexRM(pfx, modrm) ),
   18548            nameIReg32( eregOfRexRM(pfx, modrm) ) );
   18549    } else {
   18550       storeLE( mkexpr(addr), mkexpr(src_dword) );
   18551       delta += alen+1;
   18552       DIP( "%spextrd $%d, %s,%s\n", mbV,
   18553            imm8_10, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
   18554    }
   18555    return delta;
   18556 }
   18557 
   18558 
   18559 static Long dis_PEXTRQ ( const VexAbiInfo* vbi, Prefix pfx,
   18560                          Long delta, Bool isAvx )
   18561 {
   18562    IRTemp addr  = IRTemp_INVALID;
   18563    UChar  modrm = 0;
   18564    Int    alen  = 0;
   18565    HChar  dis_buf[50];
   18566 
   18567    Int imm8_0;
   18568    IRTemp xmm_vec   = newTemp(Ity_V128);
   18569    IRTemp src_qword = newTemp(Ity_I64);
   18570    const HChar* mbV = isAvx ? "v" : "";
   18571 
   18572    vassert(1==getRexW(pfx)); /* ensured by caller */
   18573    modrm = getUChar(delta);
   18574    assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
   18575 
   18576    if ( epartIsReg( modrm ) ) {
   18577       imm8_0 = (Int)(getUChar(delta+1) & 1);
   18578    } else {
   18579       addr   = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   18580       imm8_0 = (Int)(getUChar(delta+alen) & 1);
   18581    }
   18582 
   18583    switch ( imm8_0 ) {
   18584       case 0:  assign( src_qword, unop(Iop_V128to64,   mkexpr(xmm_vec)) );
   18585                break;
   18586       case 1:  assign( src_qword, unop(Iop_V128HIto64, mkexpr(xmm_vec)) );
   18587                break;
   18588       default: vassert(0);
   18589    }
   18590 
   18591    if ( epartIsReg( modrm ) ) {
   18592       putIReg64( eregOfRexRM(pfx,modrm), mkexpr(src_qword) );
   18593       delta += 1+1;
   18594       DIP( "%spextrq $%d, %s,%s\n", mbV, imm8_0,
   18595            nameXMMReg( gregOfRexRM(pfx, modrm) ),
   18596            nameIReg64( eregOfRexRM(pfx, modrm) ) );
   18597    } else {
   18598       storeLE( mkexpr(addr), mkexpr(src_qword) );
   18599       delta += alen+1;
   18600       DIP( "%spextrq $%d, %s,%s\n", mbV,
   18601            imm8_0, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
   18602    }
   18603    return delta;
   18604 }
   18605 
   18606 static IRExpr* math_CTZ32(IRExpr *exp)
   18607 {
   18608    /* Iop_Ctz32 isn't implemented by the amd64 back end, so use Iop_Ctz64. */
   18609    return unop(Iop_64to32, unop(Iop_Ctz64, unop(Iop_32Uto64, exp)));
   18610 }
   18611 
/* Specialised inline translation of PCMPISTRI with an immediate byte
   of 0x38 or 0x3A: byte-sized elements, "equal each" aggregation,
   with the index of the least significant result bit delivered in
   ECX.  (The two immediates differ only in element signedness, which
   does not affect an equality comparison.)  Generating plain IR here,
   rather than going through the dirty-helper scheme in
   dis_PCMPxSTRx, lets Memcheck track definedness through the
   computation -- hence the unusually careful formulations below.
   |regNoL| and |regNoR| are XMM register numbers; |regNoL| may be 16,
   denoting the XMM16 pseudo-register the caller loaded a memory
   operand into.  All decoding was done by the caller, so |delta| is
   returned unchanged. */
static Long dis_PCMPISTRI_3A ( UChar modrm, UInt regNoL, UInt regNoR,
                               Long delta, UChar opc, UChar imm,
                               HChar dis_buf[])
{
   /* We only handle PCMPISTRI for now */
   vassert((opc & 0x03) == 0x03);
   /* And only an immediate byte of 0x38 or 0x3A */
   vassert((imm & ~0x02) == 0x38);

   /* FIXME: Is this correct when RegNoL == 16 ? */
   IRTemp argL = newTemp(Ity_V128);
   assign(argL, getXMMReg(regNoL));
   IRTemp argR = newTemp(Ity_V128);
   assign(argR, getXMMReg(regNoR));

   /* zmaskL/zmaskR: bit i is set iff byte i of the corresponding
      operand is zero, i.e. marks the string terminator(s). */
   IRTemp zmaskL = newTemp(Ity_I32);
   assign(zmaskL, unop(Iop_16Uto32,
                       unop(Iop_GetMSBs8x16,
                            binop(Iop_CmpEQ8x16, mkexpr(argL), mkV128(0)))));
   IRTemp zmaskR = newTemp(Ity_I32);
   assign(zmaskR, unop(Iop_16Uto32,
                       unop(Iop_GetMSBs8x16,
                            binop(Iop_CmpEQ8x16, mkexpr(argR), mkV128(0)))));

   /* We want validL = ~(zmaskL | -zmaskL)

      But this formulation kills memcheck's validity tracking when any
      bits above the first "1" are invalid.  So reformulate as:

      validL = (zmaskL ? (1 << ctz(zmaskL)) : 0) - 1
   */

   IRExpr *ctzL = unop(Iop_32to8, math_CTZ32(mkexpr(zmaskL)));

   /* Generate a bool expression which is zero iff the original is
      zero.  Do this carefully so memcheck can propagate validity bits
      correctly.
    */
   IRTemp zmaskL_zero = newTemp(Ity_I1);
   assign(zmaskL_zero, binop(Iop_ExpCmpNE32, mkexpr(zmaskL), mkU32(0)));

   /* validL: bit i set iff byte i of argL is before the terminator,
      i.e. is a valid string character. */
   IRTemp validL = newTemp(Ity_I32);
   assign(validL, binop(Iop_Sub32,
                        IRExpr_ITE(mkexpr(zmaskL_zero),
                                   binop(Iop_Shl32, mkU32(1), ctzL),
                                   mkU32(0)),
                        mkU32(1)));

   /* And similarly for validR. */
   IRExpr *ctzR = unop(Iop_32to8, math_CTZ32(mkexpr(zmaskR)));
   IRTemp zmaskR_zero = newTemp(Ity_I1);
   assign(zmaskR_zero, binop(Iop_ExpCmpNE32, mkexpr(zmaskR), mkU32(0)));
   IRTemp validR = newTemp(Ity_I32);
   assign(validR, binop(Iop_Sub32,
                        IRExpr_ITE(mkexpr(zmaskR_zero),
                                   binop(Iop_Shl32, mkU32(1), ctzR),
                                   mkU32(0)),
                        mkU32(1)));

   /* Do the actual comparison: bit i set iff bytes i of argL and argR
      are equal. */
   IRExpr *boolResII = unop(Iop_16Uto32,
                            unop(Iop_GetMSBs8x16,
                                 binop(Iop_CmpEQ8x16, mkexpr(argL),
                                                      mkexpr(argR))));

   /* Compute boolresII & validL & validR (i.e., if both valid, use
      comparison result) */
   IRExpr *intRes1_a = binop(Iop_And32, boolResII,
                             binop(Iop_And32,
                                   mkexpr(validL), mkexpr(validR)));

   /* Compute ~(validL | validR); i.e., if both invalid, force 1. */
   IRExpr *intRes1_b = unop(Iop_Not32, binop(Iop_Or32,
                                             mkexpr(validL), mkexpr(validR)));
   /* Otherwise, zero. */
   IRExpr *intRes1 = binop(Iop_And32, mkU32(0xFFFF),
                           binop(Iop_Or32, intRes1_a, intRes1_b));

   /* The "0x30" in imm=0x3A means "polarity=3" means XOR validL with
      result. */
   IRTemp intRes2 = newTemp(Ity_I32);
   assign(intRes2, binop(Iop_And32, mkU32(0xFFFF),
                         binop(Iop_Xor32, intRes1, mkexpr(validL))));

   /* If the 0x40 bit were set in imm=0x3A, we would return the index
      of the msb.  Since it is clear, we return the index of the
      lsb. */
   /* OR-ing in bit 16 makes the ctz produce 16 (the "not found"
      value) when intRes2 is zero. */
   IRExpr *newECX = math_CTZ32(binop(Iop_Or32,
                                     mkexpr(intRes2), mkU32(0x10000)));

   /* And thats our rcx. */
   putIReg32(R_RCX, newECX);

   /* Now for the condition codes... */

   /* C == 0 iff intRes2 == 0 */
   IRExpr *c_bit = IRExpr_ITE( binop(Iop_ExpCmpNE32, mkexpr(intRes2),
                                     mkU32(0)),
                               mkU32(1 << AMD64G_CC_SHIFT_C),
                               mkU32(0));
   /* Z == 1 iff any in argL is 0 */
   IRExpr *z_bit = IRExpr_ITE( mkexpr(zmaskL_zero),
                               mkU32(1 << AMD64G_CC_SHIFT_Z),
                               mkU32(0));
   /* S == 1 iff any in argR is 0 */
   IRExpr *s_bit = IRExpr_ITE( mkexpr(zmaskR_zero),
                               mkU32(1 << AMD64G_CC_SHIFT_S),
                               mkU32(0));
   /* O == IntRes2[0] */
   IRExpr *o_bit = binop(Iop_Shl32, binop(Iop_And32, mkexpr(intRes2),
                                          mkU32(0x01)),
                         mkU8(AMD64G_CC_SHIFT_O));

   /* Put them all together: store the assembled OSZACP flag vector
      via the COPY thunk operation. */
   IRTemp cc = newTemp(Ity_I64);
   assign(cc, widenUto64(binop(Iop_Or32,
                               binop(Iop_Or32, c_bit, z_bit),
                               binop(Iop_Or32, s_bit, o_bit))));
   stmt(IRStmt_Put(OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY)));
   stmt(IRStmt_Put(OFFB_CC_DEP1, mkexpr(cc)));
   stmt(IRStmt_Put(OFFB_CC_DEP2, mkU64(0)));
   stmt(IRStmt_Put(OFFB_CC_NDEP, mkU64(0)));

   return delta;
}
   18737 
/* Translate all PCMP{I,E}STR{I,M} variants (and their AVX forms) by
   calling a dirty helper, except for one special case (PCMPISTRI with
   imm8 == 0x3A) which is translated inline.  This can fail -- if the
   immediate byte requests a combination the helper has not been
   verified for -- in which case it returns the original (unchanged)
   delta. */
static Long dis_PCMPxSTRx ( const VexAbiInfo* vbi, Prefix pfx,
                            Long delta, Bool isAvx, UChar opc )
{
   Long   delta0  = delta;
   /* opc bit 1 set: implicit-length ("I") form; opc bit 0 clear:
      mask-producing ("M") form.  These choices drive both the
      disassembly text and the result plumbing below. */
   UInt   isISTRx = opc & 2;
   UInt   isxSTRM = (opc & 1) ^ 1;
   UInt   regNoL  = 0;
   UInt   regNoR  = 0;
   UChar  imm     = 0;
   IRTemp addr    = IRTemp_INVALID;
   Int    alen    = 0;
   HChar  dis_buf[50];

   /* This is a nasty kludge.  We need to pass 2 x V128 to the helper
      (which is clean).  Since we can't do that, use a dirty helper to
      compute the results directly from the XMM regs in the guest
      state.  That means for the memory case, we need to move the left
      operand into a pseudo-register (XMM16, let's call it). */
   UChar modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      regNoL = eregOfRexRM(pfx, modrm);
      regNoR = gregOfRexRM(pfx, modrm);
      imm = getUChar(delta+1);
      delta += 1+1;
   } else {
      regNoL = 16; /* use XMM16 as an intermediary */
      regNoR = gregOfRexRM(pfx, modrm);
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
      /* No alignment check; I guess that makes sense, given that
         these insns are for dealing with C style strings. */
      stmt( IRStmt_Put( OFFB_YMM16, loadLE(Ity_V128, mkexpr(addr)) ));
      imm = getUChar(delta+alen);
      delta += alen+1;
   }

   /* Print the insn here, since dis_PCMPISTRI_3A doesn't do so
      itself. */
   if (regNoL == 16) {
      DIP("%spcmp%cstr%c $%x,%s,%s\n",
          isAvx ? "v" : "", isISTRx ? 'i' : 'e', isxSTRM ? 'm' : 'i',
          (UInt)imm, dis_buf, nameXMMReg(regNoR));
   } else {
      DIP("%spcmp%cstr%c $%x,%s,%s\n",
          isAvx ? "v" : "", isISTRx ? 'i' : 'e', isxSTRM ? 'm' : 'i',
          (UInt)imm, nameXMMReg(regNoL), nameXMMReg(regNoR));
   }

   /* Handle special case(s). */
   if (imm == 0x3A && isISTRx && !isxSTRM) {
      return dis_PCMPISTRI_3A ( modrm, regNoL, regNoR, delta,
                                opc, imm, dis_buf);
   }

   /* Now we know the XMM reg numbers for the operands, and the
      immediate byte.  Is it one we can actually handle? Throw out any
      cases for which the helper function has not been verified. */
   switch (imm) {
      case 0x00: case 0x02:
      case 0x08: case 0x0A: case 0x0C: case 0x0E:
      case 0x10: case 0x12: case 0x14:
      case 0x18: case 0x1A:
      case 0x30:            case 0x34:
      case 0x38: case 0x3A:
      case 0x40: case 0x42: case 0x44: case 0x46:
                 case 0x4A:
                 case 0x62:
      case 0x70: case 0x72:
         break;
      // the 16-bit character versions of the above
      case 0x01: case 0x03:
      case 0x09: case 0x0B: case 0x0D:
                 case 0x13:
      case 0x19: case 0x1B:
      case 0x39: case 0x3B:
                            case 0x45:
                 case 0x4B:
         break;
      default:
         return delta0; /*FAIL*/
   }

   /* Who ya gonna call?  Presumably not Ghostbusters. */
   void*  fn = &amd64g_dirtyhelper_PCMPxSTRx;
   const HChar* nm = "amd64g_dirtyhelper_PCMPxSTRx";

   /* Round up the arguments.  Note that this is a kludge -- the use
      of mkU64 rather than mkIRExpr_HWord implies the assumption that
      the host's word size is 64-bit. */
   UInt gstOffL = regNoL == 16 ? OFFB_YMM16 : ymmGuestRegOffset(regNoL);
   UInt gstOffR = ymmGuestRegOffset(regNoR);

   IRExpr*  opc4_and_imm = mkU64((opc << 8) | (imm & 0xFF));
   IRExpr*  gstOffLe     = mkU64(gstOffL);
   IRExpr*  gstOffRe     = mkU64(gstOffR);
   /* The explicit-length forms also read string lengths from RDX/RAX;
      the implicit-length forms pass zero there. */
   IRExpr*  edxIN        = isISTRx ? mkU64(0) : getIRegRDX(8);
   IRExpr*  eaxIN        = isISTRx ? mkU64(0) : getIRegRAX(8);
   IRExpr** args
      = mkIRExprVec_6( IRExpr_GSPTR(),
                       opc4_and_imm, gstOffLe, gstOffRe, edxIN, eaxIN );

   IRTemp   resT = newTemp(Ity_I64);
   IRDirty* d    = unsafeIRDirty_1_N( resT, 0/*regparms*/, nm, fn, args );
   /* It's not really a dirty call, but we can't use the clean helper
      mechanism here for the very lame reason that we can't pass 2 x
      V128s by value to a helper.  Hence this roundabout scheme. */
   d->nFxState = 2;
   vex_bzero(&d->fxState, sizeof(d->fxState));
   d->fxState[0].fx     = Ifx_Read;
   d->fxState[0].offset = gstOffL;
   d->fxState[0].size   = sizeof(U128);
   d->fxState[1].fx     = Ifx_Read;
   d->fxState[1].offset = gstOffR;
   d->fxState[1].size   = sizeof(U128);
   if (isxSTRM) {
      /* Declare that the helper writes XMM0. */
      d->nFxState = 3;
      d->fxState[2].fx     = Ifx_Write;
      d->fxState[2].offset = ymmGuestRegOffset(0);
      d->fxState[2].size   = sizeof(U128);
   }

   stmt( IRStmt_Dirty(d) );

   /* Now resT[15:0] holds the new OSZACP values, so the condition
      codes must be updated. And for a xSTRI case, resT[31:16] holds
      the new ECX value, so stash that too. */
   if (!isxSTRM) {
      putIReg64(R_RCX, binop(Iop_And64,
                             binop(Iop_Shr64, mkexpr(resT), mkU8(16)),
                             mkU64(0xFFFF)));
   }

   /* Zap the upper half of the dest reg as per AVX conventions. */
   if (isxSTRM && isAvx)
      putYMMRegLane128(/*YMM*/0, 1, mkV128(0));

   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop(Iop_And64, mkexpr(resT), mkU64(0xFFFF))
   ));
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

   return delta;
}
   18886 
   18887 
   18888 static IRTemp math_PINSRB_128 ( IRTemp v128, IRTemp u8, UInt imm8 )
   18889 {
   18890    vassert(imm8 >= 0 && imm8 <= 15);
   18891 
   18892    // Create a V128 value which has the selected byte in the
   18893    // specified lane, and zeroes everywhere else.
   18894    IRTemp tmp128    = newTemp(Ity_V128);
   18895    IRTemp halfshift = newTemp(Ity_I64);
   18896    assign(halfshift, binop(Iop_Shl64,
   18897                            unop(Iop_8Uto64, mkexpr(u8)),
   18898                            mkU8(8 * (imm8 & 7))));
   18899    if (imm8 < 8) {
   18900       assign(tmp128, binop(Iop_64HLtoV128, mkU64(0), mkexpr(halfshift)));
   18901    } else {
   18902       assign(tmp128, binop(Iop_64HLtoV128, mkexpr(halfshift), mkU64(0)));
   18903    }
   18904 
   18905    UShort mask = ~(1 << imm8);
   18906    IRTemp res  = newTemp(Ity_V128);
   18907    assign( res, binop(Iop_OrV128,
   18908                       mkexpr(tmp128),
   18909                       binop(Iop_AndV128, mkexpr(v128), mkV128(mask))) );
   18910    return res;
   18911 }
   18912 
   18913 
   18914 static IRTemp math_PINSRD_128 ( IRTemp v128, IRTemp u32, UInt imm8 )
   18915 {
   18916    IRTemp z32 = newTemp(Ity_I32);
   18917    assign(z32, mkU32(0));
   18918 
   18919    /* Surround u32 with zeroes as per imm, giving us something we can
   18920       OR into a suitably masked-out v128.*/
   18921    IRTemp withZs = newTemp(Ity_V128);
   18922    UShort mask = 0;
   18923    switch (imm8) {
   18924       case 3:  mask = 0x0FFF;
   18925                assign(withZs, mkV128from32s(u32, z32, z32, z32));
   18926                break;
   18927       case 2:  mask = 0xF0FF;
   18928                assign(withZs, mkV128from32s(z32, u32, z32, z32));
   18929                break;
   18930       case 1:  mask = 0xFF0F;
   18931                assign(withZs, mkV128from32s(z32, z32, u32, z32));
   18932                break;
   18933       case 0:  mask = 0xFFF0;
   18934                assign(withZs, mkV128from32s(z32, z32, z32, u32));
   18935                break;
   18936       default: vassert(0);
   18937    }
   18938 
   18939    IRTemp res = newTemp(Ity_V128);
   18940    assign(res, binop( Iop_OrV128,
   18941                       mkexpr(withZs),
   18942                       binop( Iop_AndV128, mkexpr(v128), mkV128(mask) ) ) );
   18943    return res;
   18944 }
   18945 
   18946 
   18947 static IRTemp math_PINSRQ_128 ( IRTemp v128, IRTemp u64, UInt imm8 )
   18948 {
   18949    /* Surround u64 with zeroes as per imm, giving us something we can
   18950       OR into a suitably masked-out v128.*/
   18951    IRTemp withZs = newTemp(Ity_V128);
   18952    UShort mask = 0;
   18953    if (imm8 == 0) {
   18954       mask = 0xFF00;
   18955       assign(withZs, binop(Iop_64HLtoV128, mkU64(0), mkexpr(u64)));
   18956    } else {
   18957       vassert(imm8 == 1);
   18958       mask = 0x00FF;
   18959       assign( withZs, binop(Iop_64HLtoV128, mkexpr(u64), mkU64(0)));
   18960    }
   18961 
   18962    IRTemp res = newTemp(Ity_V128);
   18963    assign( res, binop( Iop_OrV128,
   18964                        mkexpr(withZs),
   18965                        binop( Iop_AndV128, mkexpr(v128), mkV128(mask) ) ) );
   18966    return res;
   18967 }
   18968 
   18969 
   18970 static IRTemp math_INSERTPS ( IRTemp dstV, IRTemp toInsertD, UInt imm8 )
   18971 {
   18972    const IRTemp inval = IRTemp_INVALID;
   18973    IRTemp dstDs[4] = { inval, inval, inval, inval };
   18974    breakupV128to32s( dstV, &dstDs[3], &dstDs[2], &dstDs[1], &dstDs[0] );
   18975 
   18976    vassert(imm8 <= 255);
   18977    dstDs[(imm8 >> 4) & 3] = toInsertD; /* "imm8_count_d" */
   18978 
   18979    UInt imm8_zmask = (imm8 & 15);
   18980    IRTemp zero_32 = newTemp(Ity_I32);
   18981    assign( zero_32, mkU32(0) );
   18982    IRTemp resV = newTemp(Ity_V128);
   18983    assign( resV, mkV128from32s(
   18984                     ((imm8_zmask & 8) == 8) ? zero_32 : dstDs[3],
   18985                     ((imm8_zmask & 4) == 4) ? zero_32 : dstDs[2],
   18986                     ((imm8_zmask & 2) == 2) ? zero_32 : dstDs[1],
   18987                     ((imm8_zmask & 1) == 1) ? zero_32 : dstDs[0]) );
   18988    return resV;
   18989 }
   18990 
   18991 
/* Handles PEXTRB / VPEXTRB: extract the byte lane of an XMM register
   selected by imm8[3:0] and either move it, zero-extended, into a
   64-bit GPR, or store the single byte to memory.  Only the low four
   bits of the immediate take part in lane selection; any higher bits
   are ignored (though the full imm8 is shown in the disassembly).
   Returns the updated |delta|. */
static Long dis_PEXTRB_128_GtoE ( const VexAbiInfo* vbi, Prefix pfx,
                                  Long delta, Bool isAvx )
{
   IRTemp addr     = IRTemp_INVALID;
   Int    alen     = 0;
   HChar  dis_buf[50];
   IRTemp xmm_vec  = newTemp(Ity_V128);
   IRTemp sel_lane = newTemp(Ity_I32);
   IRTemp shr_lane = newTemp(Ity_I32);
   const HChar* mbV = isAvx ? "v" : "";
   UChar  modrm    = getUChar(delta);
   IRTemp t3, t2, t1, t0;
   Int    imm8;
   assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
   t3 = t2 = t1 = t0 = IRTemp_INVALID;
   breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );

   if ( epartIsReg( modrm ) ) {
      imm8 = (Int)getUChar(delta+1);
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
      imm8 = (Int)getUChar(delta+alen);
   }
   /* Pick the 32-bit lane holding the wanted byte (imm8[3:2])... */
   switch ( (imm8 >> 2) & 3 ) {
      case 0:  assign( sel_lane, mkexpr(t0) ); break;
      case 1:  assign( sel_lane, mkexpr(t1) ); break;
      case 2:  assign( sel_lane, mkexpr(t2) ); break;
      case 3:  assign( sel_lane, mkexpr(t3) ); break;
      default: vassert(0);
   }
   /* ... then shift the wanted byte (imm8[1:0]) down to bits 7:0. */
   assign( shr_lane,
           binop( Iop_Shr32, mkexpr(sel_lane), mkU8(((imm8 & 3)*8)) ) );

   if ( epartIsReg( modrm ) ) {
      /* Register destination: zero-extend the byte to 64 bits. */
      putIReg64( eregOfRexRM(pfx,modrm),
                 unop( Iop_32Uto64,
                       binop(Iop_And32, mkexpr(shr_lane), mkU32(255)) ) );
      delta += 1+1;
      DIP( "%spextrb $%d, %s,%s\n", mbV, imm8,
           nameXMMReg( gregOfRexRM(pfx, modrm) ),
           nameIReg64( eregOfRexRM(pfx, modrm) ) );
   } else {
      /* Memory destination: store just the one byte. */
      storeLE( mkexpr(addr), unop(Iop_32to8, mkexpr(shr_lane) ) );
      delta += alen+1;
      DIP( "%spextrb $%d,%s,%s\n", mbV,
           imm8, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
   }

   return delta;
}
   19042 
   19043 
/* Core of (V)DPPD: multiply the two F64 lanes of |dst_vec| and
   |src_vec|, keep only the products selected by imm8[5:4], sum them,
   and broadcast the sum into the result lanes selected by imm8[1:0].
   Rounding mode is faked (see XXXROUNDINGFIXME). */
static IRTemp math_DPPD_128 ( IRTemp src_vec, IRTemp dst_vec, UInt imm8 )
{
   vassert(imm8 < 256);
   /* Per-64-bit-lane byte masks, indexed by a 2-bit lane selector. */
   UShort imm8_perms[4] = { 0x0000, 0x00FF, 0xFF00, 0xFFFF };
   IRTemp and_vec = newTemp(Ity_V128);
   IRTemp sum_vec = newTemp(Ity_V128);
   IRTemp rm      = newTemp(Ity_I32);
   assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   /* Lanewise products, with unselected lanes zeroed out. */
   assign( and_vec, binop( Iop_AndV128,
                           triop( Iop_Mul64Fx2,
                                  mkexpr(rm),
                                  mkexpr(dst_vec), mkexpr(src_vec) ),
                           mkV128( imm8_perms[ ((imm8 >> 4) & 3) ] ) ) );

   /* Add the high product to the low product (Add64F0x2 only operates
      on lane 0, hence the interleaves to line the operands up). */
   assign( sum_vec, binop( Iop_Add64F0x2,
                           binop( Iop_InterleaveHI64x2,
                                  mkexpr(and_vec), mkexpr(and_vec) ),
                           binop( Iop_InterleaveLO64x2,
                                  mkexpr(and_vec), mkexpr(and_vec) ) ) );
   /* Duplicate the sum into both lanes, then mask per imm8[1:0]. */
   IRTemp res = newTemp(Ity_V128);
   assign(res, binop( Iop_AndV128,
                      binop( Iop_InterleaveLO64x2,
                             mkexpr(sum_vec), mkexpr(sum_vec) ),
                      mkV128( imm8_perms[ (imm8 & 3) ] ) ) );
   return res;
}
   19070 
   19071 
/* Core of (V)DPPS: multiply the four F32 lanes of |dst_vec| and
   |src_vec|, keep only the products selected by imm8[7:4], sum all
   four, and broadcast the sum into the result lanes selected by
   imm8[3:0].  Rounding mode is faked (see XXXROUNDINGFIXME). */
static IRTemp math_DPPS_128 ( IRTemp src_vec, IRTemp dst_vec, UInt imm8 )
{
   vassert(imm8 < 256);
   IRTemp tmp_prod_vec = newTemp(Ity_V128);
   IRTemp prod_vec     = newTemp(Ity_V128);
   IRTemp sum_vec      = newTemp(Ity_V128);
   IRTemp rm           = newTemp(Ity_I32);
   IRTemp v3, v2, v1, v0;
   v3 = v2 = v1 = v0   = IRTemp_INVALID;
   /* Per-32-bit-lane nibble masks, indexed by a 4-bit lane selector. */
   UShort imm8_perms[16] = { 0x0000, 0x000F, 0x00F0, 0x00FF, 0x0F00,
                             0x0F0F, 0x0FF0, 0x0FFF, 0xF000, 0xF00F,
                             0xF0F0, 0xF0FF, 0xFF00, 0xFF0F, 0xFFF0,
                             0xFFFF };

   assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   /* Lanewise products, with unselected lanes zeroed out. */
   assign( tmp_prod_vec,
           binop( Iop_AndV128,
                  triop( Iop_Mul32Fx4,
                         mkexpr(rm), mkexpr(dst_vec), mkexpr(src_vec) ),
                  mkV128( imm8_perms[((imm8 >> 4)& 15)] ) ) );
   breakupV128to32s( tmp_prod_vec, &v3, &v2, &v1, &v0 );
   /* NOTE(review): lanes 1 and 2 are deliberately(?) swapped here, so
      the interleave/add sequence below pairs the products as (v3+v2)
      and (v1+v0).  Every lane of the final result holds the sum of
      all four products regardless, but the FP association order would
      differ without the swap -- confirm against hardware before
      changing. */
   assign( prod_vec, mkV128from32s( v3, v1, v2, v0 ) );

   /* First reduction step: pairwise sums via self-interleave. */
   assign( sum_vec, triop( Iop_Add32Fx4,
                           mkexpr(rm),
                           binop( Iop_InterleaveHI32x4,
                                  mkexpr(prod_vec), mkexpr(prod_vec) ),
                           binop( Iop_InterleaveLO32x4,
                                  mkexpr(prod_vec), mkexpr(prod_vec) ) ) );

   /* Second reduction step puts the total in every lane; then mask
      the output lanes per imm8[3:0]. */
   IRTemp res = newTemp(Ity_V128);
   assign( res, binop( Iop_AndV128,
                       triop( Iop_Add32Fx4,
                              mkexpr(rm),
                              binop( Iop_InterleaveHI32x4,
                                     mkexpr(sum_vec), mkexpr(sum_vec) ),
                              binop( Iop_InterleaveLO32x4,
                                     mkexpr(sum_vec), mkexpr(sum_vec) ) ),
                       mkV128( imm8_perms[ (imm8 & 15) ] ) ) );
   return res;
}
   19113 
   19114 
/* Core of (V)MPSADBW.  The eight 16-bit sums-of-absolute-differences
   are computed by the clean helper amd64g_calc_mpsadbw, called twice
   -- once per 64-bit half of the result.  imm8[1:0] selects the
   source dword group and imm8[2] the destination offset, exactly as
   the helper expects (only imm8[2:0] are passed through); the 0x80
   flag in the control argument apparently asks the helper for the
   high half -- see the helper's definition to confirm. */
static IRTemp math_MPSADBW_128 ( IRTemp dst_vec, IRTemp src_vec, UInt imm8 )
{
   /* Mask out bits of the operands we don't need.  This isn't
      strictly necessary, but it does ensure Memcheck doesn't
      give us any false uninitialised value errors as a
      result. */
   UShort src_mask[4] = { 0x000F, 0x00F0, 0x0F00, 0xF000 };
   UShort dst_mask[2] = { 0x07FF, 0x7FF0 };

   IRTemp src_maskV = newTemp(Ity_V128);
   IRTemp dst_maskV = newTemp(Ity_V128);
   assign(src_maskV, mkV128( src_mask[ imm8 & 3 ] ));
   assign(dst_maskV, mkV128( dst_mask[ (imm8 >> 2) & 1 ] ));

   IRTemp src_masked = newTemp(Ity_V128);
   IRTemp dst_masked = newTemp(Ity_V128);
   assign(src_masked, binop(Iop_AndV128, mkexpr(src_vec), mkexpr(src_maskV)));
   assign(dst_masked, binop(Iop_AndV128, mkexpr(dst_vec), mkexpr(dst_maskV)));

   /* Generate 4 64 bit values that we can hand to a clean helper */
   IRTemp sHi = newTemp(Ity_I64);
   IRTemp sLo = newTemp(Ity_I64);
   assign( sHi, unop(Iop_V128HIto64, mkexpr(src_masked)) );
   assign( sLo, unop(Iop_V128to64,   mkexpr(src_masked)) );

   IRTemp dHi = newTemp(Ity_I64);
   IRTemp dLo = newTemp(Ity_I64);
   assign( dHi, unop(Iop_V128HIto64, mkexpr(dst_masked)) );
   assign( dLo, unop(Iop_V128to64,   mkexpr(dst_masked)) );

   /* Compute halves of the result separately */
   IRTemp resHi = newTemp(Ity_I64);
   IRTemp resLo = newTemp(Ity_I64);

   IRExpr** argsHi
      = mkIRExprVec_5( mkexpr(sHi), mkexpr(sLo), mkexpr(dHi), mkexpr(dLo),
                       mkU64( 0x80 | (imm8 & 7) ));
   IRExpr** argsLo
      = mkIRExprVec_5( mkexpr(sHi), mkexpr(sLo), mkexpr(dHi), mkexpr(dLo),
                       mkU64( 0x00 | (imm8 & 7) ));

   assign(resHi, mkIRExprCCall( Ity_I64, 0/*regparm*/,
                                "amd64g_calc_mpsadbw",
                                &amd64g_calc_mpsadbw, argsHi ));
   assign(resLo, mkIRExprCCall( Ity_I64, 0/*regparm*/,
                                "amd64g_calc_mpsadbw",
                                &amd64g_calc_mpsadbw, argsLo ));

   IRTemp res = newTemp(Ity_V128);
   assign(res, binop(Iop_64HLtoV128, mkexpr(resHi), mkexpr(resLo)));
   return res;
}
   19167 
   19168 static Long dis_EXTRACTPS ( const VexAbiInfo* vbi, Prefix pfx,
   19169                             Long delta, Bool isAvx )
   19170 {
   19171    IRTemp addr       = IRTemp_INVALID;
   19172    Int    alen       = 0;
   19173    HChar  dis_buf[50];
   19174    UChar  modrm      = getUChar(delta);
   19175    Int imm8_10;
   19176    IRTemp xmm_vec    = newTemp(Ity_V128);
   19177    IRTemp src_dword  = newTemp(Ity_I32);
   19178    UInt   rG         = gregOfRexRM(pfx,modrm);
   19179    IRTemp t3, t2, t1, t0;
   19180    t3 = t2 = t1 = t0 = IRTemp_INVALID;
   19181 
   19182    assign( xmm_vec, getXMMReg( rG ) );
   19183    breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
   19184 
   19185    if ( epartIsReg( modrm ) ) {
   19186       imm8_10 = (Int)(getUChar(delta+1) & 3);
   19187    } else {
   19188       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   19189       imm8_10 = (Int)(getUChar(delta+alen) & 3);
   19190    }
   19191 
   19192    switch ( imm8_10 ) {
   19193       case 0:  assign( src_dword, mkexpr(t0) ); break;
   19194       case 1:  assign( src_dword, mkexpr(t1) ); break;
   19195       case 2:  assign( src_dword, mkexpr(t2) ); break;
   19196       case 3:  assign( src_dword, mkexpr(t3) ); break;
   19197       default: vassert(0);
   19198    }
   19199 
   19200    if ( epartIsReg( modrm ) ) {
   19201       UInt rE = eregOfRexRM(pfx,modrm);
   19202       putIReg32( rE, mkexpr(src_dword) );
   19203       delta += 1+1;
   19204       DIP( "%sextractps $%d, %s,%s\n", isAvx ? "v" : "", imm8_10,
   19205            nameXMMReg( rG ), nameIReg32( rE ) );
   19206    } else {
   19207       storeLE( mkexpr(addr), mkexpr(src_dword) );
   19208       delta += alen+1;
   19209       DIP( "%sextractps $%d, %s,%s\n", isAvx ? "v" : "", imm8_10,
   19210            nameXMMReg( rG ), dis_buf );
   19211    }
   19212 
   19213    return delta;
   19214 }
   19215 
   19216 
   19217 static IRTemp math_PCLMULQDQ( IRTemp dV, IRTemp sV, UInt imm8 )
   19218 {
   19219    IRTemp t0 = newTemp(Ity_I64);
   19220    IRTemp t1 = newTemp(Ity_I64);
   19221    assign(t0, unop((imm8&1)? Iop_V128HIto64 : Iop_V128to64,
   19222               mkexpr(dV)));
   19223    assign(t1, unop((imm8&16) ? Iop_V128HIto64 : Iop_V128to64,
   19224               mkexpr(sV)));
   19225 
   19226    IRTemp t2 = newTemp(Ity_I64);
   19227    IRTemp t3 = newTemp(Ity_I64);
   19228 
   19229    IRExpr** args;
   19230 
   19231    args = mkIRExprVec_3(mkexpr(t0), mkexpr(t1), mkU64(0));
   19232    assign(t2, mkIRExprCCall(Ity_I64,0, "amd64g_calculate_pclmul",
   19233                             &amd64g_calculate_pclmul, args));
   19234    args = mkIRExprVec_3(mkexpr(t0), mkexpr(t1), mkU64(1));
   19235    assign(t3, mkIRExprCCall(Ity_I64,0, "amd64g_calculate_pclmul",
   19236                             &amd64g_calculate_pclmul, args));
   19237 
   19238    IRTemp res     = newTemp(Ity_V128);
   19239    assign(res, binop(Iop_64HLtoV128, mkexpr(t3), mkexpr(t2)));
   19240    return res;
   19241 }
   19242 
   19243 
   19244 __attribute__((noinline))
   19245 static
   19246 Long dis_ESC_0F3A__SSE4 ( Bool* decode_OK,
   19247                           const VexAbiInfo* vbi,
   19248                           Prefix pfx, Int sz, Long deltaIN )
   19249 {
   19250    IRTemp addr  = IRTemp_INVALID;
   19251    UChar  modrm = 0;
   19252    Int    alen  = 0;
   19253    HChar  dis_buf[50];
   19254 
   19255    *decode_OK = False;
   19256 
   19257    Long   delta = deltaIN;
   19258    UChar  opc   = getUChar(delta);
   19259    delta++;
   19260    switch (opc) {
   19261 
   19262    case 0x08:
   19263       /* 66 0F 3A 08 /r ib = ROUNDPS imm8, xmm2/m128, xmm1 */
   19264       if (have66noF2noF3(pfx) && sz == 2) {
   19265 
   19266          IRTemp src0 = newTemp(Ity_F32);
   19267          IRTemp src1 = newTemp(Ity_F32);
   19268          IRTemp src2 = newTemp(Ity_F32);
   19269          IRTemp src3 = newTemp(Ity_F32);
   19270          IRTemp res0 = newTemp(Ity_F32);
   19271          IRTemp res1 = newTemp(Ity_F32);
   19272          IRTemp res2 = newTemp(Ity_F32);
   19273          IRTemp res3 = newTemp(Ity_F32);
   19274          IRTemp rm   = newTemp(Ity_I32);
   19275          Int    imm  = 0;
   19276 
   19277          modrm = getUChar(delta);
   19278 
   19279          if (epartIsReg(modrm)) {
   19280             assign( src0,
   19281                     getXMMRegLane32F( eregOfRexRM(pfx, modrm), 0 ) );
   19282             assign( src1,
   19283                     getXMMRegLane32F( eregOfRexRM(pfx, modrm), 1 ) );
   19284             assign( src2,
   19285                     getXMMRegLane32F( eregOfRexRM(pfx, modrm), 2 ) );
   19286             assign( src3,
   19287                     getXMMRegLane32F( eregOfRexRM(pfx, modrm), 3 ) );
   19288             imm = getUChar(delta+1);
   19289             if (imm & ~15) goto decode_failure;
   19290             delta += 1+1;
   19291             DIP( "roundps $%d,%s,%s\n",
   19292                  imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
   19293                       nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   19294          } else {
   19295             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   19296             gen_SEGV_if_not_16_aligned(addr);
   19297             assign( src0, loadLE(Ity_F32,
   19298                                  binop(Iop_Add64, mkexpr(addr), mkU64(0) )));
   19299             assign( src1, loadLE(Ity_F32,
   19300                                  binop(Iop_Add64, mkexpr(addr), mkU64(4) )));
   19301             assign( src2, loadLE(Ity_F32,
   19302                                  binop(Iop_Add64, mkexpr(addr), mkU64(8) )));
   19303             assign( src3, loadLE(Ity_F32,
   19304                                  binop(Iop_Add64, mkexpr(addr), mkU64(12) )));
   19305             imm = getUChar(delta+alen);
   19306             if (imm & ~15) goto decode_failure;
   19307             delta += alen+1;
   19308             DIP( "roundps $%d,%s,%s\n",
   19309                  imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   19310          }
   19311 
   19312          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   19313             that encoding is the same as the encoding for IRRoundingMode,
   19314             we can use that value directly in the IR as a rounding
   19315             mode. */
   19316          assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
   19317 
   19318          assign(res0, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src0)) );
   19319          assign(res1, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src1)) );
   19320          assign(res2, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src2)) );
   19321          assign(res3, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src3)) );
   19322 
   19323          putXMMRegLane32F( gregOfRexRM(pfx, modrm), 0, mkexpr(res0) );
   19324          putXMMRegLane32F( gregOfRexRM(pfx, modrm), 1, mkexpr(res1) );
   19325          putXMMRegLane32F( gregOfRexRM(pfx, modrm), 2, mkexpr(res2) );
   19326          putXMMRegLane32F( gregOfRexRM(pfx, modrm), 3, mkexpr(res3) );
   19327 
   19328          goto decode_success;
   19329       }
   19330       break;
   19331 
   19332    case 0x09:
   19333       /* 66 0F 3A 09 /r ib = ROUNDPD imm8, xmm2/m128, xmm1 */
   19334       if (have66noF2noF3(pfx) && sz == 2) {
   19335 
   19336          IRTemp src0 = newTemp(Ity_F64);
   19337          IRTemp src1 = newTemp(Ity_F64);
   19338          IRTemp res0 = newTemp(Ity_F64);
   19339          IRTemp res1 = newTemp(Ity_F64);
   19340          IRTemp rm   = newTemp(Ity_I32);
   19341          Int    imm  = 0;
   19342 
   19343          modrm = getUChar(delta);
   19344 
   19345          if (epartIsReg(modrm)) {
   19346             assign( src0,
   19347                     getXMMRegLane64F( eregOfRexRM(pfx, modrm), 0 ) );
   19348             assign( src1,
   19349                     getXMMRegLane64F( eregOfRexRM(pfx, modrm), 1 ) );
   19350             imm = getUChar(delta+1);
   19351             if (imm & ~15) goto decode_failure;
   19352             delta += 1+1;
   19353             DIP( "roundpd $%d,%s,%s\n",
   19354                  imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
   19355                       nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   19356          } else {
   19357             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   19358             gen_SEGV_if_not_16_aligned(addr);
   19359             assign( src0, loadLE(Ity_F64,
   19360                                  binop(Iop_Add64, mkexpr(addr), mkU64(0) )));
   19361             assign( src1, loadLE(Ity_F64,
   19362                                  binop(Iop_Add64, mkexpr(addr), mkU64(8) )));
   19363             imm = getUChar(delta+alen);
   19364             if (imm & ~15) goto decode_failure;
   19365             delta += alen+1;
   19366             DIP( "roundpd $%d,%s,%s\n",
   19367                  imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   19368          }
   19369 
   19370          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   19371             that encoding is the same as the encoding for IRRoundingMode,
   19372             we can use that value directly in the IR as a rounding
   19373             mode. */
   19374          assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
   19375 
   19376          assign(res0, binop(Iop_RoundF64toInt, mkexpr(rm), mkexpr(src0)) );
   19377          assign(res1, binop(Iop_RoundF64toInt, mkexpr(rm), mkexpr(src1)) );
   19378 
   19379          putXMMRegLane64F( gregOfRexRM(pfx, modrm), 0, mkexpr(res0) );
   19380          putXMMRegLane64F( gregOfRexRM(pfx, modrm), 1, mkexpr(res1) );
   19381 
   19382          goto decode_success;
   19383       }
   19384       break;
   19385 
   19386    case 0x0A:
   19387    case 0x0B:
   19388       /* 66 0F 3A 0A /r ib = ROUNDSS imm8, xmm2/m32, xmm1
   19389          66 0F 3A 0B /r ib = ROUNDSD imm8, xmm2/m64, xmm1
   19390       */
   19391       if (have66noF2noF3(pfx) && sz == 2) {
   19392 
   19393          Bool   isD = opc == 0x0B;
   19394          IRTemp src = newTemp(isD ? Ity_F64 : Ity_F32);
   19395          IRTemp res = newTemp(isD ? Ity_F64 : Ity_F32);
   19396          Int    imm = 0;
   19397 
   19398          modrm = getUChar(delta);
   19399 
   19400          if (epartIsReg(modrm)) {
   19401             assign( src,
   19402                     isD ? getXMMRegLane64F( eregOfRexRM(pfx, modrm), 0 )
   19403                         : getXMMRegLane32F( eregOfRexRM(pfx, modrm), 0 ) );
   19404             imm = getUChar(delta+1);
   19405             if (imm & ~15) goto decode_failure;
   19406             delta += 1+1;
   19407             DIP( "rounds%c $%d,%s,%s\n",
   19408                  isD ? 'd' : 's',
   19409                  imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
   19410                       nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   19411          } else {
   19412             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   19413             assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
   19414             imm = getUChar(delta+alen);
   19415             if (imm & ~15) goto decode_failure;
   19416             delta += alen+1;
   19417             DIP( "rounds%c $%d,%s,%s\n",
   19418                  isD ? 'd' : 's',
   19419                  imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   19420          }
   19421 
   19422          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   19423             that encoding is the same as the encoding for IRRoundingMode,
   19424             we can use that value directly in the IR as a rounding
   19425             mode. */
   19426          assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
   19427                            (imm & 4) ? get_sse_roundingmode()
   19428                                      : mkU32(imm & 3),
   19429                            mkexpr(src)) );
   19430 
   19431          if (isD)
   19432             putXMMRegLane64F( gregOfRexRM(pfx, modrm), 0, mkexpr(res) );
   19433          else
   19434             putXMMRegLane32F( gregOfRexRM(pfx, modrm), 0, mkexpr(res) );
   19435 
   19436          goto decode_success;
   19437       }
   19438       break;
   19439 
   19440    case 0x0C:
   19441       /* 66 0F 3A 0C /r ib = BLENDPS xmm1, xmm2/m128, imm8
   19442          Blend Packed Single Precision Floating-Point Values (XMM) */
   19443       if (have66noF2noF3(pfx) && sz == 2) {
   19444 
   19445          Int imm8;
   19446          IRTemp dst_vec = newTemp(Ity_V128);
   19447          IRTemp src_vec = newTemp(Ity_V128);
   19448 
   19449          modrm = getUChar(delta);
   19450 
   19451          assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
   19452 
   19453          if ( epartIsReg( modrm ) ) {
   19454             imm8 = (Int)getUChar(delta+1);
   19455             assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
   19456             delta += 1+1;
   19457             DIP( "blendps $%d, %s,%s\n", imm8,
   19458                  nameXMMReg( eregOfRexRM(pfx, modrm) ),
   19459                  nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   19460          } else {
   19461             addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   19462                              1/* imm8 is 1 byte after the amode */ );
   19463             gen_SEGV_if_not_16_aligned( addr );
   19464             assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
   19465             imm8 = (Int)getUChar(delta+alen);
   19466             delta += alen+1;
   19467             DIP( "blendpd $%d, %s,%s\n",
   19468                  imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   19469          }
   19470 
   19471          putXMMReg( gregOfRexRM(pfx, modrm),
   19472                     mkexpr( math_BLENDPS_128( src_vec, dst_vec, imm8) ) );
   19473          goto decode_success;
   19474       }
   19475       break;
   19476 
   19477    case 0x0D:
   19478       /* 66 0F 3A 0D /r ib = BLENDPD xmm1, xmm2/m128, imm8
   19479          Blend Packed Double Precision Floating-Point Values (XMM) */
   19480       if (have66noF2noF3(pfx) && sz == 2) {
   19481 
   19482          Int imm8;
   19483          IRTemp dst_vec = newTemp(Ity_V128);
   19484          IRTemp src_vec = newTemp(Ity_V128);
   19485 
   19486          modrm = getUChar(delta);
   19487          assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
   19488 
   19489          if ( epartIsReg( modrm ) ) {
   19490             imm8 = (Int)getUChar(delta+1);
   19491             assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
   19492             delta += 1+1;
   19493             DIP( "blendpd $%d, %s,%s\n", imm8,
   19494                  nameXMMReg( eregOfRexRM(pfx, modrm) ),
   19495                  nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   19496          } else {
   19497             addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   19498                              1/* imm8 is 1 byte after the amode */ );
   19499             gen_SEGV_if_not_16_aligned( addr );
   19500             assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
   19501             imm8 = (Int)getUChar(delta+alen);
   19502             delta += alen+1;
   19503             DIP( "blendpd $%d, %s,%s\n",
   19504                  imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   19505          }
   19506 
   19507          putXMMReg( gregOfRexRM(pfx, modrm),
   19508                     mkexpr( math_BLENDPD_128( src_vec, dst_vec, imm8) ) );
   19509          goto decode_success;
   19510       }
   19511       break;
   19512 
   19513    case 0x0E:
   19514       /* 66 0F 3A 0E /r ib = PBLENDW xmm1, xmm2/m128, imm8
   19515          Blend Packed Words (XMM) */
   19516       if (have66noF2noF3(pfx) && sz == 2) {
   19517 
   19518          Int imm8;
   19519          IRTemp dst_vec = newTemp(Ity_V128);
   19520          IRTemp src_vec = newTemp(Ity_V128);
   19521 
   19522          modrm = getUChar(delta);
   19523 
   19524          assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
   19525 
   19526          if ( epartIsReg( modrm ) ) {
   19527             imm8 = (Int)getUChar(delta+1);
   19528             assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
   19529             delta += 1+1;
   19530             DIP( "pblendw $%d, %s,%s\n", imm8,
   19531                  nameXMMReg( eregOfRexRM(pfx, modrm) ),
   19532                  nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   19533          } else {
   19534             addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   19535                              1/* imm8 is 1 byte after the amode */ );
   19536             gen_SEGV_if_not_16_aligned( addr );
   19537             assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
   19538             imm8 = (Int)getUChar(delta+alen);
   19539             delta += alen+1;
   19540             DIP( "pblendw $%d, %s,%s\n",
   19541                  imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   19542          }
   19543 
   19544          putXMMReg( gregOfRexRM(pfx, modrm),
   19545                     mkexpr( math_PBLENDW_128( src_vec, dst_vec, imm8) ) );
   19546          goto decode_success;
   19547       }
   19548       break;
   19549 
   19550    case 0x14:
   19551       /* 66 0F 3A 14 /r ib = PEXTRB r/m16, xmm, imm8
   19552          Extract Byte from xmm, store in mem or zero-extend + store in gen.reg.
   19553          (XMM) */
   19554       if (have66noF2noF3(pfx) && sz == 2) {
   19555          delta = dis_PEXTRB_128_GtoE( vbi, pfx, delta, False/*!isAvx*/ );
   19556          goto decode_success;
   19557       }
   19558       break;
   19559 
   19560    case 0x15:
   19561       /* 66 0F 3A 15 /r ib = PEXTRW r/m16, xmm, imm8
   19562          Extract Word from xmm, store in mem or zero-extend + store in gen.reg.
   19563          (XMM) */
   19564       if (have66noF2noF3(pfx) && sz == 2) {
   19565          delta = dis_PEXTRW( vbi, pfx, delta, False/*!isAvx*/ );
   19566          goto decode_success;
   19567       }
   19568       break;
   19569 
   19570    case 0x16:
   19571       /* 66 no-REX.W 0F 3A 16 /r ib = PEXTRD reg/mem32, xmm2, imm8
   19572          Extract Doubleword int from xmm reg and store in gen.reg or mem. (XMM)
   19573          Note that this insn has the same opcodes as PEXTRQ, but
   19574          here the REX.W bit is _not_ present */
   19575       if (have66noF2noF3(pfx)
   19576           && sz == 2 /* REX.W is _not_ present */) {
   19577          delta = dis_PEXTRD( vbi, pfx, delta, False/*!isAvx*/ );
   19578          goto decode_success;
   19579       }
   19580       /* 66 REX.W 0F 3A 16 /r ib = PEXTRQ reg/mem64, xmm2, imm8
   19581          Extract Quadword int from xmm reg and store in gen.reg or mem. (XMM)
   19582          Note that this insn has the same opcodes as PEXTRD, but
   19583          here the REX.W bit is present */
   19584       if (have66noF2noF3(pfx)
   19585           && sz == 8 /* REX.W is present */) {
   19586          delta = dis_PEXTRQ( vbi, pfx, delta, False/*!isAvx*/);
   19587          goto decode_success;
   19588       }
   19589       break;
   19590 
   19591    case 0x17:
   19592       /* 66 0F 3A 17 /r ib = EXTRACTPS reg/mem32, xmm2, imm8 Extract
   19593          float from xmm reg and store in gen.reg or mem.  This is
   19594          identical to PEXTRD, except that REX.W appears to be ignored.
   19595       */
   19596       if (have66noF2noF3(pfx)
   19597           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   19598          delta = dis_EXTRACTPS( vbi, pfx, delta, False/*!isAvx*/ );
   19599          goto decode_success;
   19600       }
   19601       break;
   19602 
   19603    case 0x20:
   19604       /* 66 0F 3A 20 /r ib = PINSRB xmm1, r32/m8, imm8
   19605          Extract byte from r32/m8 and insert into xmm1 */
   19606       if (have66noF2noF3(pfx) && sz == 2) {
   19607          Int    imm8;
   19608          IRTemp new8 = newTemp(Ity_I8);
   19609          modrm = getUChar(delta);
   19610          UInt rG = gregOfRexRM(pfx, modrm);
   19611          if ( epartIsReg( modrm ) ) {
   19612             UInt rE = eregOfRexRM(pfx,modrm);
   19613             imm8 = (Int)(getUChar(delta+1) & 0xF);
   19614             assign( new8, unop(Iop_32to8, getIReg32(rE)) );
   19615             delta += 1+1;
   19616             DIP( "pinsrb $%d,%s,%s\n", imm8,
   19617                  nameIReg32(rE), nameXMMReg(rG) );
   19618          } else {
   19619             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   19620             imm8 = (Int)(getUChar(delta+alen) & 0xF);
   19621             assign( new8, loadLE( Ity_I8, mkexpr(addr) ) );
   19622             delta += alen+1;
   19623             DIP( "pinsrb $%d,%s,%s\n",
   19624                  imm8, dis_buf, nameXMMReg(rG) );
   19625          }
   19626          IRTemp src_vec = newTemp(Ity_V128);
   19627          assign(src_vec, getXMMReg( gregOfRexRM(pfx, modrm) ));
   19628          IRTemp res = math_PINSRB_128( src_vec, new8, imm8 );
   19629          putXMMReg( rG, mkexpr(res) );
   19630          goto decode_success;
   19631       }
   19632       break;
   19633 
   19634    case 0x21:
   19635       /* 66 0F 3A 21 /r ib = INSERTPS imm8, xmm2/m32, xmm1
   19636          Insert Packed Single Precision Floating-Point Value (XMM) */
   19637       if (have66noF2noF3(pfx) && sz == 2) {
   19638          UInt   imm8;
   19639          IRTemp d2ins = newTemp(Ity_I32); /* comes from the E part */
   19640          const IRTemp inval = IRTemp_INVALID;
   19641 
   19642          modrm = getUChar(delta);
   19643          UInt rG = gregOfRexRM(pfx, modrm);
   19644 
   19645          if ( epartIsReg( modrm ) ) {
   19646             UInt   rE = eregOfRexRM(pfx, modrm);
   19647             IRTemp vE = newTemp(Ity_V128);
   19648             assign( vE, getXMMReg(rE) );
   19649             IRTemp dsE[4] = { inval, inval, inval, inval };
   19650             breakupV128to32s( vE, &dsE[3], &dsE[2], &dsE[1], &dsE[0] );
   19651             imm8 = getUChar(delta+1);
   19652             d2ins = dsE[(imm8 >> 6) & 3]; /* "imm8_count_s" */
   19653             delta += 1+1;
   19654             DIP( "insertps $%u, %s,%s\n",
   19655                  imm8, nameXMMReg(rE), nameXMMReg(rG) );
   19656          } else {
   19657             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   19658             assign( d2ins, loadLE( Ity_I32, mkexpr(addr) ) );
   19659             imm8 = getUChar(delta+alen);
   19660             delta += alen+1;
   19661             DIP( "insertps $%u, %s,%s\n",
   19662                  imm8, dis_buf, nameXMMReg(rG) );
   19663          }
   19664 
   19665          IRTemp vG = newTemp(Ity_V128);
   19666          assign( vG, getXMMReg(rG) );
   19667 
   19668          putXMMReg( rG, mkexpr(math_INSERTPS( vG, d2ins, imm8 )) );
   19669          goto decode_success;
   19670       }
   19671       break;
   19672 
   19673    case 0x22:
   19674       /* 66 no-REX.W 0F 3A 22 /r ib = PINSRD xmm1, r/m32, imm8
   19675          Extract Doubleword int from gen.reg/mem32 and insert into xmm1 */
   19676       if (have66noF2noF3(pfx)
   19677           && sz == 2 /* REX.W is NOT present */) {
   19678          Int    imm8_10;
   19679          IRTemp src_u32 = newTemp(Ity_I32);
   19680          modrm = getUChar(delta);
   19681          UInt rG = gregOfRexRM(pfx, modrm);
   19682 
   19683          if ( epartIsReg( modrm ) ) {
   19684             UInt rE = eregOfRexRM(pfx,modrm);
   19685             imm8_10 = (Int)(getUChar(delta+1) & 3);
   19686             assign( src_u32, getIReg32( rE ) );
   19687             delta += 1+1;
   19688             DIP( "pinsrd $%d, %s,%s\n",
   19689                  imm8_10, nameIReg32(rE), nameXMMReg(rG) );
   19690          } else {
   19691             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   19692             imm8_10 = (Int)(getUChar(delta+alen) & 3);
   19693             assign( src_u32, loadLE( Ity_I32, mkexpr(addr) ) );
   19694             delta += alen+1;
   19695             DIP( "pinsrd $%d, %s,%s\n",
   19696                  imm8_10, dis_buf, nameXMMReg(rG) );
   19697          }
   19698 
   19699          IRTemp src_vec = newTemp(Ity_V128);
   19700          assign(src_vec, getXMMReg( rG ));
   19701          IRTemp res_vec = math_PINSRD_128( src_vec, src_u32, imm8_10 );
   19702          putXMMReg( rG, mkexpr(res_vec) );
   19703          goto decode_success;
   19704       }
   19705       /* 66 REX.W 0F 3A 22 /r ib = PINSRQ xmm1, r/m64, imm8
   19706          Extract Quadword int from gen.reg/mem64 and insert into xmm1 */
   19707       if (have66noF2noF3(pfx)
   19708           && sz == 8 /* REX.W is present */) {
   19709          Int imm8_0;
   19710          IRTemp src_u64 = newTemp(Ity_I64);
   19711          modrm = getUChar(delta);
   19712          UInt rG = gregOfRexRM(pfx, modrm);
   19713 
   19714          if ( epartIsReg( modrm ) ) {
   19715             UInt rE = eregOfRexRM(pfx,modrm);
   19716             imm8_0 = (Int)(getUChar(delta+1) & 1);
   19717             assign( src_u64, getIReg64( rE ) );
   19718             delta += 1+1;
   19719             DIP( "pinsrq $%d, %s,%s\n",
   19720                  imm8_0, nameIReg64(rE), nameXMMReg(rG) );
   19721          } else {
   19722             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   19723             imm8_0 = (Int)(getUChar(delta+alen) & 1);
   19724             assign( src_u64, loadLE( Ity_I64, mkexpr(addr) ) );
   19725             delta += alen+1;
   19726             DIP( "pinsrq $%d, %s,%s\n",
   19727                  imm8_0, dis_buf, nameXMMReg(rG) );
   19728          }
   19729 
   19730          IRTemp src_vec = newTemp(Ity_V128);
   19731          assign(src_vec, getXMMReg( rG ));
   19732          IRTemp res_vec = math_PINSRQ_128( src_vec, src_u64, imm8_0 );
   19733          putXMMReg( rG, mkexpr(res_vec) );
   19734          goto decode_success;
   19735       }
   19736       break;
   19737 
   19738    case 0x40:
   19739       /* 66 0F 3A 40 /r ib = DPPS xmm1, xmm2/m128, imm8
   19740          Dot Product of Packed Single Precision Floating-Point Values (XMM) */
   19741       if (have66noF2noF3(pfx) && sz == 2) {
   19742          modrm = getUChar(delta);
   19743          Int    imm8;
   19744          IRTemp src_vec = newTemp(Ity_V128);
   19745          IRTemp dst_vec = newTemp(Ity_V128);
   19746          UInt   rG      = gregOfRexRM(pfx, modrm);
   19747          assign( dst_vec, getXMMReg( rG ) );
   19748          if ( epartIsReg( modrm ) ) {
   19749             UInt rE = eregOfRexRM(pfx, modrm);
   19750             imm8 = (Int)getUChar(delta+1);
   19751             assign( src_vec, getXMMReg(rE) );
   19752             delta += 1+1;
   19753             DIP( "dpps $%d, %s,%s\n",
   19754                  imm8, nameXMMReg(rE), nameXMMReg(rG) );
   19755          } else {
   19756             addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   19757                              1/* imm8 is 1 byte after the amode */ );
   19758             gen_SEGV_if_not_16_aligned( addr );
   19759             assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
   19760             imm8 = (Int)getUChar(delta+alen);
   19761             delta += alen+1;
   19762             DIP( "dpps $%d, %s,%s\n",
   19763                  imm8, dis_buf, nameXMMReg(rG) );
   19764          }
   19765          IRTemp res = math_DPPS_128( src_vec, dst_vec, imm8 );
   19766          putXMMReg( rG, mkexpr(res) );
   19767          goto decode_success;
   19768       }
   19769       break;
   19770 
   19771    case 0x41:
   19772       /* 66 0F 3A 41 /r ib = DPPD xmm1, xmm2/m128, imm8
   19773          Dot Product of Packed Double Precision Floating-Point Values (XMM) */
   19774       if (have66noF2noF3(pfx) && sz == 2) {
   19775          modrm = getUChar(delta);
   19776          Int    imm8;
   19777          IRTemp src_vec = newTemp(Ity_V128);
   19778          IRTemp dst_vec = newTemp(Ity_V128);
   19779          UInt   rG      = gregOfRexRM(pfx, modrm);
   19780          assign( dst_vec, getXMMReg( rG ) );
   19781          if ( epartIsReg( modrm ) ) {
   19782             UInt rE = eregOfRexRM(pfx, modrm);
   19783             imm8 = (Int)getUChar(delta+1);
   19784             assign( src_vec, getXMMReg(rE) );
   19785             delta += 1+1;
   19786             DIP( "dppd $%d, %s,%s\n",
   19787                  imm8, nameXMMReg(rE), nameXMMReg(rG) );
   19788          } else {
   19789             addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   19790                              1/* imm8 is 1 byte after the amode */ );
   19791             gen_SEGV_if_not_16_aligned( addr );
   19792             assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
   19793             imm8 = (Int)getUChar(delta+alen);
   19794             delta += alen+1;
   19795             DIP( "dppd $%d, %s,%s\n",
   19796                  imm8, dis_buf, nameXMMReg(rG) );
   19797          }
   19798          IRTemp res = math_DPPD_128( src_vec, dst_vec, imm8 );
   19799          putXMMReg( rG, mkexpr(res) );
   19800          goto decode_success;
   19801       }
   19802       break;
   19803 
   19804    case 0x42:
   19805       /* 66 0F 3A 42 /r ib = MPSADBW xmm1, xmm2/m128, imm8
   19806          Multiple Packed Sums of Absolule Difference (XMM) */
   19807       if (have66noF2noF3(pfx) && sz == 2) {
   19808          Int    imm8;
   19809          IRTemp src_vec = newTemp(Ity_V128);
   19810          IRTemp dst_vec = newTemp(Ity_V128);
   19811          modrm          = getUChar(delta);
   19812          UInt   rG      = gregOfRexRM(pfx, modrm);
   19813 
   19814          assign( dst_vec, getXMMReg(rG) );
   19815 
   19816          if ( epartIsReg( modrm ) ) {
   19817             UInt rE = eregOfRexRM(pfx, modrm);
   19818 
   19819             imm8 = (Int)getUChar(delta+1);
   19820             assign( src_vec, getXMMReg(rE) );
   19821             delta += 1+1;
   19822             DIP( "mpsadbw $%d, %s,%s\n", imm8,
   19823                  nameXMMReg(rE), nameXMMReg(rG) );
   19824          } else {
   19825             addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   19826                              1/* imm8 is 1 byte after the amode */ );
   19827             gen_SEGV_if_not_16_aligned( addr );
   19828             assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
   19829             imm8 = (Int)getUChar(delta+alen);
   19830             delta += alen+1;
   19831             DIP( "mpsadbw $%d, %s,%s\n", imm8, dis_buf, nameXMMReg(rG) );
   19832          }
   19833 
   19834          putXMMReg( rG, mkexpr( math_MPSADBW_128(dst_vec, src_vec, imm8) ) );
   19835          goto decode_success;
   19836       }
   19837       break;
   19838 
   19839    case 0x44:
   19840       /* 66 0F 3A 44 /r ib = PCLMULQDQ xmm1, xmm2/m128, imm8
   19841        * Carry-less multiplication of selected XMM quadwords into XMM
   19842        * registers (a.k.a multiplication of polynomials over GF(2))
   19843        */
   19844       if (have66noF2noF3(pfx) && sz == 2) {
   19845 
   19846          Int imm8;
   19847          IRTemp svec = newTemp(Ity_V128);
   19848          IRTemp dvec = newTemp(Ity_V128);
   19849          modrm       = getUChar(delta);
   19850          UInt   rG   = gregOfRexRM(pfx, modrm);
   19851 
   19852          assign( dvec, getXMMReg(rG) );
   19853 
   19854          if ( epartIsReg( modrm ) ) {
   19855             UInt rE = eregOfRexRM(pfx, modrm);
   19856             imm8 = (Int)getUChar(delta+1);
   19857             assign( svec, getXMMReg(rE) );
   19858             delta += 1+1;
   19859             DIP( "pclmulqdq $%d, %s,%s\n", imm8,
   19860                  nameXMMReg(rE), nameXMMReg(rG) );
   19861          } else {
   19862             addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   19863                              1/* imm8 is 1 byte after the amode */ );
   19864             gen_SEGV_if_not_16_aligned( addr );
   19865             assign( svec, loadLE( Ity_V128, mkexpr(addr) ) );
   19866             imm8 = (Int)getUChar(delta+alen);
   19867             delta += alen+1;
   19868             DIP( "pclmulqdq $%d, %s,%s\n",
   19869                  imm8, dis_buf, nameXMMReg(rG) );
   19870          }
   19871 
   19872          putXMMReg( rG, mkexpr( math_PCLMULQDQ(dvec, svec, imm8) ) );
   19873          goto decode_success;
   19874       }
   19875       break;
   19876 
   19877    case 0x60:
   19878    case 0x61:
   19879    case 0x62:
   19880    case 0x63:
   19881       /* 66 0F 3A 63 /r ib = PCMPISTRI imm8, xmm2/m128, xmm1
   19882          66 0F 3A 62 /r ib = PCMPISTRM imm8, xmm2/m128, xmm1
   19883          66 0F 3A 61 /r ib = PCMPESTRI imm8, xmm2/m128, xmm1
   19884          66 0F 3A 60 /r ib = PCMPESTRM imm8, xmm2/m128, xmm1
   19885          (selected special cases that actually occur in glibc,
   19886           not by any means a complete implementation.)
   19887       */
   19888       if (have66noF2noF3(pfx) && sz == 2) {
   19889          Long delta0 = delta;
   19890          delta = dis_PCMPxSTRx( vbi, pfx, delta, False/*!isAvx*/, opc );
   19891          if (delta > delta0) goto decode_success;
   19892          /* else fall though; dis_PCMPxSTRx failed to decode it */
   19893       }
   19894       break;
   19895 
   19896    case 0xDF:
   19897       /* 66 0F 3A DF /r ib = AESKEYGENASSIST imm8, xmm2/m128, xmm1 */
   19898       if (have66noF2noF3(pfx) && sz == 2) {
   19899          delta = dis_AESKEYGENASSIST( vbi, pfx, delta, False/*!isAvx*/ );
   19900          goto decode_success;
   19901       }
   19902       break;
   19903 
   19904    default:
   19905       break;
   19906 
   19907    }
   19908 
   19909   decode_failure:
   19910    *decode_OK = False;
   19911    return deltaIN;
   19912 
   19913   decode_success:
   19914    *decode_OK = True;
   19915    return delta;
   19916 }
   19917 
   19918 
   19919 /*------------------------------------------------------------*/
   19920 /*---                                                      ---*/
   19921 /*--- Top-level post-escape decoders: dis_ESC_NONE         ---*/
   19922 /*---                                                      ---*/
   19923 /*------------------------------------------------------------*/
   19924 
   19925 __attribute__((noinline))
   19926 static
   19927 Long dis_ESC_NONE (
   19928         /*MB_OUT*/DisResult* dres,
   19929         /*MB_OUT*/Bool*      expect_CAS,
   19930         Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
   19931         Bool         resteerCisOk,
   19932         void*        callback_opaque,
   19933         const VexArchInfo* archinfo,
   19934         const VexAbiInfo*  vbi,
   19935         Prefix pfx, Int sz, Long deltaIN
   19936      )
   19937 {
   19938    Long   d64   = 0;
   19939    UChar  abyte = 0;
   19940    IRTemp addr  = IRTemp_INVALID;
   19941    IRTemp t1    = IRTemp_INVALID;
   19942    IRTemp t2    = IRTemp_INVALID;
   19943    IRTemp t3    = IRTemp_INVALID;
   19944    IRTemp t4    = IRTemp_INVALID;
   19945    IRTemp t5    = IRTemp_INVALID;
   19946    IRType ty    = Ity_INVALID;
   19947    UChar  modrm = 0;
   19948    Int    am_sz = 0;
   19949    Int    d_sz  = 0;
   19950    Int    alen  = 0;
   19951    HChar  dis_buf[50];
   19952 
   19953    Long   delta = deltaIN;
   19954    UChar  opc   = getUChar(delta); delta++;
   19955 
   19956    /* delta now points at the modrm byte.  In most of the cases that
   19957       follow, neither the F2 nor F3 prefixes are allowed.  However,
   19958       for some basic arithmetic operations we have to allow F2/XACQ or
   19959       F3/XREL in the case where the destination is memory and the LOCK
   19960       prefix is also present.  Do this check by looking at the modrm
   19961       byte but not advancing delta over it. */
   19962    /* By default, F2 and F3 are not allowed, so let's start off with
   19963       that setting. */
   19964    Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
   19965    { UChar tmp_modrm = getUChar(delta);
   19966      switch (opc) {
   19967         case 0x00: /* ADD Gb,Eb */  case 0x01: /* ADD Gv,Ev */
   19968         case 0x08: /* OR  Gb,Eb */  case 0x09: /* OR  Gv,Ev */
   19969         case 0x10: /* ADC Gb,Eb */  case 0x11: /* ADC Gv,Ev */
   19970         case 0x18: /* SBB Gb,Eb */  case 0x19: /* SBB Gv,Ev */
   19971         case 0x20: /* AND Gb,Eb */  case 0x21: /* AND Gv,Ev */
   19972         case 0x28: /* SUB Gb,Eb */  case 0x29: /* SUB Gv,Ev */
   19973         case 0x30: /* XOR Gb,Eb */  case 0x31: /* XOR Gv,Ev */
   19974            if (!epartIsReg(tmp_modrm)
   19975                && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
   19976               /* dst is mem, and we have F2 or F3 but not both */
   19977               validF2orF3 = True;
   19978            }
   19979            break;
   19980         default:
   19981            break;
   19982      }
   19983    }
   19984 
   19985    /* Now, in the switch below, for the opc values examined by the
   19986       switch above, use validF2orF3 rather than looking at pfx
   19987       directly. */
   19988    switch (opc) {
   19989 
   19990    case 0x00: /* ADD Gb,Eb */
   19991       if (!validF2orF3) goto decode_failure;
   19992       delta = dis_op2_G_E ( vbi, pfx, Iop_Add8, WithFlagNone, True, 1, delta, "add" );
   19993       return delta;
   19994    case 0x01: /* ADD Gv,Ev */
   19995       if (!validF2orF3) goto decode_failure;
   19996       delta = dis_op2_G_E ( vbi, pfx, Iop_Add8, WithFlagNone, True, sz, delta, "add" );
   19997       return delta;
   19998 
   19999    case 0x02: /* ADD Eb,Gb */
   20000       if (haveF2orF3(pfx)) goto decode_failure;
   20001       delta = dis_op2_E_G ( vbi, pfx, Iop_Add8, WithFlagNone, True, 1, delta, "add" );
   20002       return delta;
   20003    case 0x03: /* ADD Ev,Gv */
   20004       if (haveF2orF3(pfx)) goto decode_failure;
   20005       delta = dis_op2_E_G ( vbi, pfx, Iop_Add8, WithFlagNone, True, sz, delta, "add" );
   20006       return delta;
   20007 
   20008    case 0x04: /* ADD Ib, AL */
   20009       if (haveF2orF3(pfx)) goto decode_failure;
   20010       delta = dis_op_imm_A( 1, False, Iop_Add8, True, delta, "add" );
   20011       return delta;
   20012    case 0x05: /* ADD Iv, eAX */
   20013       if (haveF2orF3(pfx)) goto decode_failure;
   20014       delta = dis_op_imm_A(sz, False, Iop_Add8, True, delta, "add" );
   20015       return delta;
   20016 
   20017    case 0x08: /* OR Gb,Eb */
   20018       if (!validF2orF3) goto decode_failure;
   20019       delta = dis_op2_G_E ( vbi, pfx, Iop_Or8, WithFlagNone, True, 1, delta, "or" );
   20020       return delta;
   20021    case 0x09: /* OR Gv,Ev */
   20022       if (!validF2orF3) goto decode_failure;
   20023       delta = dis_op2_G_E ( vbi, pfx, Iop_Or8, WithFlagNone, True, sz, delta, "or" );
   20024       return delta;
   20025 
   20026    case 0x0A: /* OR Eb,Gb */
   20027       if (haveF2orF3(pfx)) goto decode_failure;
   20028       delta = dis_op2_E_G ( vbi, pfx, Iop_Or8, WithFlagNone, True, 1, delta, "or" );
   20029       return delta;
   20030    case 0x0B: /* OR Ev,Gv */
   20031       if (haveF2orF3(pfx)) goto decode_failure;
   20032       delta = dis_op2_E_G ( vbi, pfx, Iop_Or8, WithFlagNone, True, sz, delta, "or" );
   20033       return delta;
   20034 
   20035    case 0x0C: /* OR Ib, AL */
   20036       if (haveF2orF3(pfx)) goto decode_failure;
   20037       delta = dis_op_imm_A( 1, False, Iop_Or8, True, delta, "or" );
   20038       return delta;
   20039    case 0x0D: /* OR Iv, eAX */
   20040       if (haveF2orF3(pfx)) goto decode_failure;
   20041       delta = dis_op_imm_A( sz, False, Iop_Or8, True, delta, "or" );
   20042       return delta;
   20043 
   20044    case 0x10: /* ADC Gb,Eb */
   20045       if (!validF2orF3) goto decode_failure;
   20046       delta = dis_op2_G_E ( vbi, pfx, Iop_Add8, WithFlagCarry, True, 1, delta, "adc" );
   20047       return delta;
   20048    case 0x11: /* ADC Gv,Ev */
   20049       if (!validF2orF3) goto decode_failure;
   20050       delta = dis_op2_G_E ( vbi, pfx, Iop_Add8, WithFlagCarry, True, sz, delta, "adc" );
   20051       return delta;
   20052 
   20053    case 0x12: /* ADC Eb,Gb */
   20054       if (haveF2orF3(pfx)) goto decode_failure;
   20055       delta = dis_op2_E_G ( vbi, pfx, Iop_Add8, WithFlagCarry, True, 1, delta, "adc" );
   20056       return delta;
   20057    case 0x13: /* ADC Ev,Gv */
   20058       if (haveF2orF3(pfx)) goto decode_failure;
   20059       delta = dis_op2_E_G ( vbi, pfx, Iop_Add8, WithFlagCarry, True, sz, delta, "adc" );
   20060       return delta;
   20061 
   20062    case 0x14: /* ADC Ib, AL */
   20063       if (haveF2orF3(pfx)) goto decode_failure;
   20064       delta = dis_op_imm_A( 1, True, Iop_Add8, True, delta, "adc" );
   20065       return delta;
   20066    case 0x15: /* ADC Iv, eAX */
   20067       if (haveF2orF3(pfx)) goto decode_failure;
   20068       delta = dis_op_imm_A( sz, True, Iop_Add8, True, delta, "adc" );
   20069       return delta;
   20070 
   20071    case 0x18: /* SBB Gb,Eb */
   20072       if (!validF2orF3) goto decode_failure;
   20073       delta = dis_op2_G_E ( vbi, pfx, Iop_Sub8, WithFlagCarry, True, 1, delta, "sbb" );
   20074       return delta;
   20075    case 0x19: /* SBB Gv,Ev */
   20076       if (!validF2orF3) goto decode_failure;
   20077       delta = dis_op2_G_E ( vbi, pfx, Iop_Sub8, WithFlagCarry, True, sz, delta, "sbb" );
   20078       return delta;
   20079 
   20080    case 0x1A: /* SBB Eb,Gb */
   20081       if (haveF2orF3(pfx)) goto decode_failure;
   20082       delta = dis_op2_E_G ( vbi, pfx, Iop_Sub8, WithFlagCarry, True, 1, delta, "sbb" );
   20083       return delta;
   20084    case 0x1B: /* SBB Ev,Gv */
   20085       if (haveF2orF3(pfx)) goto decode_failure;
   20086       delta = dis_op2_E_G ( vbi, pfx, Iop_Sub8, WithFlagCarry, True, sz, delta, "sbb" );
   20087       return delta;
   20088 
   20089    case 0x1C: /* SBB Ib, AL */
   20090       if (haveF2orF3(pfx)) goto decode_failure;
   20091       delta = dis_op_imm_A( 1, True, Iop_Sub8, True, delta, "sbb" );
   20092       return delta;
   20093    case 0x1D: /* SBB Iv, eAX */
   20094       if (haveF2orF3(pfx)) goto decode_failure;
   20095       delta = dis_op_imm_A( sz, True, Iop_Sub8, True, delta, "sbb" );
   20096       return delta;
   20097 
   20098    case 0x20: /* AND Gb,Eb */
   20099       if (!validF2orF3) goto decode_failure;
   20100       delta = dis_op2_G_E ( vbi, pfx, Iop_And8, WithFlagNone, True, 1, delta, "and" );
   20101       return delta;
   20102    case 0x21: /* AND Gv,Ev */
   20103       if (!validF2orF3) goto decode_failure;
   20104       delta = dis_op2_G_E ( vbi, pfx, Iop_And8, WithFlagNone, True, sz, delta, "and" );
   20105       return delta;
   20106 
   20107    case 0x22: /* AND Eb,Gb */
   20108       if (haveF2orF3(pfx)) goto decode_failure;
   20109       delta = dis_op2_E_G ( vbi, pfx, Iop_And8, WithFlagNone, True, 1, delta, "and" );
   20110       return delta;
   20111    case 0x23: /* AND Ev,Gv */
   20112       if (haveF2orF3(pfx)) goto decode_failure;
   20113       delta = dis_op2_E_G ( vbi, pfx, Iop_And8, WithFlagNone, True, sz, delta, "and" );
   20114       return delta;
   20115 
   20116    case 0x24: /* AND Ib, AL */
   20117       if (haveF2orF3(pfx)) goto decode_failure;
   20118       delta = dis_op_imm_A( 1, False, Iop_And8, True, delta, "and" );
   20119       return delta;
   20120    case 0x25: /* AND Iv, eAX */
   20121       if (haveF2orF3(pfx)) goto decode_failure;
   20122       delta = dis_op_imm_A( sz, False, Iop_And8, True, delta, "and" );
   20123       return delta;
   20124 
   20125    case 0x28: /* SUB Gb,Eb */
   20126       if (!validF2orF3) goto decode_failure;
   20127       delta = dis_op2_G_E ( vbi, pfx, Iop_Sub8, WithFlagNone, True, 1, delta, "sub" );
   20128       return delta;
   20129    case 0x29: /* SUB Gv,Ev */
   20130       if (!validF2orF3) goto decode_failure;
   20131       delta = dis_op2_G_E ( vbi, pfx, Iop_Sub8, WithFlagNone, True, sz, delta, "sub" );
   20132       return delta;
   20133 
   20134    case 0x2A: /* SUB Eb,Gb */
   20135       if (haveF2orF3(pfx)) goto decode_failure;
   20136       delta = dis_op2_E_G ( vbi, pfx, Iop_Sub8, WithFlagNone, True, 1, delta, "sub" );
   20137       return delta;
   20138    case 0x2B: /* SUB Ev,Gv */
   20139       if (haveF2orF3(pfx)) goto decode_failure;
   20140       delta = dis_op2_E_G ( vbi, pfx, Iop_Sub8, WithFlagNone, True, sz, delta, "sub" );
   20141       return delta;
   20142 
   20143    case 0x2C: /* SUB Ib, AL */
   20144       if (haveF2orF3(pfx)) goto decode_failure;
   20145       delta = dis_op_imm_A(1, False, Iop_Sub8, True, delta, "sub" );
   20146       return delta;
   20147    case 0x2D: /* SUB Iv, eAX */
   20148       if (haveF2orF3(pfx)) goto decode_failure;
   20149       delta = dis_op_imm_A( sz, False, Iop_Sub8, True, delta, "sub" );
   20150       return delta;
   20151 
   20152    case 0x30: /* XOR Gb,Eb */
   20153       if (!validF2orF3) goto decode_failure;
   20154       delta = dis_op2_G_E ( vbi, pfx, Iop_Xor8, WithFlagNone, True, 1, delta, "xor" );
   20155       return delta;
   20156    case 0x31: /* XOR Gv,Ev */
   20157       if (!validF2orF3) goto decode_failure;
   20158       delta = dis_op2_G_E ( vbi, pfx, Iop_Xor8, WithFlagNone, True, sz, delta, "xor" );
   20159       return delta;
   20160 
   20161    case 0x32: /* XOR Eb,Gb */
   20162       if (haveF2orF3(pfx)) goto decode_failure;
   20163       delta = dis_op2_E_G ( vbi, pfx, Iop_Xor8, WithFlagNone, True, 1, delta, "xor" );
   20164       return delta;
   20165    case 0x33: /* XOR Ev,Gv */
   20166       if (haveF2orF3(pfx)) goto decode_failure;
   20167       delta = dis_op2_E_G ( vbi, pfx, Iop_Xor8, WithFlagNone, True, sz, delta, "xor" );
   20168       return delta;
   20169 
   20170    case 0x34: /* XOR Ib, AL */
   20171       if (haveF2orF3(pfx)) goto decode_failure;
   20172       delta = dis_op_imm_A( 1, False, Iop_Xor8, True, delta, "xor" );
   20173       return delta;
   20174    case 0x35: /* XOR Iv, eAX */
   20175       if (haveF2orF3(pfx)) goto decode_failure;
   20176       delta = dis_op_imm_A( sz, False, Iop_Xor8, True, delta, "xor" );
   20177       return delta;
   20178 
   20179    case 0x38: /* CMP Gb,Eb */
   20180       if (haveF2orF3(pfx)) goto decode_failure;
   20181       delta = dis_op2_G_E ( vbi, pfx, Iop_Sub8, WithFlagNone, False, 1, delta, "cmp" );
   20182       return delta;
   20183    case 0x39: /* CMP Gv,Ev */
   20184       if (haveF2orF3(pfx)) goto decode_failure;
   20185       delta = dis_op2_G_E ( vbi, pfx, Iop_Sub8, WithFlagNone, False, sz, delta, "cmp" );
   20186       return delta;
   20187 
   20188    case 0x3A: /* CMP Eb,Gb */
   20189       if (haveF2orF3(pfx)) goto decode_failure;
   20190       delta = dis_op2_E_G ( vbi, pfx, Iop_Sub8, WithFlagNone, False, 1, delta, "cmp" );
   20191       return delta;
   20192    case 0x3B: /* CMP Ev,Gv */
   20193       if (haveF2orF3(pfx)) goto decode_failure;
   20194       delta = dis_op2_E_G ( vbi, pfx, Iop_Sub8, WithFlagNone, False, sz, delta, "cmp" );
   20195       return delta;
   20196 
   20197    case 0x3C: /* CMP Ib, AL */
   20198       if (haveF2orF3(pfx)) goto decode_failure;
   20199       delta = dis_op_imm_A( 1, False, Iop_Sub8, False, delta, "cmp" );
   20200       return delta;
   20201    case 0x3D: /* CMP Iv, eAX */
   20202       if (haveF2orF3(pfx)) goto decode_failure;
   20203       delta = dis_op_imm_A( sz, False, Iop_Sub8, False, delta, "cmp" );
   20204       return delta;
   20205 
   20206    case 0x50: /* PUSH eAX */
   20207    case 0x51: /* PUSH eCX */
   20208    case 0x52: /* PUSH eDX */
   20209    case 0x53: /* PUSH eBX */
   20210    case 0x55: /* PUSH eBP */
   20211    case 0x56: /* PUSH eSI */
   20212    case 0x57: /* PUSH eDI */
   20213    case 0x54: /* PUSH eSP */
   20214       /* This is the Right Way, in that the value to be pushed is
   20215          established before %rsp is changed, so that pushq %rsp
   20216          correctly pushes the old value. */
   20217       if (haveF2orF3(pfx)) goto decode_failure;
   20218       vassert(sz == 2 || sz == 4 || sz == 8);
   20219       if (sz == 4)
   20220          sz = 8; /* there is no encoding for 32-bit push in 64-bit mode */
   20221       ty = sz==2 ? Ity_I16 : Ity_I64;
   20222       t1 = newTemp(ty);
   20223       t2 = newTemp(Ity_I64);
   20224       assign(t1, getIRegRexB(sz, pfx, opc-0x50));
   20225       assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(sz)));
   20226       putIReg64(R_RSP, mkexpr(t2) );
   20227       storeLE(mkexpr(t2),mkexpr(t1));
   20228       DIP("push%c %s\n", nameISize(sz), nameIRegRexB(sz,pfx,opc-0x50));
   20229       return delta;
   20230 
   20231    case 0x58: /* POP eAX */
   20232    case 0x59: /* POP eCX */
   20233    case 0x5A: /* POP eDX */
   20234    case 0x5B: /* POP eBX */
   20235    case 0x5D: /* POP eBP */
   20236    case 0x5E: /* POP eSI */
   20237    case 0x5F: /* POP eDI */
   20238    case 0x5C: /* POP eSP */
   20239       if (haveF2orF3(pfx)) goto decode_failure;
   20240       vassert(sz == 2 || sz == 4 || sz == 8);
   20241       if (sz == 4)
   20242          sz = 8; /* there is no encoding for 32-bit pop in 64-bit mode */
   20243       t1 = newTemp(szToITy(sz));
   20244       t2 = newTemp(Ity_I64);
   20245       assign(t2, getIReg64(R_RSP));
   20246       assign(t1, loadLE(szToITy(sz),mkexpr(t2)));
   20247       putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t2), mkU64(sz)));
   20248       putIRegRexB(sz, pfx, opc-0x58, mkexpr(t1));
   20249       DIP("pop%c %s\n", nameISize(sz), nameIRegRexB(sz,pfx,opc-0x58));
   20250       return delta;
   20251 
   20252    case 0x63: /* MOVSX */
   20253       if (haveF2orF3(pfx)) goto decode_failure;
   20254       if (haveREX(pfx) && 1==getRexW(pfx)) {
   20255          vassert(sz == 8);
   20256          /* movsx r/m32 to r64 */
   20257          modrm = getUChar(delta);
   20258          if (epartIsReg(modrm)) {
   20259             delta++;
   20260             putIRegG(8, pfx, modrm,
   20261                              unop(Iop_32Sto64,
   20262                                   getIRegE(4, pfx, modrm)));
   20263             DIP("movslq %s,%s\n",
   20264                 nameIRegE(4, pfx, modrm),
   20265                 nameIRegG(8, pfx, modrm));
   20266             return delta;
   20267          } else {
   20268             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   20269             delta += alen;
   20270             putIRegG(8, pfx, modrm,
   20271                              unop(Iop_32Sto64,
   20272                                   loadLE(Ity_I32, mkexpr(addr))));
   20273             DIP("movslq %s,%s\n", dis_buf,
   20274                 nameIRegG(8, pfx, modrm));
   20275             return delta;
   20276          }
   20277       } else {
   20278          goto decode_failure;
   20279       }
   20280 
   20281    case 0x68: /* PUSH Iv */
   20282       if (haveF2orF3(pfx)) goto decode_failure;
   20283       /* Note, sz==4 is not possible in 64-bit mode.  Hence ... */
   20284       if (sz == 4) sz = 8;
   20285       d64 = getSDisp(imin(4,sz),delta);
   20286       delta += imin(4,sz);
   20287       goto do_push_I;
   20288 
   20289    case 0x69: /* IMUL Iv, Ev, Gv */
   20290       if (haveF2orF3(pfx)) goto decode_failure;
   20291       delta = dis_imul_I_E_G ( vbi, pfx, sz, delta, sz );
   20292       return delta;
   20293 
   20294    case 0x6A: /* PUSH Ib, sign-extended to sz */
   20295       if (haveF2orF3(pfx)) goto decode_failure;
   20296       /* Note, sz==4 is not possible in 64-bit mode.  Hence ... */
   20297       if (sz == 4) sz = 8;
   20298       d64 = getSDisp8(delta); delta += 1;
   20299       goto do_push_I;
   20300    do_push_I:
   20301       ty = szToITy(sz);
   20302       t1 = newTemp(Ity_I64);
   20303       t2 = newTemp(ty);
   20304       assign( t1, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
   20305       putIReg64(R_RSP, mkexpr(t1) );
   20306       /* stop mkU16 asserting if d64 is a negative 16-bit number
   20307          (bug #132813) */
   20308       if (ty == Ity_I16)
   20309          d64 &= 0xFFFF;
   20310       storeLE( mkexpr(t1), mkU(ty,d64) );
   20311       DIP("push%c $%lld\n", nameISize(sz), (Long)d64);
   20312       return delta;
   20313 
   20314    case 0x6B: /* IMUL Ib, Ev, Gv */
   20315       delta = dis_imul_I_E_G ( vbi, pfx, sz, delta, 1 );
   20316       return delta;
   20317 
   20318    case 0x70:
   20319    case 0x71:
   20320    case 0x72:   /* JBb/JNAEb (jump below) */
   20321    case 0x73:   /* JNBb/JAEb (jump not below) */
   20322    case 0x74:   /* JZb/JEb (jump zero) */
   20323    case 0x75:   /* JNZb/JNEb (jump not zero) */
   20324    case 0x76:   /* JBEb/JNAb (jump below or equal) */
   20325    case 0x77:   /* JNBEb/JAb (jump not below or equal) */
   20326    case 0x78:   /* JSb (jump negative) */
   20327    case 0x79:   /* JNSb (jump not negative) */
   20328    case 0x7A:   /* JP (jump parity even) */
   20329    case 0x7B:   /* JNP/JPO (jump parity odd) */
   20330    case 0x7C:   /* JLb/JNGEb (jump less) */
   20331    case 0x7D:   /* JGEb/JNLb (jump greater or equal) */
   20332    case 0x7E:   /* JLEb/JNGb (jump less or equal) */
   20333    case 0x7F: { /* JGb/JNLEb (jump greater) */
   20334       Long   jmpDelta;
   20335       const HChar* comment  = "";
   20336       if (haveF3(pfx)) goto decode_failure;
   20337       if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
   20338       jmpDelta = getSDisp8(delta);
   20339       vassert(-128 <= jmpDelta && jmpDelta < 128);
   20340       d64 = (guest_RIP_bbstart+delta+1) + jmpDelta;
   20341       delta++;
   20342       if (resteerCisOk
   20343           && vex_control.guest_chase_cond
   20344           && (Addr64)d64 != (Addr64)guest_RIP_bbstart
   20345           && jmpDelta < 0
   20346           && resteerOkFn( callback_opaque, (Addr64)d64) ) {
   20347          /* Speculation: assume this backward branch is taken.  So we
   20348             need to emit a side-exit to the insn following this one,
   20349             on the negation of the condition, and continue at the
   20350             branch target address (d64).  If we wind up back at the
   20351             first instruction of the trace, just stop; it's better to
   20352             let the IR loop unroller handle that case. */
   20353          stmt( IRStmt_Exit(
   20354                   mk_amd64g_calculate_condition(
   20355                      (AMD64Condcode)(1 ^ (opc - 0x70))),
   20356                   Ijk_Boring,
   20357                   IRConst_U64(guest_RIP_bbstart+delta),
   20358                   OFFB_RIP ) );
   20359          dres->whatNext   = Dis_ResteerC;
   20360          dres->continueAt = d64;
   20361          comment = "(assumed taken)";
   20362       }
   20363       else
   20364       if (resteerCisOk
   20365           && vex_control.guest_chase_cond
   20366           && (Addr64)d64 != (Addr64)guest_RIP_bbstart
   20367           && jmpDelta >= 0
   20368           && resteerOkFn( callback_opaque, guest_RIP_bbstart+delta ) ) {
   20369          /* Speculation: assume this forward branch is not taken.  So
   20370             we need to emit a side-exit to d64 (the dest) and continue
   20371             disassembling at the insn immediately following this
   20372             one. */
   20373          stmt( IRStmt_Exit(
   20374                   mk_amd64g_calculate_condition((AMD64Condcode)(opc - 0x70)),
   20375                   Ijk_Boring,
   20376                   IRConst_U64(d64),
   20377                   OFFB_RIP ) );
   20378          dres->whatNext   = Dis_ResteerC;
   20379          dres->continueAt = guest_RIP_bbstart+delta;
   20380          comment = "(assumed not taken)";
   20381       }
   20382       else {
   20383          /* Conservative default translation - end the block at this
   20384             point. */
   20385          jcc_01( dres, (AMD64Condcode)(opc - 0x70),
   20386                  guest_RIP_bbstart+delta, d64 );
   20387          vassert(dres->whatNext == Dis_StopHere);
   20388       }
   20389       DIP("j%s-8 0x%llx %s\n", name_AMD64Condcode(opc - 0x70), (ULong)d64,
   20390           comment);
   20391       return delta;
   20392    }
   20393 
   20394    case 0x80: /* Grp1 Ib,Eb */
   20395       modrm = getUChar(delta);
   20396       /* Disallow F2/XACQ and F3/XREL for the non-mem case.  Allow
   20397          just one for the mem case and also require LOCK in this case.
   20398          Note that this erroneously allows XACQ/XREL on CMP since we
   20399          don't check the subopcode here.  No big deal. */
   20400       if (epartIsReg(modrm) && haveF2orF3(pfx))
   20401          goto decode_failure;
   20402       if (!epartIsReg(modrm) && haveF2andF3(pfx))
   20403          goto decode_failure;
   20404       if (!epartIsReg(modrm) && haveF2orF3(pfx) && !haveLOCK(pfx))
   20405          goto decode_failure;
   20406       am_sz = lengthAMode(pfx,delta);
   20407       sz    = 1;
   20408       d_sz  = 1;
   20409       d64   = getSDisp8(delta + am_sz);
   20410       delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
   20411       return delta;
   20412 
   20413    case 0x81: /* Grp1 Iv,Ev */
   20414       modrm = getUChar(delta);
   20415       /* Same comment as for case 0x80 just above. */
   20416       if (epartIsReg(modrm) && haveF2orF3(pfx))
   20417          goto decode_failure;
   20418       if (!epartIsReg(modrm) && haveF2andF3(pfx))
   20419          goto decode_failure;
   20420       if (!epartIsReg(modrm) && haveF2orF3(pfx) && !haveLOCK(pfx))
   20421          goto decode_failure;
   20422       am_sz = lengthAMode(pfx,delta);
   20423       d_sz  = imin(sz,4);
   20424       d64   = getSDisp(d_sz, delta + am_sz);
   20425       delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
   20426       return delta;
   20427 
   20428    case 0x83: /* Grp1 Ib,Ev */
   20429       if (haveF2orF3(pfx)) goto decode_failure;
   20430       modrm = getUChar(delta);
   20431       am_sz = lengthAMode(pfx,delta);
   20432       d_sz  = 1;
   20433       d64   = getSDisp8(delta + am_sz);
   20434       delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
   20435       return delta;
   20436 
   20437    case 0x84: /* TEST Eb,Gb */
   20438       if (haveF2orF3(pfx)) goto decode_failure;
   20439       delta = dis_op2_E_G ( vbi, pfx, Iop_And8, WithFlagNone, False,
   20440                             1, delta, "test" );
   20441       return delta;
   20442 
   20443    case 0x85: /* TEST Ev,Gv */
   20444       if (haveF2orF3(pfx)) goto decode_failure;
   20445       delta = dis_op2_E_G ( vbi, pfx, Iop_And8, WithFlagNone, False,
   20446                             sz, delta, "test" );
   20447       return delta;
   20448 
   20449    /* XCHG reg,mem automatically asserts LOCK# even without a LOCK
   20450       prefix.  Therefore, generate CAS regardless of the presence or
   20451       otherwise of a LOCK prefix. */
   20452    case 0x86: /* XCHG Gb,Eb */
   20453       sz = 1;
   20454       /* Fall through ... */
   20455    case 0x87: /* XCHG Gv,Ev */
   20456       modrm = getUChar(delta);
   20457       /* Check whether F2 or F3 are allowable.  For the mem case, one
   20458          or the other but not both are.  We don't care about the
   20459          presence of LOCK in this case -- XCHG is unusual in this
   20460          respect. */
   20461       if (haveF2orF3(pfx)) {
   20462          if (epartIsReg(modrm)) {
   20463             goto decode_failure;
   20464          } else {
   20465             if (haveF2andF3(pfx))
   20466                goto decode_failure;
   20467          }
   20468       }
   20469       ty = szToITy(sz);
   20470       t1 = newTemp(ty); t2 = newTemp(ty);
   20471       if (epartIsReg(modrm)) {
   20472          assign(t1, getIRegE(sz, pfx, modrm));
   20473          assign(t2, getIRegG(sz, pfx, modrm));
   20474          putIRegG(sz, pfx, modrm, mkexpr(t1));
   20475          putIRegE(sz, pfx, modrm, mkexpr(t2));
   20476          delta++;
   20477          DIP("xchg%c %s, %s\n",
   20478              nameISize(sz), nameIRegG(sz, pfx, modrm),
   20479                             nameIRegE(sz, pfx, modrm));
   20480       } else {
   20481          *expect_CAS = True;
   20482          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   20483          assign( t1, loadLE(ty, mkexpr(addr)) );
   20484          assign( t2, getIRegG(sz, pfx, modrm) );
   20485          casLE( mkexpr(addr),
   20486                 mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
   20487          putIRegG( sz, pfx, modrm, mkexpr(t1) );
   20488          delta += alen;
   20489          DIP("xchg%c %s, %s\n", nameISize(sz),
   20490                                 nameIRegG(sz, pfx, modrm), dis_buf);
   20491       }
   20492       return delta;
   20493 
   20494    case 0x88: { /* MOV Gb,Eb */
   20495       /* We let dis_mov_G_E decide whether F3(XRELEASE) is allowable. */
   20496       Bool ok = True;
   20497       delta = dis_mov_G_E(vbi, pfx, 1, delta, &ok);
   20498       if (!ok) goto decode_failure;
   20499       return delta;
   20500    }
   20501 
   20502    case 0x89: { /* MOV Gv,Ev */
   20503       /* We let dis_mov_G_E decide whether F3(XRELEASE) is allowable. */
   20504       Bool ok = True;
   20505       delta = dis_mov_G_E(vbi, pfx, sz, delta, &ok);
   20506       if (!ok) goto decode_failure;
   20507       return delta;
   20508    }
   20509 
   20510    case 0x8A: /* MOV Eb,Gb */
   20511       if (haveF2orF3(pfx)) goto decode_failure;
   20512       delta = dis_mov_E_G(vbi, pfx, 1, delta);
   20513       return delta;
   20514 
   20515    case 0x8B: /* MOV Ev,Gv */
   20516       if (haveF2orF3(pfx)) goto decode_failure;
   20517       delta = dis_mov_E_G(vbi, pfx, sz, delta);
   20518       return delta;
   20519 
   20520    case 0x8C: /* MOV S,E -- MOV from a SEGMENT REGISTER */
   20521       if (haveF2orF3(pfx)) goto decode_failure;
   20522       delta = dis_mov_S_E(vbi, pfx, sz, delta);
   20523       return delta;
   20524 
   20525    case 0x8D: /* LEA M,Gv */
   20526       if (haveF2orF3(pfx)) goto decode_failure;
   20527       if (sz != 4 && sz != 8)
   20528          goto decode_failure;
   20529       modrm = getUChar(delta);
   20530       if (epartIsReg(modrm))
   20531          goto decode_failure;
   20532       /* NOTE!  this is the one place where a segment override prefix
   20533          has no effect on the address calculation.  Therefore we clear
   20534          any segment override bits in pfx. */
   20535       addr = disAMode ( &alen, vbi, clearSegBits(pfx), delta, dis_buf, 0 );
   20536       delta += alen;
   20537       /* This is a hack.  But it isn't clear that really doing the
   20538          calculation at 32 bits is really worth it.  Hence for leal,
   20539          do the full 64-bit calculation and then truncate it. */
   20540       putIRegG( sz, pfx, modrm,
   20541                          sz == 4
   20542                             ? unop(Iop_64to32, mkexpr(addr))
   20543                             : mkexpr(addr)
   20544               );
   20545       DIP("lea%c %s, %s\n", nameISize(sz), dis_buf,
   20546                             nameIRegG(sz,pfx,modrm));
   20547       return delta;
   20548 
   20549    case 0x8F: { /* POPQ m64 / POPW m16 */
   20550       Int   len;
   20551       UChar rm;
   20552       /* There is no encoding for 32-bit pop in 64-bit mode.
   20553          So sz==4 actually means sz==8. */
   20554       if (haveF2orF3(pfx)) goto decode_failure;
   20555       vassert(sz == 2 || sz == 4
   20556               || /* tolerate redundant REX.W, see #210481 */ sz == 8);
   20557       if (sz == 4) sz = 8;
   20558       if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists
   20559 
   20560       rm = getUChar(delta);
   20561 
   20562       /* make sure this instruction is correct POP */
   20563       if (epartIsReg(rm) || gregLO3ofRM(rm) != 0)
   20564          goto decode_failure;
   20565       /* and has correct size */
   20566       vassert(sz == 8);
   20567 
   20568       t1 = newTemp(Ity_I64);
   20569       t3 = newTemp(Ity_I64);
   20570       assign( t1, getIReg64(R_RSP) );
   20571       assign( t3, loadLE(Ity_I64, mkexpr(t1)) );
   20572 
   20573       /* Increase RSP; must be done before the STORE.  Intel manual
   20574          says: If the RSP register is used as a base register for
   20575          addressing a destination operand in memory, the POP
   20576          instruction computes the effective address of the operand
   20577          after it increments the RSP register.  */
   20578       putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t1), mkU64(sz)) );
   20579 
   20580       addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
   20581       storeLE( mkexpr(addr), mkexpr(t3) );
   20582 
   20583       DIP("popl %s\n", dis_buf);
   20584 
   20585       delta += len;
   20586       return delta;
   20587    }
   20588 
   20589    case 0x90: /* XCHG eAX,eAX */
   20590       /* detect and handle F3 90 (rep nop) specially */
   20591       if (!have66(pfx) && !haveF2(pfx) && haveF3(pfx)) {
   20592          DIP("rep nop (P4 pause)\n");
   20593          /* "observe" the hint.  The Vex client needs to be careful not
   20594             to cause very long delays as a result, though. */
   20595          jmp_lit(dres, Ijk_Yield, guest_RIP_bbstart+delta);
   20596          vassert(dres->whatNext == Dis_StopHere);
   20597          return delta;
   20598       }
   20599       /* detect and handle NOPs specially */
   20600       if (/* F2/F3 probably change meaning completely */
   20601           !haveF2orF3(pfx)
   20602           /* If REX.B is 1, we're not exchanging rAX with itself */
   20603           && getRexB(pfx)==0 ) {
   20604          DIP("nop\n");
   20605          return delta;
   20606       }
   20607       /* else fall through to normal case. */
   20608    case 0x91: /* XCHG rAX,rCX */
   20609    case 0x92: /* XCHG rAX,rDX */
   20610    case 0x93: /* XCHG rAX,rBX */
   20611    case 0x94: /* XCHG rAX,rSP */
   20612    case 0x95: /* XCHG rAX,rBP */
   20613    case 0x96: /* XCHG rAX,rSI */
   20614    case 0x97: /* XCHG rAX,rDI */
   20615       /* guard against mutancy */
   20616       if (haveF2orF3(pfx)) goto decode_failure;
   20617       codegen_xchg_rAX_Reg ( pfx, sz, opc - 0x90 );
   20618       return delta;
   20619 
   20620    case 0x98: /* CBW */
   20621       if (haveF2orF3(pfx)) goto decode_failure;
   20622       if (sz == 8) {
   20623          putIRegRAX( 8, unop(Iop_32Sto64, getIRegRAX(4)) );
   20624          DIP(/*"cdqe\n"*/"cltq");
   20625          return delta;
   20626       }
   20627       if (sz == 4) {
   20628          putIRegRAX( 4, unop(Iop_16Sto32, getIRegRAX(2)) );
   20629          DIP("cwtl\n");
   20630          return delta;
   20631       }
   20632       if (sz == 2) {
   20633          putIRegRAX( 2, unop(Iop_8Sto16, getIRegRAX(1)) );
   20634          DIP("cbw\n");
   20635          return delta;
   20636       }
   20637       goto decode_failure;
   20638 
   case 0x99: /* CWD/CDQ/CQO */
      /* rDX := rAX >>s (bits-1), i.e. rDX is filled with copies of
         rAX's sign bit. */
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 2 || sz == 4 || sz == 8);
      ty = szToITy(sz);
      putIRegRDX( sz,
                  binop(mkSizedOp(ty,Iop_Sar8),
                        getIRegRAX(sz),
                        mkU8(sz == 2 ? 15 : (sz == 4 ? 31 : 63))) );
      DIP(sz == 2 ? "cwd\n"
                  : (sz == 4 ? /*"cdq\n"*/ "cltd\n"
                             : "cqo\n"));
      return delta;

   case 0x9B: /* FWAIT (X87 insn) */
      /* ignore?  No x87 exception state is modelled, so there is
         nothing to wait for. */
      DIP("fwait\n");
      return delta;
   20656 
   case 0x9C: /* PUSHF */ {
      /* Note.  There is no encoding for a 32-bit pushf in 64-bit
         mode.  So sz==4 actually means sz==8. */
      /* 24 July 06: has also been seen with a redundant REX prefix,
         so must also allow sz==8. */
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 2 || sz == 4 || sz == 8);
      if (sz == 4) sz = 8;
      if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists

      /* rsp -= sz; the flag image is stored at the new rsp below. */
      t1 = newTemp(Ity_I64);
      assign( t1, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
      putIReg64(R_RSP, mkexpr(t1) );

      /* Compute the current rflags value from the flags thunk. */
      t2 = newTemp(Ity_I64);
      assign( t2, mk_amd64g_calculate_rflags_all() );

      /* Patch in the D flag.  This can simply be a copy of bit 10 of
         baseBlock[OFFB_DFLAG]. */
      t3 = newTemp(Ity_I64);
      assign( t3, binop(Iop_Or64,
                        mkexpr(t2),
                        binop(Iop_And64,
                              IRExpr_Get(OFFB_DFLAG,Ity_I64),
                              mkU64(1<<10)))
            );

      /* And patch in the ID flag.  OFFB_IDFLAG holds 0 or 1 (see the
         POPF case); shift it up to bit 21 of the image. */
      t4 = newTemp(Ity_I64);
      assign( t4, binop(Iop_Or64,
                        mkexpr(t3),
                        binop(Iop_And64,
                              binop(Iop_Shl64, IRExpr_Get(OFFB_IDFLAG,Ity_I64),
                                               mkU8(21)),
                              mkU64(1<<21)))
            );

      /* And patch in the AC flag too (bit 18, same 0/1 scheme). */
      t5 = newTemp(Ity_I64);
      assign( t5, binop(Iop_Or64,
                        mkexpr(t4),
                        binop(Iop_And64,
                              binop(Iop_Shl64, IRExpr_Get(OFFB_ACFLAG,Ity_I64),
                                               mkU8(18)),
                              mkU64(1<<18)))
            );

      /* if sz==2, the stored value needs to be narrowed. */
      if (sz == 2)
        storeLE( mkexpr(t1), unop(Iop_32to16,
                             unop(Iop_64to32,mkexpr(t5))) );
      else
        storeLE( mkexpr(t1), mkexpr(t5) );

      DIP("pushf%c\n", nameISize(sz));
      return delta;
   }
   20714 
   case 0x9D: /* POPF */
      /* Note.  There is no encoding for a 32-bit popf in 64-bit mode.
         So sz==4 actually means sz==8. */
      if (haveF2orF3(pfx)) goto decode_failure;
      /* NOTE(review): unlike PUSHF above, a redundant REX.W (sz==8)
         is not tolerated here and would hit this vassert -- TODO
         confirm that is intentional. */
      vassert(sz == 2 || sz == 4);
      if (sz == 4) sz = 8;
      if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists
      /* Pop the flag word into t1; t2 holds the old rsp. */
      t1 = newTemp(Ity_I64); t2 = newTemp(Ity_I64);
      assign(t2, getIReg64(R_RSP));
      assign(t1, widenUto64(loadLE(szToITy(sz),mkexpr(t2))));
      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t2), mkU64(sz)));
      /* t1 is the flag word.  Mask out everything except OSZACP and
         set the flags thunk to AMD64G_CC_OP_COPY. */
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1,
                        binop(Iop_And64,
                              mkexpr(t1),
                              mkU64( AMD64G_CC_MASK_C | AMD64G_CC_MASK_P
                                     | AMD64G_CC_MASK_A | AMD64G_CC_MASK_Z
                                     | AMD64G_CC_MASK_S| AMD64G_CC_MASK_O )
                             )
                       )
          );

      /* Also need to set the D flag, which is held in bit 10 of t1.
         If zero, put 1 in OFFB_DFLAG, else -1 in OFFB_DFLAG. */
      stmt( IRStmt_Put(
               OFFB_DFLAG,
               IRExpr_ITE(
                  unop(Iop_64to1,
                       binop(Iop_And64,
                             binop(Iop_Shr64, mkexpr(t1), mkU8(10)),
                             mkU64(1))),
                  mkU64(0xFFFFFFFFFFFFFFFFULL),
                  mkU64(1)))
          );

      /* And set the ID flag (bit 21 of t1; stored as 0 or 1). */
      stmt( IRStmt_Put(
               OFFB_IDFLAG,
               IRExpr_ITE(
                  unop(Iop_64to1,
                       binop(Iop_And64,
                             binop(Iop_Shr64, mkexpr(t1), mkU8(21)),
                             mkU64(1))),
                  mkU64(1),
                  mkU64(0)))
          );

      /* And set the AC flag too (bit 18 of t1; stored as 0 or 1). */
      stmt( IRStmt_Put(
               OFFB_ACFLAG,
               IRExpr_ITE(
                  unop(Iop_64to1,
                       binop(Iop_And64,
                             binop(Iop_Shr64, mkexpr(t1), mkU8(18)),
                             mkU64(1))),
                  mkU64(1),
                  mkU64(0)))
          );

      DIP("popf%c\n", nameISize(sz));
      return delta;
   20779 
   case 0x9E: /* SAHF */
      /* Copy AH into the low byte of the flags (helper emits the
         flag-thunk updates). */
      codegen_SAHF();
      DIP("sahf\n");
      return delta;

   case 0x9F: /* LAHF */
      /* Copy the low flag byte into AH. */
      codegen_LAHF();
      DIP("lahf\n");
      return delta;

   case 0xA0: /* MOV Ob,AL */
      if (have66orF2orF3(pfx)) goto decode_failure;
      sz = 1;
      /* Fall through ... */
   case 0xA1: /* MOV Ov,eAX */
      /* Load rAX from an absolute 64-bit address (moffs form). */
      if (sz != 8 && sz != 4 && sz != 2 && sz != 1)
         goto decode_failure;
      d64 = getDisp64(delta);
      delta += 8;
      ty = szToITy(sz);
      addr = newTemp(Ity_I64);
      /* handleAddrOverrides applies any segment/address override. */
      assign( addr, handleAddrOverrides(vbi, pfx, mkU64(d64)) );
      putIRegRAX(sz, loadLE( ty, mkexpr(addr) ));
      DIP("mov%c %s0x%llx, %s\n", nameISize(sz),
                                  segRegTxt(pfx), (ULong)d64,
                                  nameIRegRAX(sz));
      return delta;

   case 0xA2: /* MOV AL,Ob */
      if (have66orF2orF3(pfx)) goto decode_failure;
      sz = 1;
      /* Fall through ... */
   case 0xA3: /* MOV eAX,Ov */
      /* Store rAX to an absolute 64-bit address (moffs form). */
      if (sz != 8 && sz != 4 && sz != 2 && sz != 1)
         goto decode_failure;
      d64 = getDisp64(delta);
      delta += 8;
      ty = szToITy(sz);
      addr = newTemp(Ity_I64);
      assign( addr, handleAddrOverrides(vbi, pfx, mkU64(d64)) );
      storeLE( mkexpr(addr), getIRegRAX(sz) );
      DIP("mov%c %s, %s0x%llx\n", nameISize(sz), nameIRegRAX(sz),
                                  segRegTxt(pfx), (ULong)d64);
      return delta;
   20824 
   case 0xA4:
   case 0xA5:
      /* MOVS{B,W,L,Q}.  For opc 0xA4 the element size is forced to 1,
         otherwise it follows the operand size. */
      /* F3 A4: rep movsb */
      if (haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xA4)
            sz = 1;
         dis_REP_op ( dres, AMD64CondAlways, dis_MOVS, sz,
                      guest_RIP_curr_instr,
                      guest_RIP_bbstart+delta, "rep movs", pfx );
        dres->whatNext = Dis_StopHere;
        return delta;
      }
      /* A4: movsb */
      if (!haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xA4)
            sz = 1;
         dis_string_op( dis_MOVS, sz, "movs", pfx );
         return delta;
      }
      goto decode_failure;

   case 0xA6:
   case 0xA7:
      /* CMPS: only the F3 (repe) form is handled; the plain and F2
         forms fall through to decode_failure. */
      /* F3 A6/A7: repe cmps/rep cmps{w,l,q} */
      if (haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xA6)
            sz = 1;
         dis_REP_op ( dres, AMD64CondZ, dis_CMPS, sz,
                      guest_RIP_curr_instr,
                      guest_RIP_bbstart+delta, "repe cmps", pfx );
         dres->whatNext = Dis_StopHere;
         return delta;
      }
      goto decode_failure;

   case 0xAA:
   case 0xAB:
      /* STOS{B,W,L,Q}, with or without REP. */
      /* F3 AA/AB: rep stosb/rep stos{w,l,q} */
      if (haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xAA)
            sz = 1;
         dis_REP_op ( dres, AMD64CondAlways, dis_STOS, sz,
                      guest_RIP_curr_instr,
                      guest_RIP_bbstart+delta, "rep stos", pfx );
         vassert(dres->whatNext == Dis_StopHere);
         return delta;
      }
      /* AA/AB: stosb/stos{w,l,q} */
      if (!haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xAA)
            sz = 1;
         dis_string_op( dis_STOS, sz, "stos", pfx );
         return delta;
      }
      goto decode_failure;

   case 0xA8: /* TEST Ib, AL */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( 1, False, Iop_And8, False, delta, "test" );
      return delta;
   case 0xA9: /* TEST Iv, eAX */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( sz, False, Iop_And8, False, delta, "test" );
      return delta;

   case 0xAC: /* LODS, no REP prefix */
   case 0xAD:
      /* NOTE(review): no haveF2orF3 rejection here, unlike the other
         string ops -- TODO confirm prefixed LODS is intended to be
         accepted. */
      dis_string_op( dis_LODS, ( opc == 0xAC ? 1 : sz ), "lods", pfx );
      return delta;

   case 0xAE:
   case 0xAF:
      /* SCAS{B,W,L,Q}: repne, repe and plain forms. */
      /* F2 AE/AF: repne scasb/repne scas{w,l,q} */
      if (haveF2(pfx) && !haveF3(pfx)) {
         if (opc == 0xAE)
            sz = 1;
         dis_REP_op ( dres, AMD64CondNZ, dis_SCAS, sz,
                      guest_RIP_curr_instr,
                      guest_RIP_bbstart+delta, "repne scas", pfx );
         vassert(dres->whatNext == Dis_StopHere);
         return delta;
      }
      /* F3 AE/AF: repe scasb/repe scas{w,l,q} */
      if (!haveF2(pfx) && haveF3(pfx)) {
         if (opc == 0xAE)
            sz = 1;
         dis_REP_op ( dres, AMD64CondZ, dis_SCAS, sz,
                      guest_RIP_curr_instr,
                      guest_RIP_bbstart+delta, "repe scas", pfx );
         vassert(dres->whatNext == Dis_StopHere);
         return delta;
      }
      /* AE/AF: scasb/scas{w,l,q} */
      if (!haveF2(pfx) && !haveF3(pfx)) {
         if (opc == 0xAE)
            sz = 1;
         dis_string_op( dis_SCAS, sz, "scas", pfx );
         return delta;
      }
      goto decode_failure;
   20925 
   /* XXXX be careful here with moves to AH/BH/CH/DH */
   case 0xB0: /* MOV imm,AL */
   case 0xB1: /* MOV imm,CL */
   case 0xB2: /* MOV imm,DL */
   case 0xB3: /* MOV imm,BL */
   case 0xB4: /* MOV imm,AH */
   case 0xB5: /* MOV imm,CH */
   case 0xB6: /* MOV imm,DH */
   case 0xB7: /* MOV imm,BH */
      /* B0+r: 8-bit immediate into the byte register selected by
         opc-0xB0 (with REX.B handled by putIRegRexB). */
      if (haveF2orF3(pfx)) goto decode_failure;
      d64 = getUChar(delta);
      delta += 1;
      putIRegRexB(1, pfx, opc-0xB0, mkU8(d64));
      DIP("movb $%lld,%s\n", d64, nameIRegRexB(1,pfx,opc-0xB0));
      return delta;

   case 0xB8: /* MOV imm,eAX */
   case 0xB9: /* MOV imm,eCX */
   case 0xBA: /* MOV imm,eDX */
   case 0xBB: /* MOV imm,eBX */
   case 0xBC: /* MOV imm,eSP */
   case 0xBD: /* MOV imm,eBP */
   case 0xBE: /* MOV imm,eSI */
   case 0xBF: /* MOV imm,eDI */
      /* This is the one-and-only place where 64-bit literals are
         allowed in the instruction stream. */
      if (haveF2orF3(pfx)) goto decode_failure;
      if (sz == 8) {
         d64 = getDisp64(delta);
         delta += 8;
         putIRegRexB(8, pfx, opc-0xB8, mkU64(d64));
         DIP("movabsq $%lld,%s\n", (Long)d64,
                                   nameIRegRexB(8,pfx,opc-0xB8));
      } else {
         /* 16/32-bit form: imin(4,sz)-byte immediate, sign-extended
            then masked back down to the operand size. */
         d64 = getSDisp(imin(4,sz),delta);
         delta += imin(4,sz);
         putIRegRexB(sz, pfx, opc-0xB8,
                         mkU(szToITy(sz), d64 & mkSizeMask(sz)));
         DIP("mov%c $%lld,%s\n", nameISize(sz),
                                 (Long)d64,
                                 nameIRegRexB(sz,pfx,opc-0xB8));
      }
      return delta;
   20969 
   case 0xC0: { /* Grp2 Ib,Eb */
      /* Shift/rotate group: byte-sized E operand, imm8 count. */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 1;
      d64   = getUChar(delta + am_sz);
      sz    = 1;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         mkU8(d64 & 0xFF), NULL, &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }

   case 0xC1: { /* Grp2 Ib,Ev */
      /* As 0xC0 but with an operand-size-wide E operand. */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 1;
      d64   = getUChar(delta + am_sz);
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         mkU8(d64 & 0xFF), NULL, &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }

   case 0xC2: /* RET imm16 */
      /* Near return, popping an extra imm16 bytes of stack. */
      if (have66orF3(pfx)) goto decode_failure;
      if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
      d64 = getUDisp16(delta);
      delta += 2;
      dis_ret(dres, vbi, d64);
      DIP("ret $%lld\n", d64);
      return delta;

   case 0xC3: /* RET */
      if (have66(pfx)) goto decode_failure;
      /* F3 is acceptable on AMD. */
      if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
      dis_ret(dres, vbi, 0);
      DIP(haveF3(pfx) ? "rep ; ret\n" : "ret\n");
      return delta;
   21013 
   21014    case 0xC6: /* C6 /0 = MOV Ib,Eb */
   21015       sz = 1;
   21016       goto maybe_do_Mov_I_E;
   21017    case 0xC7: /* C7 /0 = MOV Iv,Ev */
   21018       goto maybe_do_Mov_I_E;
   21019    maybe_do_Mov_I_E:
   21020       modrm = getUChar(delta);
   21021       if (gregLO3ofRM(modrm) == 0) {
   21022          if (epartIsReg(modrm)) {
   21023             /* Neither F2 nor F3 are allowable. */
   21024             if (haveF2orF3(pfx)) goto decode_failure;
   21025             delta++; /* mod/rm byte */
   21026             d64 = getSDisp(imin(4,sz),delta);
   21027             delta += imin(4,sz);
   21028             putIRegE(sz, pfx, modrm,
   21029                          mkU(szToITy(sz), d64 & mkSizeMask(sz)));
   21030             DIP("mov%c $%lld, %s\n", nameISize(sz),
   21031                                      (Long)d64,
   21032                                      nameIRegE(sz,pfx,modrm));
   21033          } else {
   21034             if (haveF2(pfx)) goto decode_failure;
   21035             /* F3(XRELEASE) is allowable here */
   21036             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
   21037                               /*xtra*/imin(4,sz) );
   21038             delta += alen;
   21039             d64 = getSDisp(imin(4,sz),delta);
   21040             delta += imin(4,sz);
   21041             storeLE(mkexpr(addr),
   21042                     mkU(szToITy(sz), d64 & mkSizeMask(sz)));
   21043             DIP("mov%c $%lld, %s\n", nameISize(sz), (Long)d64, dis_buf);
   21044          }
   21045          return delta;
   21046       }
   21047       /* BEGIN HACKY SUPPORT FOR xbegin */
   21048       if (opc == 0xC7 && modrm == 0xF8 && !have66orF2orF3(pfx) && sz == 4
   21049           && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
   21050          delta++; /* mod/rm byte */
   21051          d64 = getSDisp(4,delta);
   21052          delta += 4;
   21053          guest_RIP_next_mustcheck = True;
   21054          guest_RIP_next_assumed   = guest_RIP_bbstart + delta;
   21055          Addr64 failAddr = guest_RIP_bbstart + delta + d64;
   21056          /* EAX contains the failure status code.  Bit 3 is "Set if an
   21057             internal buffer overflowed", which seems like the
   21058             least-bogus choice we can make here. */
   21059          putIRegRAX(4, mkU32(1<<3));
   21060          /* And jump to the fail address. */
   21061          jmp_lit(dres, Ijk_Boring, failAddr);
   21062          vassert(dres->whatNext == Dis_StopHere);
   21063          DIP("xbeginq 0x%llx\n", failAddr);
   21064          return delta;
   21065       }
   21066       /* END HACKY SUPPORT FOR xbegin */
   21067       /* BEGIN HACKY SUPPORT FOR xabort */
   21068       if (opc == 0xC6 && modrm == 0xF8 && !have66orF2orF3(pfx) && sz == 1
   21069           && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
   21070          delta++; /* mod/rm byte */
   21071          abyte = getUChar(delta); delta++;
   21072          /* There is never a real transaction in progress, so do nothing. */
   21073          DIP("xabort $%d", (Int)abyte);
   21074          return delta;
   21075       }
   21076       /* END HACKY SUPPORT FOR xabort */
   21077       goto decode_failure;
   21078 
   case 0xC8: /* ENTER */
      /* Same comments re operand size as for LEAVE below apply.
         Also, only handles the case "enter $imm16, $0"; other cases
         for the second operand (nesting depth) are not handled. */
      if (sz != 4)
         goto decode_failure;
      d64 = getUDisp16(delta);
      delta += 2;
      vassert(d64 >= 0 && d64 <= 0xFFFF);
      /* The nesting-depth byte must be zero. */
      if (getUChar(delta) != 0)
         goto decode_failure;
      delta++;
      /* Intel docs seem to suggest:
           push rbp
           temp = rsp
           rbp = temp
           rsp = rsp - imm16
      */
      t1 = newTemp(Ity_I64);
      assign(t1, getIReg64(R_RBP));
      t2 = newTemp(Ity_I64);
      assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
      putIReg64(R_RSP, mkexpr(t2));
      storeLE(mkexpr(t2), mkexpr(t1));
      putIReg64(R_RBP, mkexpr(t2));
      if (d64 > 0) {
         putIReg64(R_RSP, binop(Iop_Sub64, mkexpr(t2), mkU64(d64)));
      }
      DIP("enter $%u, $0\n", (UInt)d64);
      return delta;

   case 0xC9: /* LEAVE */
      /* In 64-bit mode this defaults to a 64-bit operand size.  There
         is no way to encode a 32-bit variant.  Hence sz==4 but we do
         it as if sz=8. */
      if (sz != 4)
         goto decode_failure;
      t1 = newTemp(Ity_I64);
      t2 = newTemp(Ity_I64);
      assign(t1, getIReg64(R_RBP));
      /* First PUT RSP looks redundant, but need it because RSP must
         always be up-to-date for Memcheck to work... */
      putIReg64(R_RSP, mkexpr(t1));
      assign(t2, loadLE(Ity_I64,mkexpr(t1)));
      putIReg64(R_RBP, mkexpr(t2));
      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t1), mkU64(8)) );
      DIP("leave\n");
      return delta;

   case 0xCC: /* INT 3 */
      /* Deliver SIGTRAP, resuming at the address after the insn. */
      jmp_lit(dres, Ijk_SigTRAP, guest_RIP_bbstart + delta);
      vassert(dres->whatNext == Dis_StopHere);
      DIP("int $0x3\n");
      return delta;

   case 0xCD: /* INT imm8 */
      d64 = getUChar(delta); delta++;

      /* Handle int $0xD2 (Solaris fasttrap syscalls).  All other
         vectors are not modelled. */
      if (d64 == 0xD2) {
         jmp_lit(dres, Ijk_Sys_int210, guest_RIP_bbstart + delta);
         vassert(dres->whatNext == Dis_StopHere);
         DIP("int $0xD2\n");
         return delta;
      }
      goto decode_failure;
   21145 
   case 0xD0: { /* Grp2 1,Eb */
      /* Shift/rotate a byte-sized E operand by a constant 1. */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 0;
      d64   = 1;
      sz    = 1;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         mkU8(d64), NULL, &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }

   case 0xD1: { /* Grp2 1,Ev */
      /* As 0xD0 but operand-size-wide. */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 0;
      d64   = 1;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         mkU8(d64), NULL, &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }

   case 0xD2: { /* Grp2 CL,Eb */
      /* Shift/rotate a byte-sized E operand by %cl. */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 0;
      sz    = 1;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         getIRegCL(), "%cl", &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }

   case 0xD3: { /* Grp2 CL,Ev */
      /* As 0xD2 but operand-size-wide. */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 0;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         getIRegCL(), "%cl", &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }

   case 0xD8: /* X87 instructions */
   case 0xD9:
   case 0xDA:
   case 0xDB:
   case 0xDC:
   case 0xDD:
   case 0xDE:
   case 0xDF: {
      Bool redundantREXWok = False;

      if (haveF2orF3(pfx))
         goto decode_failure;

      /* kludge to tolerate redundant rex.w prefixes (should do this
         properly one day) */
      /* mono 1.1.18.1 produces 48 D9 FA, which is rex.w fsqrt */
      if ( (opc == 0xD9 && getUChar(delta+0) == 0xFA)/*fsqrt*/ )
         redundantREXWok = True;

      /* Accept sz==4 (the norm), sz==8 only for the tolerated rex.w
         fsqrt above, and sz==2 only for the few 0xDD forms below. */
      Bool size_OK = False;
      if ( sz == 4 )
         size_OK = True;
      else if ( sz == 8 )
         size_OK = redundantREXWok;
      else if ( sz == 2 ) {
         int mod_rm = getUChar(delta+0);
         int reg = gregLO3ofRM(mod_rm);
         /* The HotSpot JVM uses these */
         if ( (opc == 0xDD) && (reg == 0 /* FLDL   */ ||
                                reg == 4 /* FNSAVE */ ||
                                reg == 6 /* FRSTOR */ ) )
            size_OK = True;
      }
      /* AMD manual says 0x66 size override is ignored, except where
         it is meaningful */
      if (!size_OK)
         goto decode_failure;

      Bool decode_OK = False;
      delta = dis_FPU ( &decode_OK, vbi, pfx, delta );
      if (!decode_OK)
         goto decode_failure;

      return delta;
   }
   21243 
   case 0xE0: /* LOOPNE disp8: decrement count, jump if count != 0 && ZF==0 */
   case 0xE1: /* LOOPE  disp8: decrement count, jump if count != 0 && ZF==1 */
   case 0xE2: /* LOOP   disp8: decrement count, jump if count != 0 */
    { /* The docs say this uses rCX as a count depending on the
         address size override, not the operand one. */
      IRExpr* zbit  = NULL;
      IRExpr* count = NULL;
      IRExpr* cond  = NULL;
      const HChar* xtra = NULL;

      if (have66orF2orF3(pfx) || 1==getRexW(pfx)) goto decode_failure;
      /* So at this point we've rejected any variants which appear to
         be governed by the usual operand-size modifiers.  Hence only
         the address size prefix can have an effect.  It changes the
         size from 64 (default) to 32. */
      d64 = guest_RIP_bbstart+delta+1 + getSDisp8(delta);
      delta++;
      if (haveASO(pfx)) {
         /* 64to32 of 64-bit get is merely a get-put improvement
            trick. */
         putIReg32(R_RCX, binop(Iop_Sub32,
                                unop(Iop_64to32, getIReg64(R_RCX)),
                                mkU32(1)));
      } else {
         putIReg64(R_RCX, binop(Iop_Sub64, getIReg64(R_RCX), mkU64(1)));
      }

      /* This is correct, both for 32- and 64-bit versions.  If we're
         doing a 32-bit dec and the result is zero then the default
         zero extension rule will cause the upper 32 bits to be zero
         too.  Hence a 64-bit check against zero is OK. */
      count = getIReg64(R_RCX);
      cond = binop(Iop_CmpNE64, count, mkU64(0));
      switch (opc) {
         case 0xE2:
            xtra = "";
            break;
         case 0xE1:
            xtra = "e";
            zbit = mk_amd64g_calculate_condition( AMD64CondZ );
            cond = mkAnd1(cond, zbit);
            break;
         case 0xE0:
            xtra = "ne";
            zbit = mk_amd64g_calculate_condition( AMD64CondNZ );
            cond = mkAnd1(cond, zbit);
            break;
         default:
            vassert(0);
      }
      /* Conditional side-exit to the branch target; otherwise control
         falls through to the next insn. */
      stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U64(d64), OFFB_RIP) );

      DIP("loop%s%s 0x%llx\n", xtra, haveASO(pfx) ? "l" : "", (ULong)d64);
      return delta;
    }

   case 0xE3:
      /* JRCXZ or JECXZ, depending address size override. */
      if (have66orF2orF3(pfx)) goto decode_failure;
      d64 = (guest_RIP_bbstart+delta+1) + getSDisp8(delta);
      delta++;
      if (haveASO(pfx)) {
         /* 32-bit */
         stmt( IRStmt_Exit( binop(Iop_CmpEQ64,
                                  unop(Iop_32Uto64, getIReg32(R_RCX)),
                                  mkU64(0)),
                            Ijk_Boring,
                            IRConst_U64(d64),
                            OFFB_RIP
             ));
         DIP("jecxz 0x%llx\n", (ULong)d64);
      } else {
         /* 64-bit */
         stmt( IRStmt_Exit( binop(Iop_CmpEQ64,
                                  getIReg64(R_RCX),
                                  mkU64(0)),
                            Ijk_Boring,
                            IRConst_U64(d64),
                            OFFB_RIP
               ));
         DIP("jrcxz 0x%llx\n", (ULong)d64);
      }
      return delta;
   21327 
   case 0xE4: /* IN imm8, AL */
      /* All IN forms set up sz (width) and t1 (port number) and then
         share the do_IN tail below. */
      sz = 1;
      t1 = newTemp(Ity_I64);
      abyte = getUChar(delta); delta++;
      assign(t1, mkU64( abyte & 0xFF ));
      DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIRegRAX(sz));
      goto do_IN;
   case 0xE5: /* IN imm8, eAX */
      if (!(sz == 2 || sz == 4)) goto decode_failure;
      t1 = newTemp(Ity_I64);
      abyte = getUChar(delta); delta++;
      assign(t1, mkU64( abyte & 0xFF ));
      DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIRegRAX(sz));
      goto do_IN;
   case 0xEC: /* IN %DX, AL */
      sz = 1;
      t1 = newTemp(Ity_I64);
      assign(t1, unop(Iop_16Uto64, getIRegRDX(2)));
      DIP("in%c %s,%s\n", nameISize(sz), nameIRegRDX(2),
                                         nameIRegRAX(sz));
      goto do_IN;
   case 0xED: /* IN %DX, eAX */
      if (!(sz == 2 || sz == 4)) goto decode_failure;
      t1 = newTemp(Ity_I64);
      assign(t1, unop(Iop_16Uto64, getIRegRDX(2)));
      DIP("in%c %s,%s\n", nameISize(sz), nameIRegRDX(2),
                                         nameIRegRAX(sz));
      goto do_IN;
   do_IN: {
      /* At this point, sz indicates the width, and t1 is a 64-bit
         value giving port number. */
      IRDirty* d;
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 1 || sz == 2 || sz == 4);
      ty = szToITy(sz);
      t2 = newTemp(Ity_I64);
      /* The port read is performed by a dirty helper, since it has
         effects outside the IR world. */
      d = unsafeIRDirty_1_N(
             t2,
             0/*regparms*/,
             "amd64g_dirtyhelper_IN",
             &amd64g_dirtyhelper_IN,
             mkIRExprVec_2( mkexpr(t1), mkU64(sz) )
          );
      /* do the call, dumping the result in t2. */
      stmt( IRStmt_Dirty(d) );
      putIRegRAX(sz, narrowTo( ty, mkexpr(t2) ) );
      return delta;
   }
   21376 
   21377    case 0xE6: /* OUT AL, imm8 */
   21378       sz = 1;
   21379       t1 = newTemp(Ity_I64);
   21380       abyte = getUChar(delta); delta++;
   21381       assign( t1, mkU64( abyte & 0xFF ) );
   21382       DIP("out%c %s,$%d\n", nameISize(sz), nameIRegRAX(sz), (Int)abyte);
   21383       goto do_OUT;
   21384    case 0xE7: /* OUT eAX, imm8 */
   21385       if (!(sz == 2 || sz == 4)) goto decode_failure;
   21386       t1 = newTemp(Ity_I64);
   21387       abyte = getUChar(delta); delta++;
   21388       assign( t1, mkU64( abyte & 0xFF ) );
   21389       DIP("out%c %s,$%d\n", nameISize(sz), nameIRegRAX(sz), (Int)abyte);
   21390       goto do_OUT;
   21391    case 0xEE: /* OUT AL, %DX */
   21392       sz = 1;
   21393       t1 = newTemp(Ity_I64);
   21394       assign( t1, unop(Iop_16Uto64, getIRegRDX(2)) );
   21395       DIP("out%c %s,%s\n", nameISize(sz), nameIRegRAX(sz),
   21396                                           nameIRegRDX(2));
   21397       goto do_OUT;
   21398    case 0xEF: /* OUT eAX, %DX */
   21399       if (!(sz == 2 || sz == 4)) goto decode_failure;
   21400       t1 = newTemp(Ity_I64);
   21401       assign( t1, unop(Iop_16Uto64, getIRegRDX(2)) );
   21402       DIP("out%c %s,%s\n", nameISize(sz), nameIRegRAX(sz),
   21403                                           nameIRegRDX(2));
   21404       goto do_OUT;
   21405    do_OUT: {
   21406       /* At this point, sz indicates the width, and t1 is a 64-bit
   21407          value giving port number. */
   21408       IRDirty* d;
   21409       if (haveF2orF3(pfx)) goto decode_failure;
   21410       vassert(sz == 1 || sz == 2 || sz == 4);
   21411       ty = szToITy(sz);
   21412       d = unsafeIRDirty_0_N(
   21413              0/*regparms*/,
   21414              "amd64g_dirtyhelper_OUT",
   21415              &amd64g_dirtyhelper_OUT,
   21416              mkIRExprVec_3( mkexpr(t1),
   21417                             widenUto64( getIRegRAX(sz) ),
   21418                             mkU64(sz) )
   21419           );
   21420       stmt( IRStmt_Dirty(d) );
   21421       return delta;
   21422    }
   21423 
   21424    case 0xE8: /* CALL J4 */
   21425       if (haveF3(pfx)) goto decode_failure;
   21426       if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
   21427       d64 = getSDisp32(delta); delta += 4;
   21428       d64 += (guest_RIP_bbstart+delta);
   21429       /* (guest_RIP_bbstart+delta) == return-to addr, d64 == call-to addr */
   21430       t1 = newTemp(Ity_I64);
   21431       assign(t1, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
   21432       putIReg64(R_RSP, mkexpr(t1));
   21433       storeLE( mkexpr(t1), mkU64(guest_RIP_bbstart+delta));
   21434       t2 = newTemp(Ity_I64);
   21435       assign(t2, mkU64((Addr64)d64));
   21436       make_redzone_AbiHint(vbi, t1, t2/*nia*/, "call-d32");
   21437       if (resteerOkFn( callback_opaque, (Addr64)d64) ) {
   21438          /* follow into the call target. */
   21439          dres->whatNext   = Dis_ResteerU;
   21440          dres->continueAt = d64;
   21441       } else {
   21442          jmp_lit(dres, Ijk_Call, d64);
   21443          vassert(dres->whatNext == Dis_StopHere);
   21444       }
   21445       DIP("call 0x%llx\n", (ULong)d64);
   21446       return delta;
   21447 
   21448    case 0xE9: /* Jv (jump, 16/32 offset) */
   21449       if (haveF3(pfx)) goto decode_failure;
   21450       if (sz != 4)
   21451          goto decode_failure; /* JRS added 2004 July 11 */
   21452       if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
   21453       d64 = (guest_RIP_bbstart+delta+sz) + getSDisp(sz,delta);
   21454       delta += sz;
   21455       if (resteerOkFn(callback_opaque, (Addr64)d64)) {
   21456          dres->whatNext   = Dis_ResteerU;
   21457          dres->continueAt = d64;
   21458       } else {
   21459          jmp_lit(dres, Ijk_Boring, d64);
   21460          vassert(dres->whatNext == Dis_StopHere);
   21461       }
   21462       DIP("jmp 0x%llx\n", (ULong)d64);
   21463       return delta;
   21464 
   21465    case 0xEB: /* Jb (jump, byte offset) */
   21466       if (haveF3(pfx)) goto decode_failure;
   21467       if (sz != 4)
   21468          goto decode_failure; /* JRS added 2004 July 11 */
   21469       if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
   21470       d64 = (guest_RIP_bbstart+delta+1) + getSDisp8(delta);
   21471       delta++;
   21472       if (resteerOkFn(callback_opaque, (Addr64)d64)) {
   21473          dres->whatNext   = Dis_ResteerU;
   21474          dres->continueAt = d64;
   21475       } else {
   21476          jmp_lit(dres, Ijk_Boring, d64);
   21477          vassert(dres->whatNext == Dis_StopHere);
   21478       }
   21479       DIP("jmp-8 0x%llx\n", (ULong)d64);
   21480       return delta;
   21481 
   21482    case 0xF5: /* CMC */
   21483    case 0xF8: /* CLC */
   21484    case 0xF9: /* STC */
   21485       t1 = newTemp(Ity_I64);
   21486       t2 = newTemp(Ity_I64);
   21487       assign( t1, mk_amd64g_calculate_rflags_all() );
   21488       switch (opc) {
   21489          case 0xF5:
   21490             assign( t2, binop(Iop_Xor64, mkexpr(t1),
   21491                                          mkU64(AMD64G_CC_MASK_C)));
   21492             DIP("cmc\n");
   21493             break;
   21494          case 0xF8:
   21495             assign( t2, binop(Iop_And64, mkexpr(t1),
   21496                                          mkU64(~AMD64G_CC_MASK_C)));
   21497             DIP("clc\n");
   21498             break;
   21499          case 0xF9:
   21500             assign( t2, binop(Iop_Or64, mkexpr(t1),
   21501                                         mkU64(AMD64G_CC_MASK_C)));
   21502             DIP("stc\n");
   21503             break;
   21504          default:
   21505             vpanic("disInstr(x64)(cmc/clc/stc)");
   21506       }
   21507       stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   21508       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   21509       stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t2) ));
   21510       /* Set NDEP even though it isn't used.  This makes redundant-PUT
   21511          elimination of previous stores to this field work better. */
   21512       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   21513       return delta;
   21514 
   21515    case 0xF6: { /* Grp3 Eb */
   21516       Bool decode_OK = True;
   21517       /* RM'd: if (haveF2orF3(pfx)) goto decode_failure; */
   21518       /* We now let dis_Grp3 itself decide if F2 and/or F3 are valid */
   21519       delta = dis_Grp3 ( vbi, pfx, 1, delta, &decode_OK );
   21520       if (!decode_OK) goto decode_failure;
   21521       return delta;
   21522    }
   21523 
   21524    case 0xF7: { /* Grp3 Ev */
   21525       Bool decode_OK = True;
   21526       /* RM'd: if (haveF2orF3(pfx)) goto decode_failure; */
   21527       /* We now let dis_Grp3 itself decide if F2 and/or F3 are valid */
   21528       delta = dis_Grp3 ( vbi, pfx, sz, delta, &decode_OK );
   21529       if (!decode_OK) goto decode_failure;
   21530       return delta;
   21531    }
   21532 
   21533    case 0xFC: /* CLD */
   21534       if (haveF2orF3(pfx)) goto decode_failure;
   21535       stmt( IRStmt_Put( OFFB_DFLAG, mkU64(1)) );
   21536       DIP("cld\n");
   21537       return delta;
   21538 
   21539    case 0xFD: /* STD */
   21540       if (haveF2orF3(pfx)) goto decode_failure;
   21541       stmt( IRStmt_Put( OFFB_DFLAG, mkU64(-1ULL)) );
   21542       DIP("std\n");
   21543       return delta;
   21544 
   21545    case 0xFE: { /* Grp4 Eb */
   21546       Bool decode_OK = True;
   21547       /* RM'd: if (haveF2orF3(pfx)) goto decode_failure; */
   21548       /* We now let dis_Grp4 itself decide if F2 and/or F3 are valid */
   21549       delta = dis_Grp4 ( vbi, pfx, delta, &decode_OK );
   21550       if (!decode_OK) goto decode_failure;
   21551       return delta;
   21552    }
   21553 
   21554    case 0xFF: { /* Grp5 Ev */
   21555       Bool decode_OK = True;
   21556       /* RM'd: if (haveF2orF3(pfx)) goto decode_failure; */
   21557       /* We now let dis_Grp5 itself decide if F2 and/or F3 are valid */
   21558       delta = dis_Grp5 ( vbi, pfx, sz, delta, dres, &decode_OK );
   21559       if (!decode_OK) goto decode_failure;
   21560       return delta;
   21561    }
   21562 
   21563    default:
   21564       break;
   21565 
   21566    }
   21567 
   21568   decode_failure:
   21569    return deltaIN; /* fail */
   21570 }
   21571 
   21572 
   21573 /*------------------------------------------------------------*/
   21574 /*---                                                      ---*/
   21575 /*--- Top-level post-escape decoders: dis_ESC_0F           ---*/
   21576 /*---                                                      ---*/
   21577 /*------------------------------------------------------------*/
   21578 
   21579 static IRTemp math_BSWAP ( IRTemp t1, IRType ty )
   21580 {
   21581    IRTemp t2 = newTemp(ty);
   21582    if (ty == Ity_I64) {
   21583       IRTemp m8  = newTemp(Ity_I64);
   21584       IRTemp s8  = newTemp(Ity_I64);
   21585       IRTemp m16 = newTemp(Ity_I64);
   21586       IRTemp s16 = newTemp(Ity_I64);
   21587       IRTemp m32 = newTemp(Ity_I64);
   21588       assign( m8, mkU64(0xFF00FF00FF00FF00ULL) );
   21589       assign( s8,
   21590               binop(Iop_Or64,
   21591                     binop(Iop_Shr64,
   21592                           binop(Iop_And64,mkexpr(t1),mkexpr(m8)),
   21593                           mkU8(8)),
   21594                     binop(Iop_And64,
   21595                           binop(Iop_Shl64,mkexpr(t1),mkU8(8)),
   21596                           mkexpr(m8))
   21597                    )
   21598             );
   21599 
   21600       assign( m16, mkU64(0xFFFF0000FFFF0000ULL) );
   21601       assign( s16,
   21602               binop(Iop_Or64,
   21603                     binop(Iop_Shr64,
   21604                           binop(Iop_And64,mkexpr(s8),mkexpr(m16)),
   21605                           mkU8(16)),
   21606                     binop(Iop_And64,
   21607                           binop(Iop_Shl64,mkexpr(s8),mkU8(16)),
   21608                           mkexpr(m16))
   21609                    )
   21610             );
   21611 
   21612       assign( m32, mkU64(0xFFFFFFFF00000000ULL) );
   21613       assign( t2,
   21614               binop(Iop_Or64,
   21615                     binop(Iop_Shr64,
   21616                           binop(Iop_And64,mkexpr(s16),mkexpr(m32)),
   21617                           mkU8(32)),
   21618                     binop(Iop_And64,
   21619                           binop(Iop_Shl64,mkexpr(s16),mkU8(32)),
   21620                           mkexpr(m32))
   21621                    )
   21622             );
   21623       return t2;
   21624    }
   21625    if (ty == Ity_I32) {
   21626       assign( t2,
   21627          binop(
   21628             Iop_Or32,
   21629             binop(Iop_Shl32, mkexpr(t1), mkU8(24)),
   21630             binop(
   21631                Iop_Or32,
   21632                binop(Iop_And32, binop(Iop_Shl32, mkexpr(t1), mkU8(8)),
   21633                                 mkU32(0x00FF0000)),
   21634                binop(Iop_Or32,
   21635                      binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(8)),
   21636                                       mkU32(0x0000FF00)),
   21637                      binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(24)),
   21638                                       mkU32(0x000000FF) )
   21639             )))
   21640       );
   21641       return t2;
   21642    }
   21643    if (ty == Ity_I16) {
   21644       assign(t2,
   21645              binop(Iop_Or16,
   21646                    binop(Iop_Shl16, mkexpr(t1), mkU8(8)),
   21647                    binop(Iop_Shr16, mkexpr(t1), mkU8(8)) ));
   21648       return t2;
   21649    }
   21650    vassert(0);
   21651    /*NOTREACHED*/
   21652    return IRTemp_INVALID;
   21653 }
   21654 
   21655 
   21656 __attribute__((noinline))
   21657 static
   21658 Long dis_ESC_0F (
   21659         /*MB_OUT*/DisResult* dres,
   21660         /*MB_OUT*/Bool*      expect_CAS,
   21661         Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
   21662         Bool         resteerCisOk,
   21663         void*        callback_opaque,
   21664         const VexArchInfo* archinfo,
   21665         const VexAbiInfo*  vbi,
   21666         Prefix pfx, Int sz, Long deltaIN
   21667      )
   21668 {
   21669    Long   d64   = 0;
   21670    IRTemp addr  = IRTemp_INVALID;
   21671    IRTemp t1    = IRTemp_INVALID;
   21672    IRTemp t2    = IRTemp_INVALID;
   21673    UChar  modrm = 0;
   21674    Int    am_sz = 0;
   21675    Int    alen  = 0;
   21676    HChar  dis_buf[50];
   21677 
   21678    /* In the first switch, look for ordinary integer insns. */
   21679    Long   delta = deltaIN;
   21680    UChar  opc   = getUChar(delta);
   21681    delta++;
   21682    switch (opc) { /* first switch */
   21683 
   21684    case 0x01:
   21685    {
   21686       modrm = getUChar(delta);
   21687       /* 0F 01 /0 -- SGDT */
   21688       /* 0F 01 /1 -- SIDT */
   21689       if (!epartIsReg(modrm)
   21690           && (gregLO3ofRM(modrm) == 0 || gregLO3ofRM(modrm) == 1)) {
   21691          /* This is really revolting, but ... since each processor
   21692             (core) only has one IDT and one GDT, just let the guest
   21693             see it (pass-through semantics).  I can't see any way to
   21694             construct a faked-up value, so don't bother to try. */
   21695          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   21696          delta += alen;
   21697          switch (gregLO3ofRM(modrm)) {
   21698             case 0: DIP("sgdt %s\n", dis_buf); break;
   21699             case 1: DIP("sidt %s\n", dis_buf); break;
   21700             default: vassert(0); /*NOTREACHED*/
   21701          }
   21702          IRDirty* d = unsafeIRDirty_0_N (
   21703                           0/*regparms*/,
   21704                           "amd64g_dirtyhelper_SxDT",
   21705                           &amd64g_dirtyhelper_SxDT,
   21706                           mkIRExprVec_2( mkexpr(addr),
   21707                                          mkU64(gregLO3ofRM(modrm)) )
   21708                       );
   21709          /* declare we're writing memory */
   21710          d->mFx   = Ifx_Write;
   21711          d->mAddr = mkexpr(addr);
   21712          d->mSize = 6;
   21713          stmt( IRStmt_Dirty(d) );
   21714          return delta;
   21715       }
   21716       /* 0F 01 D0 = XGETBV */
   21717       if (modrm == 0xD0 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
   21718          delta += 1;
   21719          DIP("xgetbv\n");
   21720          /* Fault (SEGV) if ECX isn't zero.  Intel docs say #GP and I
   21721             am not sure if that translates in to SEGV or to something
   21722             else, in user space. */
   21723          t1 = newTemp(Ity_I32);
   21724          assign( t1, getIReg32(R_RCX) );
   21725          stmt( IRStmt_Exit(binop(Iop_CmpNE32, mkexpr(t1), mkU32(0)),
   21726                            Ijk_SigSEGV,
   21727                            IRConst_U64(guest_RIP_curr_instr),
   21728                            OFFB_RIP
   21729          ));
   21730          putIRegRAX(4, mkU32(7));
   21731          putIRegRDX(4, mkU32(0));
   21732          return delta;
   21733       }
   21734       /* BEGIN HACKY SUPPORT FOR xend */
   21735       /* 0F 01 D5 = XEND */
   21736       if (modrm == 0xD5 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
   21737          /* We are never in an transaction (xbegin immediately aborts).
   21738             So this just always generates a General Protection Fault. */
   21739          delta += 1;
   21740          jmp_lit(dres, Ijk_SigSEGV, guest_RIP_bbstart + delta);
   21741          vassert(dres->whatNext == Dis_StopHere);
   21742          DIP("xend\n");
   21743          return delta;
   21744       }
   21745       /* END HACKY SUPPORT FOR xend */
   21746       /* BEGIN HACKY SUPPORT FOR xtest */
   21747       /* 0F 01 D6 = XTEST */
   21748       if (modrm == 0xD6 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
   21749          /* Sets ZF because there never is a transaction, and all
   21750             CF, OF, SF, PF and AF are always cleared by xtest. */
   21751          delta += 1;
   21752          DIP("xtest\n");
   21753          stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   21754          stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   21755          stmt( IRStmt_Put( OFFB_CC_DEP1, mkU64(AMD64G_CC_MASK_Z) ));
   21756          /* Set NDEP even though it isn't used.  This makes redundant-PUT
   21757             elimination of previous stores to this field work better. */
   21758          stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   21759          return delta;
   21760       }
   21761       /* END HACKY SUPPORT FOR xtest */
   21762       /* 0F 01 F9 = RDTSCP */
   21763       if (modrm == 0xF9 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_RDTSCP)) {
   21764          delta += 1;
   21765          /* Uses dirty helper:
   21766             void amd64g_dirtyhelper_RDTSCP ( VexGuestAMD64State* )
   21767             declared to wr rax, rcx, rdx
   21768          */
   21769          const HChar* fName = "amd64g_dirtyhelper_RDTSCP";
   21770          void*        fAddr = &amd64g_dirtyhelper_RDTSCP;
   21771          IRDirty* d
   21772             = unsafeIRDirty_0_N ( 0/*regparms*/,
   21773                                   fName, fAddr, mkIRExprVec_1(IRExpr_GSPTR()) );
   21774          /* declare guest state effects */
   21775          d->nFxState = 3;
   21776          vex_bzero(&d->fxState, sizeof(d->fxState));
   21777          d->fxState[0].fx     = Ifx_Write;
   21778          d->fxState[0].offset = OFFB_RAX;
   21779          d->fxState[0].size   = 8;
   21780          d->fxState[1].fx     = Ifx_Write;
   21781          d->fxState[1].offset = OFFB_RCX;
   21782          d->fxState[1].size   = 8;
   21783          d->fxState[2].fx     = Ifx_Write;
   21784          d->fxState[2].offset = OFFB_RDX;
   21785          d->fxState[2].size   = 8;
   21786          /* execute the dirty call, side-effecting guest state */
   21787          stmt( IRStmt_Dirty(d) );
   21788          /* RDTSCP is a serialising insn.  So, just in case someone is
   21789             using it as a memory fence ... */
   21790          stmt( IRStmt_MBE(Imbe_Fence) );
   21791          DIP("rdtscp\n");
   21792          return delta;
   21793       }
   21794       /* else decode failed */
   21795       break;
   21796    }
   21797 
   21798    case 0x05: /* SYSCALL */
   21799       guest_RIP_next_mustcheck = True;
   21800       guest_RIP_next_assumed = guest_RIP_bbstart + delta;
   21801       putIReg64( R_RCX, mkU64(guest_RIP_next_assumed) );
   21802       /* It's important that all guest state is up-to-date
   21803          at this point.  So we declare an end-of-block here, which
   21804          forces any cached guest state to be flushed. */
   21805       jmp_lit(dres, Ijk_Sys_syscall, guest_RIP_next_assumed);
   21806       vassert(dres->whatNext == Dis_StopHere);
   21807       DIP("syscall\n");
   21808       return delta;
   21809 
   21810    case 0x0B: /* UD2 */
   21811       stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_curr_instr) ) );
   21812       jmp_lit(dres, Ijk_NoDecode, guest_RIP_curr_instr);
   21813       vassert(dres->whatNext == Dis_StopHere);
   21814       DIP("ud2\n");
   21815       return delta;
   21816 
   21817    case 0x0D: /* 0F 0D /0 -- prefetch mem8 */
   21818               /* 0F 0D /1 -- prefetchw mem8 */
   21819       if (have66orF2orF3(pfx)) goto decode_failure;
   21820       modrm = getUChar(delta);
   21821       if (epartIsReg(modrm)) goto decode_failure;
   21822       if (gregLO3ofRM(modrm) != 0 && gregLO3ofRM(modrm) != 1)
   21823          goto decode_failure;
   21824       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   21825       delta += alen;
   21826       switch (gregLO3ofRM(modrm)) {
   21827          case 0: DIP("prefetch %s\n", dis_buf); break;
   21828          case 1: DIP("prefetchw %s\n", dis_buf); break;
   21829          default: vassert(0); /*NOTREACHED*/
   21830       }
   21831       return delta;
   21832 
   21833    case 0x19:
   21834    case 0x1C:
   21835    case 0x1D:
   21836    case 0x1E:
   21837    case 0x1F:
   21838       // Intel CET instructions can have any prefixes before NOPs
   21839       // and can use any ModRM, SIB and disp
   21840       modrm = getUChar(delta);
   21841       if (epartIsReg(modrm)) {
   21842          delta += 1;
   21843          DIP("nop%c\n", nameISize(sz));
   21844       } else {
   21845          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   21846          delta += alen;
   21847          DIP("nop%c %s\n", nameISize(sz), dis_buf);
   21848       }
   21849       return delta;
   21850 
   21851    case 0x31: { /* RDTSC */
   21852       IRTemp   val  = newTemp(Ity_I64);
   21853       IRExpr** args = mkIRExprVec_0();
   21854       IRDirty* d    = unsafeIRDirty_1_N (
   21855                          val,
   21856                          0/*regparms*/,
   21857                          "amd64g_dirtyhelper_RDTSC",
   21858                          &amd64g_dirtyhelper_RDTSC,
   21859                          args
   21860                       );
   21861       if (have66orF2orF3(pfx)) goto decode_failure;
   21862       /* execute the dirty call, dumping the result in val. */
   21863       stmt( IRStmt_Dirty(d) );
   21864       putIRegRDX(4, unop(Iop_64HIto32, mkexpr(val)));
   21865       putIRegRAX(4, unop(Iop_64to32, mkexpr(val)));
   21866       DIP("rdtsc\n");
   21867       return delta;
   21868    }
   21869 
   21870    case 0x40:
   21871    case 0x41:
   21872    case 0x42: /* CMOVBb/CMOVNAEb (cmov below) */
   21873    case 0x43: /* CMOVNBb/CMOVAEb (cmov not below) */
   21874    case 0x44: /* CMOVZb/CMOVEb (cmov zero) */
   21875    case 0x45: /* CMOVNZb/CMOVNEb (cmov not zero) */
   21876    case 0x46: /* CMOVBEb/CMOVNAb (cmov below or equal) */
   21877    case 0x47: /* CMOVNBEb/CMOVAb (cmov not below or equal) */
   21878    case 0x48: /* CMOVSb (cmov negative) */
   21879    case 0x49: /* CMOVSb (cmov not negative) */
   21880    case 0x4A: /* CMOVP (cmov parity even) */
   21881    case 0x4B: /* CMOVNP (cmov parity odd) */
   21882    case 0x4C: /* CMOVLb/CMOVNGEb (cmov less) */
   21883    case 0x4D: /* CMOVGEb/CMOVNLb (cmov greater or equal) */
   21884    case 0x4E: /* CMOVLEb/CMOVNGb (cmov less or equal) */
   21885    case 0x4F: /* CMOVGb/CMOVNLEb (cmov greater) */
   21886       if (haveF2orF3(pfx)) goto decode_failure;
   21887       delta = dis_cmov_E_G(vbi, pfx, sz, (AMD64Condcode)(opc - 0x40), delta);
   21888       return delta;
   21889 
   21890    case 0x80:
   21891    case 0x81:
   21892    case 0x82:   /* JBb/JNAEb (jump below) */
   21893    case 0x83:   /* JNBb/JAEb (jump not below) */
   21894    case 0x84:   /* JZb/JEb (jump zero) */
   21895    case 0x85:   /* JNZb/JNEb (jump not zero) */
   21896    case 0x86:   /* JBEb/JNAb (jump below or equal) */
   21897    case 0x87:   /* JNBEb/JAb (jump not below or equal) */
   21898    case 0x88:   /* JSb (jump negative) */
   21899    case 0x89:   /* JSb (jump not negative) */
   21900    case 0x8A:   /* JP (jump parity even) */
   21901    case 0x8B:   /* JNP/JPO (jump parity odd) */
   21902    case 0x8C:   /* JLb/JNGEb (jump less) */
   21903    case 0x8D:   /* JGEb/JNLb (jump greater or equal) */
   21904    case 0x8E:   /* JLEb/JNGb (jump less or equal) */
   21905    case 0x8F: { /* JGb/JNLEb (jump greater) */
   21906       Long   jmpDelta;
   21907       const HChar* comment  = "";
   21908       if (haveF3(pfx)) goto decode_failure;
   21909       if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
   21910       jmpDelta = getSDisp32(delta);
   21911       d64 = (guest_RIP_bbstart+delta+4) + jmpDelta;
   21912       delta += 4;
   21913       if (resteerCisOk
   21914           && vex_control.guest_chase_cond
   21915           && (Addr64)d64 != (Addr64)guest_RIP_bbstart
   21916           && jmpDelta < 0
   21917           && resteerOkFn( callback_opaque, (Addr64)d64) ) {
   21918          /* Speculation: assume this backward branch is taken.  So
   21919             we need to emit a side-exit to the insn following this
   21920             one, on the negation of the condition, and continue at
   21921             the branch target address (d64).  If we wind up back at
   21922             the first instruction of the trace, just stop; it's
   21923             better to let the IR loop unroller handle that case. */
   21924          stmt( IRStmt_Exit(
   21925                   mk_amd64g_calculate_condition(
   21926                      (AMD64Condcode)(1 ^ (opc - 0x80))),
   21927                   Ijk_Boring,
   21928                   IRConst_U64(guest_RIP_bbstart+delta),
   21929                   OFFB_RIP
   21930              ));
   21931          dres->whatNext   = Dis_ResteerC;
   21932          dres->continueAt = d64;
   21933          comment = "(assumed taken)";
   21934       }
   21935       else
   21936       if (resteerCisOk
   21937           && vex_control.guest_chase_cond
   21938           && (Addr64)d64 != (Addr64)guest_RIP_bbstart
   21939           && jmpDelta >= 0
   21940           && resteerOkFn( callback_opaque, guest_RIP_bbstart+delta ) ) {
   21941          /* Speculation: assume this forward branch is not taken.
   21942             So we need to emit a side-exit to d64 (the dest) and
   21943             continue disassembling at the insn immediately
   21944             following this one. */
   21945          stmt( IRStmt_Exit(
   21946                   mk_amd64g_calculate_condition((AMD64Condcode)
   21947                                                 (opc - 0x80)),
   21948                   Ijk_Boring,
   21949                   IRConst_U64(d64),
   21950                   OFFB_RIP
   21951              ));
   21952          dres->whatNext   = Dis_ResteerC;
   21953          dres->continueAt = guest_RIP_bbstart+delta;
   21954          comment = "(assumed not taken)";
   21955       }
   21956       else {
   21957          /* Conservative default translation - end the block at
   21958             this point. */
   21959          jcc_01( dres, (AMD64Condcode)(opc - 0x80),
   21960                  guest_RIP_bbstart+delta, d64 );
   21961          vassert(dres->whatNext == Dis_StopHere);
   21962       }
   21963       DIP("j%s-32 0x%llx %s\n", name_AMD64Condcode(opc - 0x80), (ULong)d64,
   21964           comment);
   21965       return delta;
   21966    }
   21967 
   21968    case 0x90:
   21969    case 0x91:
   21970    case 0x92: /* set-Bb/set-NAEb (set if below) */
   21971    case 0x93: /* set-NBb/set-AEb (set if not below) */
   21972    case 0x94: /* set-Zb/set-Eb (set if zero) */
   21973    case 0x95: /* set-NZb/set-NEb (set if not zero) */
   21974    case 0x96: /* set-BEb/set-NAb (set if below or equal) */
   21975    case 0x97: /* set-NBEb/set-Ab (set if not below or equal) */
   21976    case 0x98: /* set-Sb (set if negative) */
   21977    case 0x99: /* set-Sb (set if not negative) */
   21978    case 0x9A: /* set-P (set if parity even) */
   21979    case 0x9B: /* set-NP (set if parity odd) */
   21980    case 0x9C: /* set-Lb/set-NGEb (set if less) */
   21981    case 0x9D: /* set-GEb/set-NLb (set if greater or equal) */
   21982    case 0x9E: /* set-LEb/set-NGb (set if less or equal) */
   21983    case 0x9F: /* set-Gb/set-NLEb (set if greater) */
   21984       if (haveF2orF3(pfx)) goto decode_failure;
   21985       t1 = newTemp(Ity_I8);
   21986       assign( t1, unop(Iop_1Uto8,mk_amd64g_calculate_condition(opc-0x90)) );
   21987       modrm = getUChar(delta);
   21988       if (epartIsReg(modrm)) {
   21989          delta++;
   21990          putIRegE(1, pfx, modrm, mkexpr(t1));
   21991          DIP("set%s %s\n", name_AMD64Condcode(opc-0x90),
   21992                            nameIRegE(1,pfx,modrm));
   21993       } else {
   21994          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   21995          delta += alen;
   21996          storeLE( mkexpr(addr), mkexpr(t1) );
   21997          DIP("set%s %s\n", name_AMD64Condcode(opc-0x90), dis_buf);
   21998       }
   21999       return delta;
   22000 
   22001    case 0x1A:
   22002    case 0x1B: { /* Future MPX instructions, currently NOPs.
   22003                    BNDMK b, m     F3 0F 1B
   22004                    BNDCL b, r/m   F3 0F 1A
   22005                    BNDCU b, r/m   F2 0F 1A
   22006                    BNDCN b, r/m   F2 0F 1B
   22007                    BNDMOV b, b/m  66 0F 1A
   22008                    BNDMOV b/m, b  66 0F 1B
   22009                    BNDLDX b, mib     0F 1A
   22010                    BNDSTX mib, b     0F 1B */
   22011 
   22012       /* All instructions have two operands. One operand is always the
   22013          bnd register number (bnd0-bnd3, other register numbers are
   22014          ignored when MPX isn't enabled, but should generate an
   22015          exception if MPX is enabled) given by gregOfRexRM. The other
   22016          operand is either a ModRM:reg, ModRM:r/m or a SIB encoded
   22017          address, all of which can be decoded by using either
   22018          eregOfRexRM or disAMode. */
   22019 
   22020       modrm = getUChar(delta);
   22021       int bnd = gregOfRexRM(pfx,modrm);
   22022       const HChar *oper;
   22023       if (epartIsReg(modrm)) {
   22024          oper = nameIReg64 (eregOfRexRM(pfx,modrm));
   22025          delta += 1;
   22026       } else {
   22027          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   22028          delta += alen;
   22029          oper = dis_buf;
   22030       }
   22031 
   22032       if (haveF3no66noF2 (pfx)) {
   22033          if (opc == 0x1B) {
   22034             DIP ("bndmk %s, %%bnd%d\n", oper, bnd);
   22035          } else /* opc == 0x1A */ {
   22036             DIP ("bndcl %s, %%bnd%d\n", oper, bnd);
   22037          }
   22038       } else if (haveF2no66noF3 (pfx)) {
   22039          if (opc == 0x1A) {
   22040             DIP ("bndcu %s, %%bnd%d\n", oper, bnd);
   22041          } else /* opc == 0x1B */ {
   22042             DIP ("bndcn %s, %%bnd%d\n", oper, bnd);
   22043          }
   22044       } else if (have66noF2noF3 (pfx)) {
   22045          if (opc == 0x1A) {
   22046             DIP ("bndmov %s, %%bnd%d\n", oper, bnd);
   22047          } else /* opc == 0x1B */ {
   22048             DIP ("bndmov %%bnd%d, %s\n", bnd, oper);
   22049          }
   22050       } else if (haveNo66noF2noF3 (pfx)) {
   22051          if (opc == 0x1A) {
   22052             DIP ("bndldx %s, %%bnd%d\n", oper, bnd);
   22053          } else /* opc == 0x1B */ {
   22054             DIP ("bndstx %%bnd%d, %s\n", bnd, oper);
   22055          }
   22056       } else goto decode_failure;
   22057 
   22058       return delta;
   22059    }
   22060 
   22061    case 0xA2: { /* CPUID */
   22062       /* Uses dirty helper:
   22063             void amd64g_dirtyhelper_CPUID ( VexGuestAMD64State* )
   22064          declared to mod rax, wr rbx, rcx, rdx
   22065       */
   22066       IRDirty*     d     = NULL;
   22067       const HChar* fName = NULL;
   22068       void*        fAddr = NULL;
   22069 
   22070       if (haveF2orF3(pfx)) goto decode_failure;
   22071 
   22072       /* This isn't entirely correct, CPUID should depend on the VEX
   22073          capabilities, not on the underlying CPU. See bug #324882. */
   22074       if ((archinfo->hwcaps & VEX_HWCAPS_AMD64_SSE3) &&
   22075           (archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16) &&
   22076           (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX2)) {
   22077          fName = "amd64g_dirtyhelper_CPUID_avx2";
   22078          fAddr = &amd64g_dirtyhelper_CPUID_avx2;
   22079          /* This is a Core-i7-4910-like machine */
   22080       }
   22081       else if ((archinfo->hwcaps & VEX_HWCAPS_AMD64_SSE3) &&
   22082                (archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16) &&
   22083                (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
   22084          fName = "amd64g_dirtyhelper_CPUID_avx_and_cx16";
   22085          fAddr = &amd64g_dirtyhelper_CPUID_avx_and_cx16;
   22086          /* This is a Core-i5-2300-like machine */
   22087       }
   22088       else if ((archinfo->hwcaps & VEX_HWCAPS_AMD64_SSE3) &&
   22089                (archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16)) {
   22090          fName = "amd64g_dirtyhelper_CPUID_sse42_and_cx16";
   22091          fAddr = &amd64g_dirtyhelper_CPUID_sse42_and_cx16;
   22092          /* This is a Core-i5-670-like machine */
   22093       }
   22094       else {
   22095          /* Give a CPUID for at least a baseline machine, SSE2
   22096             only, and no CX16 */
   22097          fName = "amd64g_dirtyhelper_CPUID_baseline";
   22098          fAddr = &amd64g_dirtyhelper_CPUID_baseline;
   22099       }
   22100 
   22101       vassert(fName); vassert(fAddr);
   22102       d = unsafeIRDirty_0_N ( 0/*regparms*/,
   22103                               fName, fAddr, mkIRExprVec_1(IRExpr_GSPTR()) );
   22104       /* declare guest state effects */
   22105       d->nFxState = 4;
   22106       vex_bzero(&d->fxState, sizeof(d->fxState));
   22107       d->fxState[0].fx     = Ifx_Modify;
   22108       d->fxState[0].offset = OFFB_RAX;
   22109       d->fxState[0].size   = 8;
   22110       d->fxState[1].fx     = Ifx_Write;
   22111       d->fxState[1].offset = OFFB_RBX;
   22112       d->fxState[1].size   = 8;
   22113       d->fxState[2].fx     = Ifx_Modify;
   22114       d->fxState[2].offset = OFFB_RCX;
   22115       d->fxState[2].size   = 8;
   22116       d->fxState[3].fx     = Ifx_Write;
   22117       d->fxState[3].offset = OFFB_RDX;
   22118       d->fxState[3].size   = 8;
   22119       /* execute the dirty call, side-effecting guest state */
   22120       stmt( IRStmt_Dirty(d) );
   22121       /* CPUID is a serialising insn.  So, just in case someone is
   22122          using it as a memory fence ... */
   22123       stmt( IRStmt_MBE(Imbe_Fence) );
   22124       DIP("cpuid\n");
   22125       return delta;
   22126    }
   22127 
   22128    case 0xA3: { /* BT Gv,Ev */
   22129       /* We let dis_bt_G_E decide whether F2 or F3 are allowable. */
   22130       Bool ok = True;
   22131       if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
   22132       delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpNone, &ok );
   22133       if (!ok) goto decode_failure;
   22134       return delta;
   22135    }
   22136 
   22137    case 0xA4: /* SHLDv imm8,Gv,Ev */
   22138       modrm = getUChar(delta);
   22139       d64   = delta + lengthAMode(pfx, delta);
   22140       vex_sprintf(dis_buf, "$%d", (Int)getUChar(d64));
   22141       delta = dis_SHLRD_Gv_Ev (
   22142                  vbi, pfx, delta, modrm, sz,
   22143                  mkU8(getUChar(d64)), True, /* literal */
   22144                  dis_buf, True /* left */ );
   22145       return delta;
   22146 
   22147    case 0xA5: /* SHLDv %cl,Gv,Ev */
   22148       modrm = getUChar(delta);
   22149       delta = dis_SHLRD_Gv_Ev (
   22150                  vbi, pfx, delta, modrm, sz,
   22151                  getIRegCL(), False, /* not literal */
   22152                  "%cl", True /* left */ );
   22153       return delta;
   22154 
   22155    case 0xAB: { /* BTS Gv,Ev */
   22156       /* We let dis_bt_G_E decide whether F2 or F3 are allowable. */
   22157       Bool ok = True;
   22158       if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
   22159       delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpSet, &ok );
   22160       if (!ok) goto decode_failure;
   22161       return delta;
   22162    }
   22163 
   22164    case 0xAC: /* SHRDv imm8,Gv,Ev */
   22165       modrm = getUChar(delta);
   22166       d64   = delta + lengthAMode(pfx, delta);
   22167       vex_sprintf(dis_buf, "$%d", (Int)getUChar(d64));
   22168       delta = dis_SHLRD_Gv_Ev (
   22169                  vbi, pfx, delta, modrm, sz,
   22170                  mkU8(getUChar(d64)), True, /* literal */
   22171                  dis_buf, False /* right */ );
   22172       return delta;
   22173 
   22174    case 0xAD: /* SHRDv %cl,Gv,Ev */
   22175       modrm = getUChar(delta);
   22176       delta = dis_SHLRD_Gv_Ev (
   22177                  vbi, pfx, delta, modrm, sz,
   22178                  getIRegCL(), False, /* not literal */
   22179                  "%cl", False /* right */);
   22180       return delta;
   22181 
   22182    case 0xAF: /* IMUL Ev, Gv */
   22183       if (haveF2orF3(pfx)) goto decode_failure;
   22184       delta = dis_mul_E_G ( vbi, pfx, sz, delta );
   22185       return delta;
   22186 
   22187    case 0xB0: { /* CMPXCHG Gb,Eb */
   22188       Bool ok = True;
   22189       /* We let dis_cmpxchg_G_E decide whether F2 or F3 are allowable. */
   22190       delta = dis_cmpxchg_G_E ( &ok, vbi, pfx, 1, delta );
   22191       if (!ok) goto decode_failure;
   22192       return delta;
   22193    }
   22194 
   22195    case 0xB1: { /* CMPXCHG Gv,Ev (allowed in 16,32,64 bit) */
   22196       Bool ok = True;
   22197       /* We let dis_cmpxchg_G_E decide whether F2 or F3 are allowable. */
   22198       if (sz != 2 && sz != 4 && sz != 8) goto decode_failure;
   22199       delta = dis_cmpxchg_G_E ( &ok, vbi, pfx, sz, delta );
   22200       if (!ok) goto decode_failure;
   22201       return delta;
   22202    }
   22203 
   22204    case 0xB3: { /* BTR Gv,Ev */
   22205       /* We let dis_bt_G_E decide whether F2 or F3 are allowable. */
   22206       Bool ok = True;
   22207       if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
   22208       delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpReset, &ok );
   22209       if (!ok) goto decode_failure;
   22210       return delta;
   22211    }
   22212 
   22213    case 0xB6: /* MOVZXb Eb,Gv */
   22214       if (haveF2orF3(pfx)) goto decode_failure;
   22215       if (sz != 2 && sz != 4 && sz != 8)
   22216          goto decode_failure;
   22217       delta = dis_movx_E_G ( vbi, pfx, delta, 1, sz, False );
   22218       return delta;
   22219 
   22220    case 0xB7: /* MOVZXw Ew,Gv */
   22221       if (haveF2orF3(pfx)) goto decode_failure;
   22222       if (sz != 4 && sz != 8)
   22223          goto decode_failure;
   22224       delta = dis_movx_E_G ( vbi, pfx, delta, 2, sz, False );
   22225       return delta;
   22226 
   22227    case 0xBA: { /* Grp8 Ib,Ev */
   22228       /* We let dis_Grp8_Imm decide whether F2 or F3 are allowable. */
   22229       Bool decode_OK = False;
   22230       modrm = getUChar(delta);
   22231       am_sz = lengthAMode(pfx,delta);
   22232       d64   = getSDisp8(delta + am_sz);
   22233       delta = dis_Grp8_Imm ( vbi, pfx, delta, modrm, am_sz, sz, d64,
   22234                              &decode_OK );
   22235       if (!decode_OK)
   22236          goto decode_failure;
   22237       return delta;
   22238    }
   22239 
   22240    case 0xBB: { /* BTC Gv,Ev */
   22241       /* We let dis_bt_G_E decide whether F2 or F3 are allowable. */
   22242       Bool ok = False;
   22243       if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
   22244       delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpComp, &ok );
   22245       if (!ok) goto decode_failure;
   22246       return delta;
   22247    }
   22248 
   22249    case 0xBC: /* BSF Gv,Ev */
   22250       if (!haveF2orF3(pfx)
   22251           || (haveF3noF2(pfx)
   22252               && 0 == (archinfo->hwcaps & VEX_HWCAPS_AMD64_BMI))) {
   22253          /* no-F2 no-F3 0F BC = BSF
   22254                   or F3 0F BC = REP; BSF on older CPUs.  */
   22255          delta = dis_bs_E_G ( vbi, pfx, sz, delta, True );
   22256          return delta;
   22257       }
   22258       /* Fall through, since F3 0F BC is TZCNT, and needs to
   22259          be handled by dis_ESC_0F__SSE4. */
   22260       break;
   22261 
   22262    case 0xBD: /* BSR Gv,Ev */
   22263       if (!haveF2orF3(pfx)
   22264           || (haveF3noF2(pfx)
   22265               && 0 == (archinfo->hwcaps & VEX_HWCAPS_AMD64_LZCNT))) {
   22266          /* no-F2 no-F3 0F BD = BSR
   22267                   or F3 0F BD = REP; BSR on older CPUs.  */
   22268          delta = dis_bs_E_G ( vbi, pfx, sz, delta, False );
   22269          return delta;
   22270       }
   22271       /* Fall through, since F3 0F BD is LZCNT, and needs to
   22272          be handled by dis_ESC_0F__SSE4. */
   22273       break;
   22274 
   22275    case 0xBE: /* MOVSXb Eb,Gv */
   22276       if (haveF2orF3(pfx)) goto decode_failure;
   22277       if (sz != 2 && sz != 4 && sz != 8)
   22278          goto decode_failure;
   22279       delta = dis_movx_E_G ( vbi, pfx, delta, 1, sz, True );
   22280       return delta;
   22281 
   22282    case 0xBF: /* MOVSXw Ew,Gv */
   22283       if (haveF2orF3(pfx)) goto decode_failure;
   22284       if (sz != 4 && sz != 8)
   22285          goto decode_failure;
   22286       delta = dis_movx_E_G ( vbi, pfx, delta, 2, sz, True );
   22287       return delta;
   22288 
   22289    case 0xC0: { /* XADD Gb,Eb */
   22290       Bool decode_OK = False;
   22291       delta = dis_xadd_G_E ( &decode_OK, vbi, pfx, 1, delta );
   22292       if (!decode_OK)
   22293          goto decode_failure;
   22294       return delta;
   22295    }
   22296 
   22297    case 0xC1: { /* XADD Gv,Ev */
   22298       Bool decode_OK = False;
   22299       delta = dis_xadd_G_E ( &decode_OK, vbi, pfx, sz, delta );
   22300       if (!decode_OK)
   22301          goto decode_failure;
   22302       return delta;
   22303    }
   22304 
   22305    case 0xC7: { /* CMPXCHG8B Ev, CMPXCHG16B Ev */
   22306       IRType  elemTy     = sz==4 ? Ity_I32 : Ity_I64;
   22307       IRTemp  expdHi     = newTemp(elemTy);
   22308       IRTemp  expdLo     = newTemp(elemTy);
   22309       IRTemp  dataHi     = newTemp(elemTy);
   22310       IRTemp  dataLo     = newTemp(elemTy);
   22311       IRTemp  oldHi      = newTemp(elemTy);
   22312       IRTemp  oldLo      = newTemp(elemTy);
   22313       IRTemp  flags_old  = newTemp(Ity_I64);
   22314       IRTemp  flags_new  = newTemp(Ity_I64);
   22315       IRTemp  success    = newTemp(Ity_I1);
   22316       IROp    opOR       = sz==4 ? Iop_Or32    : Iop_Or64;
   22317       IROp    opXOR      = sz==4 ? Iop_Xor32   : Iop_Xor64;
   22318       IROp    opCasCmpEQ = sz==4 ? Iop_CasCmpEQ32 : Iop_CasCmpEQ64;
   22319       IRExpr* zero       = sz==4 ? mkU32(0)    : mkU64(0);
   22320       IRTemp expdHi64    = newTemp(Ity_I64);
   22321       IRTemp expdLo64    = newTemp(Ity_I64);
   22322 
   22323       /* Translate this using a DCAS, even if there is no LOCK
   22324          prefix.  Life is too short to bother with generating two
   22325          different translations for the with/without-LOCK-prefix
   22326          cases. */
   22327       *expect_CAS = True;
   22328 
   22329       /* Decode, and generate address. */
   22330       if (have66(pfx)) goto decode_failure;
   22331       if (sz != 4 && sz != 8) goto decode_failure;
   22332       if (sz == 8 && !(archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16))
   22333          goto decode_failure;
   22334       modrm = getUChar(delta);
   22335       if (epartIsReg(modrm)) goto decode_failure;
   22336       if (gregLO3ofRM(modrm) != 1) goto decode_failure;
   22337       if (haveF2orF3(pfx)) {
   22338          /* Since the e-part is memory only, F2 or F3 (one or the
   22339             other) is acceptable if LOCK is also present.  But only
   22340             for cmpxchg8b. */
   22341          if (sz == 8) goto decode_failure;
   22342          if (haveF2andF3(pfx) || !haveLOCK(pfx)) goto decode_failure;
   22343       }
   22344 
   22345       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   22346       delta += alen;
   22347 
   22348       /* cmpxchg16b requires an alignment check. */
   22349       if (sz == 8)
   22350          gen_SEGV_if_not_16_aligned( addr );
   22351 
   22352       /* Get the expected and new values. */
   22353       assign( expdHi64, getIReg64(R_RDX) );
   22354       assign( expdLo64, getIReg64(R_RAX) );
   22355 
   22356       /* These are the correctly-sized expected and new values.
   22357          However, we also get expdHi64/expdLo64 above as 64-bits
   22358          regardless, because we will need them later in the 32-bit
   22359          case (paradoxically). */
   22360       assign( expdHi, sz==4 ? unop(Iop_64to32, mkexpr(expdHi64))
   22361                             : mkexpr(expdHi64) );
   22362       assign( expdLo, sz==4 ? unop(Iop_64to32, mkexpr(expdLo64))
   22363                             : mkexpr(expdLo64) );
   22364       assign( dataHi, sz==4 ? getIReg32(R_RCX) : getIReg64(R_RCX) );
   22365       assign( dataLo, sz==4 ? getIReg32(R_RBX) : getIReg64(R_RBX) );
   22366 
   22367       /* Do the DCAS */
   22368       stmt( IRStmt_CAS(
   22369                mkIRCAS( oldHi, oldLo,
   22370                         Iend_LE, mkexpr(addr),
   22371                         mkexpr(expdHi), mkexpr(expdLo),
   22372                         mkexpr(dataHi), mkexpr(dataLo)
   22373             )));
   22374 
   22375       /* success when oldHi:oldLo == expdHi:expdLo */
   22376       assign( success,
   22377               binop(opCasCmpEQ,
   22378                     binop(opOR,
   22379                           binop(opXOR, mkexpr(oldHi), mkexpr(expdHi)),
   22380                           binop(opXOR, mkexpr(oldLo), mkexpr(expdLo))
   22381                     ),
   22382                     zero
   22383               ));
   22384 
   22385       /* If the DCAS is successful, that is to say oldHi:oldLo ==
   22386          expdHi:expdLo, then put expdHi:expdLo back in RDX:RAX,
   22387          which is where they came from originally.  Both the actual
   22388          contents of these two regs, and any shadow values, are
   22389          unchanged.  If the DCAS fails then we're putting into
   22390          RDX:RAX the value seen in memory. */
   22391       /* Now of course there's a complication in the 32-bit case
   22392          (bah!): if the DCAS succeeds, we need to leave RDX:RAX
   22393          unchanged; but if we use the same scheme as in the 64-bit
   22394          case, we get hit by the standard rule that a write to the
   22395          bottom 32 bits of an integer register zeros the upper 32
   22396          bits.  And so the upper halves of RDX and RAX mysteriously
   22397          become zero.  So we have to stuff back in the original
   22398          64-bit values which we previously stashed in
   22399          expdHi64:expdLo64, even if we're doing a cmpxchg8b. */
   22400       /* It's just _so_ much fun ... */
   22401       putIRegRDX( 8,
   22402                   IRExpr_ITE( mkexpr(success),
   22403                               mkexpr(expdHi64),
   22404                               sz == 4 ? unop(Iop_32Uto64, mkexpr(oldHi))
   22405                                       : mkexpr(oldHi)
   22406                 ));
   22407       putIRegRAX( 8,
   22408                   IRExpr_ITE( mkexpr(success),
   22409                               mkexpr(expdLo64),
   22410                               sz == 4 ? unop(Iop_32Uto64, mkexpr(oldLo))
   22411                                       : mkexpr(oldLo)
   22412                 ));
   22413 
   22414       /* Copy the success bit into the Z flag and leave the others
   22415          unchanged */
   22416       assign( flags_old, widenUto64(mk_amd64g_calculate_rflags_all()));
   22417       assign(
   22418          flags_new,
   22419          binop(Iop_Or64,
   22420                binop(Iop_And64, mkexpr(flags_old),
   22421                                 mkU64(~AMD64G_CC_MASK_Z)),
   22422                binop(Iop_Shl64,
   22423                      binop(Iop_And64,
   22424                            unop(Iop_1Uto64, mkexpr(success)), mkU64(1)),
   22425                      mkU8(AMD64G_CC_SHIFT_Z)) ));
   22426 
   22427       stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   22428       stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(flags_new) ));
   22429       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   22430       /* Set NDEP even though it isn't used.  This makes
   22431          redundant-PUT elimination of previous stores to this field
   22432          work better. */
   22433       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   22434 
   22435       /* Sheesh.  Aren't you glad it was me and not you that had to
   22436          write and validate all this grunge? */
   22437 
   22438       DIP("cmpxchg8b %s\n", dis_buf);
   22439       return delta;
   22440    }
   22441 
   22442    case 0xC8: /* BSWAP %eax */
   22443    case 0xC9:
   22444    case 0xCA:
   22445    case 0xCB:
   22446    case 0xCC:
   22447    case 0xCD:
   22448    case 0xCE:
   22449    case 0xCF: /* BSWAP %edi */
   22450       if (haveF2orF3(pfx)) goto decode_failure;
   22451       /* According to the AMD64 docs, this insn can have size 4 or
   22452          8. */
   22453       if (sz == 4) {
   22454          t1 = newTemp(Ity_I32);
   22455          assign( t1, getIRegRexB(4, pfx, opc-0xC8) );
   22456          t2 = math_BSWAP( t1, Ity_I32 );
   22457          putIRegRexB(4, pfx, opc-0xC8, mkexpr(t2));
   22458          DIP("bswapl %s\n", nameIRegRexB(4, pfx, opc-0xC8));
   22459          return delta;
   22460       }
   22461       if (sz == 8) {
   22462          t1 = newTemp(Ity_I64);
   22463          t2 = newTemp(Ity_I64);
   22464          assign( t1, getIRegRexB(8, pfx, opc-0xC8) );
   22465          t2 = math_BSWAP( t1, Ity_I64 );
   22466          putIRegRexB(8, pfx, opc-0xC8, mkexpr(t2));
   22467          DIP("bswapq %s\n", nameIRegRexB(8, pfx, opc-0xC8));
   22468          return delta;
   22469       }
   22470       goto decode_failure;
   22471 
   22472    default:
   22473       break;
   22474 
   22475    } /* first switch */
   22476 
   22477 
   22478    /* =-=-=-=-=-=-=-=-= MMXery =-=-=-=-=-=-=-=-= */
   22479    /* In the second switch, pick off MMX insns. */
   22480 
   22481    if (!have66orF2orF3(pfx)) {
   22482       /* So there's no SIMD prefix. */
   22483 
   22484       vassert(sz == 4 || sz == 8);
   22485 
   22486       switch (opc) { /* second switch */
   22487 
   22488       case 0x71:
   22489       case 0x72:
   22490       case 0x73: /* PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
   22491 
   22492       case 0x6E: /* MOVD (src)ireg-or-mem, (dst)mmxreg */
   22493       case 0x7E: /* MOVD (src)mmxreg, (dst)ireg-or-mem */
   22494       case 0x7F: /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
   22495       case 0x6F: /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
   22496 
   22497       case 0xFC:
   22498       case 0xFD:
   22499       case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
   22500 
   22501       case 0xEC:
   22502       case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
   22503 
   22504       case 0xDC:
   22505       case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   22506 
   22507       case 0xF8:
   22508       case 0xF9:
   22509       case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
   22510 
   22511       case 0xE8:
   22512       case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
   22513 
   22514       case 0xD8:
   22515       case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   22516 
   22517       case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
   22518       case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
   22519 
   22520       case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
   22521 
   22522       case 0x74:
   22523       case 0x75:
   22524       case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
   22525 
   22526       case 0x64:
   22527       case 0x65:
   22528       case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
   22529 
   22530       case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
   22531       case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
   22532       case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
   22533 
   22534       case 0x68:
   22535       case 0x69:
   22536       case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
   22537 
   22538       case 0x60:
   22539       case 0x61:
   22540       case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
   22541 
   22542       case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
   22543       case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
   22544       case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
   22545       case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
   22546 
   22547       case 0xF1: /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
   22548       case 0xF2:
   22549       case 0xF3:
   22550 
   22551       case 0xD1: /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
   22552       case 0xD2:
   22553       case 0xD3:
   22554 
   22555       case 0xE1: /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
   22556       case 0xE2: {
   22557          Bool decode_OK = False;
   22558          delta = dis_MMX ( &decode_OK, vbi, pfx, sz, deltaIN );
   22559          if (decode_OK)
   22560             return delta;
   22561          goto decode_failure;
   22562       }
   22563 
   22564       default:
   22565          break;
   22566       } /* second switch */
   22567 
   22568    }
   22569 
   22570    /* A couple of MMX corner cases */
   22571    if (opc == 0x0E/* FEMMS */ || opc == 0x77/* EMMS */) {
   22572       if (sz != 4)
   22573          goto decode_failure;
   22574       do_EMMS_preamble();
   22575       DIP("{f}emms\n");
   22576       return delta;
   22577    }
   22578 
   22579    /* =-=-=-=-=-=-=-=-= SSE2ery =-=-=-=-=-=-=-=-= */
   22580    /* Perhaps it's an SSE or SSE2 instruction.  We can try this
   22581       without checking the guest hwcaps because SSE2 is a baseline
   22582       facility in 64 bit mode. */
   22583    {
   22584       Bool decode_OK = False;
   22585       delta = dis_ESC_0F__SSE2 ( &decode_OK,
   22586                                  archinfo, vbi, pfx, sz, deltaIN, dres );
   22587       if (decode_OK)
   22588          return delta;
   22589    }
   22590 
   22591    /* =-=-=-=-=-=-=-=-= SSE3ery =-=-=-=-=-=-=-=-= */
   22592    /* Perhaps it's a SSE3 instruction.  FIXME: check guest hwcaps
   22593       first. */
   22594    {
   22595       Bool decode_OK = False;
   22596       delta = dis_ESC_0F__SSE3 ( &decode_OK, vbi, pfx, sz, deltaIN );
   22597       if (decode_OK)
   22598          return delta;
   22599    }
   22600 
   22601    /* =-=-=-=-=-=-=-=-= SSE4ery =-=-=-=-=-=-=-=-= */
   22602    /* Perhaps it's a SSE4 instruction.  FIXME: check guest hwcaps
   22603       first. */
   22604    {
   22605       Bool decode_OK = False;
   22606       delta = dis_ESC_0F__SSE4 ( &decode_OK,
   22607                                  archinfo, vbi, pfx, sz, deltaIN );
   22608       if (decode_OK)
   22609          return delta;
   22610    }
   22611 
   22612   decode_failure:
   22613    return deltaIN; /* fail */
   22614 }
   22615 
   22616 
   22617 /*------------------------------------------------------------*/
   22618 /*---                                                      ---*/
   22619 /*--- Top-level post-escape decoders: dis_ESC_0F38         ---*/
   22620 /*---                                                      ---*/
   22621 /*------------------------------------------------------------*/
   22622 
/* Decoder for the 0F 38 escape group.  Strategy: first handle the
   few opcodes decoded directly here (MOVBE); then delegate to the
   SSSE3 and SSE4 sub-decoders; finally, restart from the opcode byte
   and try the ADCX/ADOX encodings.  Returns the updated instruction
   offset on success, or deltaIN unchanged to signal decode failure. */
__attribute__((noinline))
static
Long dis_ESC_0F38 (
        /*MB_OUT*/DisResult* dres,
        Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
        Bool         resteerCisOk,
        void*        callback_opaque,
        const VexArchInfo* archinfo,
        const VexAbiInfo*  vbi,
        Prefix pfx, Int sz, Long deltaIN
     )
{
   Long   delta = deltaIN;
   UChar  opc   = getUChar(delta);  /* opcode byte following 0F 38 */
   delta++;
   switch (opc) {

   case 0xF0:   /* 0F 38 F0 = MOVBE m16/32/64(E), r16/32/64(G) */
   case 0xF1: { /* 0F 38 F1 = MOVBE r16/32/64(G), m16/32/64(E) */
      /* MOVBE: move with byte order swapped.  Only memory forms
         exist; F2/F3 and VEX-encoded variants are rejected here and
         fall through to the later decoders. */
      if (!haveF2orF3(pfx) && !haveVEX(pfx)
          && (sz == 2 || sz == 4 || sz == 8)) {
         IRTemp addr  = IRTemp_INVALID;
         UChar  modrm = 0;
         Int    alen  = 0;
         HChar  dis_buf[50];
         modrm = getUChar(delta);
         /* No register-to-register form of MOVBE exists. */
         if (epartIsReg(modrm)) break;
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         IRType ty = szToITy(sz);
         IRTemp src = newTemp(ty);
         if (opc == 0xF0) { /* LOAD */
            /* Load from memory, byte-swap, write to the G register. */
            assign(src, loadLE(ty, mkexpr(addr)));
            IRTemp dst = math_BSWAP(src, ty);
            putIRegG(sz, pfx, modrm, mkexpr(dst));
            DIP("movbe %s,%s\n", dis_buf, nameIRegG(sz, pfx, modrm));
         } else { /* STORE */
            /* Read the G register, byte-swap, store to memory. */
            assign(src, getIRegG(sz, pfx, modrm));
            IRTemp dst = math_BSWAP(src, ty);
            storeLE(mkexpr(addr), mkexpr(dst));
            DIP("movbe %s,%s\n", nameIRegG(sz, pfx, modrm), dis_buf);
         }
         return delta;
      }
      /* else fall through; maybe one of the decoders below knows what
         it is. */
      break;
   }

   default:
      break;
   }

   /* =-=-=-=-=-=-=-=-= SSSE3ery =-=-=-=-=-=-=-=-= */
   /* Perhaps it's an SSSE3 instruction.  FIXME: consult guest hwcaps
      rather than proceeding indiscriminately. */
   {
      Bool decode_OK = False;
      /* Note: the sub-decoder is handed deltaIN, not delta -- it
         re-reads the opcode byte itself. */
      delta = dis_ESC_0F38__SupSSE3 ( &decode_OK, vbi, pfx, sz, deltaIN );
      if (decode_OK)
         return delta;
   }

   /* =-=-=-=-=-=-=-=-= SSE4ery =-=-=-=-=-=-=-=-= */
   /* Perhaps it's an SSE4 instruction.  FIXME: consult guest hwcaps
      rather than proceeding indiscriminately. */
   {
      Bool decode_OK = False;
      delta = dis_ESC_0F38__SSE4 ( &decode_OK, vbi, pfx, sz, deltaIN );
      if (decode_OK)
         return delta;
   }

   /* Ignore previous decode attempts and restart from the beginning of
      the instruction. */
   delta = deltaIN;
   opc   = getUChar(delta);
   delta++;

   switch (opc) {

   case 0xF6: {
      /* 66 0F 38 F6 = ADCX r32/64(G), m32/64(E) */
      /* F3 0F 38 F6 = ADOX r32/64(G), m32/64(E) */
      /* These were introduced in Broadwell.  Gate them on AVX so as to at
         least reject them on earlier guests.  Has no host requirements. */
      if (have66noF2noF3(pfx) && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
         if (sz == 2) {
            sz = 4; /* 66 prefix but operand size is 4/8 */
         }
         /* ADCX: add with carry-in/out through CF only, leaving the
            other flags untouched. */
         delta = dis_op2_E_G ( vbi, pfx, Iop_Add8, WithFlagCarryX, True,
                               sz, delta, "adcx" );
         return delta;
      }
      if (haveF3no66noF2(pfx) && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
         /* ADOX: like ADCX, but chains through OF instead of CF. */
         delta = dis_op2_E_G ( vbi, pfx, Iop_Add8, WithFlagOverX, True,
                               sz, delta, "adox" );
         return delta;
      }
      /* else fall through */
      break;
   }

   default:
      break;
   }

  /*decode_failure:*/
   return deltaIN; /* fail */
}
   22733 
   22734 
   22735 /*------------------------------------------------------------*/
   22736 /*---                                                      ---*/
   22737 /*--- Top-level post-escape decoders: dis_ESC_0F3A         ---*/
   22738 /*---                                                      ---*/
   22739 /*------------------------------------------------------------*/
   22740 
   22741 __attribute__((noinline))
   22742 static
   22743 Long dis_ESC_0F3A (
   22744         /*MB_OUT*/DisResult* dres,
   22745         Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
   22746         Bool         resteerCisOk,
   22747         void*        callback_opaque,
   22748         const VexArchInfo* archinfo,
   22749         const VexAbiInfo*  vbi,
   22750         Prefix pfx, Int sz, Long deltaIN
   22751      )
   22752 {
   22753    Long   delta = deltaIN;
   22754    UChar  opc   = getUChar(delta);
   22755    delta++;
   22756    switch (opc) {
   22757 
   22758    default:
   22759       break;
   22760 
   22761    }
   22762 
   22763    /* =-=-=-=-=-=-=-=-= SSSE3ery =-=-=-=-=-=-=-=-= */
   22764    /* Perhaps it's an SSSE3 instruction.  FIXME: consult guest hwcaps
   22765       rather than proceeding indiscriminately. */
   22766    {
   22767       Bool decode_OK = False;
   22768       delta = dis_ESC_0F3A__SupSSE3 ( &decode_OK, vbi, pfx, sz, deltaIN );
   22769       if (decode_OK)
   22770          return delta;
   22771    }
   22772 
   22773    /* =-=-=-=-=-=-=-=-= SSE4ery =-=-=-=-=-=-=-=-= */
   22774    /* Perhaps it's an SSE4 instruction.  FIXME: consult guest hwcaps
   22775       rather than proceeding indiscriminately. */
   22776    {
   22777       Bool decode_OK = False;
   22778       delta = dis_ESC_0F3A__SSE4 ( &decode_OK, vbi, pfx, sz, deltaIN );
   22779       if (decode_OK)
   22780          return delta;
   22781    }
   22782 
   22783    return deltaIN; /* fail */
   22784 }
   22785 
   22786 
   22787 /*------------------------------------------------------------*/
   22788 /*---                                                      ---*/
   22789 /*--- Top-level post-escape decoders: dis_ESC_0F__VEX      ---*/
   22790 /*---                                                      ---*/
   22791 /*------------------------------------------------------------*/
   22792 
/* FIXME: common up with the _256_ version below? */
/* Decode a VEX.NDS 128-bit 3-operand insn: xmm_D = op(xmm_SL, E),
   where E (the right source) is an xmm register or a 128-bit memory
   operand, and SL (the left source) comes from the VEX.vvvv field.
   Exactly one of 'op' (a plain IROp) or 'opFn' (an IR generator
   taking two V128 temps) must be supplied; the other must be
   Iop_INVALID / NULL respectively.  'invertLeftArg' applies a bitwise
   NOT to the left source first; 'swapArgs' exchanges the two sources.
   The upper 128 bits of the destination YMM register are zeroed, per
   VEX semantics.  Returns the updated instruction offset and sets
   *uses_vvvv since vvvv is consumed. */
static
Long dis_VEX_NDS_128_AnySimdPfx_0F_WIG (
        /*OUT*/Bool* uses_vvvv, const VexAbiInfo* vbi,
        Prefix pfx, Long delta, const HChar* name,
        /* The actual operation.  Use either 'op' or 'opfn',
           but not both. */
        IROp op, IRTemp(*opFn)(IRTemp,IRTemp),
        Bool invertLeftArg,
        Bool swapArgs
     )
{
   UChar  modrm = getUChar(delta);
   UInt   rD    = gregOfRexRM(pfx, modrm);   /* destination register */
   UInt   rSL   = getVexNvvvv(pfx);          /* left source, from vvvv */
   IRTemp tSL   = newTemp(Ity_V128);
   IRTemp tSR   = newTemp(Ity_V128);
   IRTemp addr  = IRTemp_INVALID;
   HChar  dis_buf[50];
   Int    alen  = 0;
   /* Caller guarantees VEX.L=0 (128-bit) and REX.W ignored (WIG). */
   vassert(0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*WIG?*/);

   assign(tSL, invertLeftArg ? unop(Iop_NotV128, getXMMReg(rSL))
                             : getXMMReg(rSL));

   /* Fetch the right source: register or memory form of E. */
   if (epartIsReg(modrm)) {
      UInt rSR = eregOfRexRM(pfx, modrm);
      delta += 1;
      assign(tSR, getXMMReg(rSR));
      DIP("%s %s,%s,%s\n",
          name, nameXMMReg(rSR), nameXMMReg(rSL), nameXMMReg(rD));
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      delta += alen;
      assign(tSR, loadLE(Ity_V128, mkexpr(addr)));
      DIP("%s %s,%s,%s\n",
          name, dis_buf, nameXMMReg(rSL), nameXMMReg(rD));
   }

   /* Compute the result, via 'op' or 'opFn' as supplied. */
   IRTemp res = IRTemp_INVALID;
   if (op != Iop_INVALID) {
      vassert(opFn == NULL);
      res = newTemp(Ity_V128);
      if (requiresRMode(op)) {
         /* FP op: needs a rounding mode as the first argument. */
         IRTemp rm = newTemp(Ity_I32);
         assign(rm, get_FAKE_roundingmode()); /* XXXROUNDINGFIXME */
         assign(res, swapArgs
                        ? triop(op, mkexpr(rm), mkexpr(tSR), mkexpr(tSL))
                        : triop(op, mkexpr(rm), mkexpr(tSL), mkexpr(tSR)));
      } else {
         assign(res, swapArgs
                        ? binop(op, mkexpr(tSR), mkexpr(tSL))
                        : binop(op, mkexpr(tSL), mkexpr(tSR)));
      }
   } else {
      vassert(opFn != NULL);
      res = swapArgs ? opFn(tSR, tSL) : opFn(tSL, tSR);
   }

   /* Write low 128 bits and zero the upper YMM half. */
   putYMMRegLoAndZU(rD, mkexpr(res));

   *uses_vvvv = True;
   return delta;
}
   22857 
   22858 
   22859 /* Handle a VEX_NDS_128_66_0F_WIG (3-addr) insn, with a simple IROp
   22860    for the operation, no inversion of the left arg, and no swapping of
   22861    args. */
   22862 static
   22863 Long dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple (
   22864         /*OUT*/Bool* uses_vvvv, const VexAbiInfo* vbi,
   22865         Prefix pfx, Long delta, const HChar* name,
   22866         IROp op
   22867      )
   22868 {
   22869    return dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   22870              uses_vvvv, vbi, pfx, delta, name, op, NULL, False, False);
   22871 }
   22872 
   22873 
   22874 /* Handle a VEX_NDS_128_66_0F_WIG (3-addr) insn, using the given IR
   22875    generator to compute the result, no inversion of the left
   22876    arg, and no swapping of args. */
   22877 static
   22878 Long dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex (
   22879         /*OUT*/Bool* uses_vvvv, const VexAbiInfo* vbi,
   22880         Prefix pfx, Long delta, const HChar* name,
   22881         IRTemp(*opFn)(IRTemp,IRTemp)
   22882      )
   22883 {
   22884    return dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   22885              uses_vvvv, vbi, pfx, delta, name,
   22886              Iop_INVALID, opFn, False, False );
   22887 }
   22888 
   22889 
   22890 /* Vector by scalar shift of V by the amount specified at the bottom
   22891    of E. */
   22892 static ULong dis_AVX128_shiftV_byE ( const VexAbiInfo* vbi,
   22893                                      Prefix pfx, Long delta,
   22894                                      const HChar* opname, IROp op )
   22895 {
   22896    HChar   dis_buf[50];
   22897    Int     alen, size;
   22898    IRTemp  addr;
   22899    Bool    shl, shr, sar;
   22900    UChar   modrm = getUChar(delta);
   22901    UInt    rG    = gregOfRexRM(pfx,modrm);
   22902    UInt    rV    = getVexNvvvv(pfx);;
   22903    IRTemp  g0    = newTemp(Ity_V128);
   22904    IRTemp  g1    = newTemp(Ity_V128);
   22905    IRTemp  amt   = newTemp(Ity_I64);
   22906    IRTemp  amt8  = newTemp(Ity_I8);
   22907    if (epartIsReg(modrm)) {
   22908       UInt rE = eregOfRexRM(pfx,modrm);
   22909       assign( amt, getXMMRegLane64(rE, 0) );
   22910       DIP("%s %s,%s,%s\n", opname, nameXMMReg(rE),
   22911           nameXMMReg(rV), nameXMMReg(rG) );
   22912       delta++;
   22913    } else {
   22914       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   22915       assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
   22916       DIP("%s %s,%s,%s\n", opname, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
   22917       delta += alen;
   22918    }
   22919    assign( g0, getXMMReg(rV) );
   22920    assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
   22921 
   22922    shl = shr = sar = False;
   22923    size = 0;
   22924    switch (op) {
   22925       case Iop_ShlN16x8: shl = True; size = 32; break;
   22926       case Iop_ShlN32x4: shl = True; size = 32; break;
   22927       case Iop_ShlN64x2: shl = True; size = 64; break;
   22928       case Iop_SarN16x8: sar = True; size = 16; break;
   22929       case Iop_SarN32x4: sar = True; size = 32; break;
   22930       case Iop_ShrN16x8: shr = True; size = 16; break;
   22931       case Iop_ShrN32x4: shr = True; size = 32; break;
   22932       case Iop_ShrN64x2: shr = True; size = 64; break;
   22933       default: vassert(0);
   22934    }
   22935 
   22936    if (shl || shr) {
   22937      assign(
   22938         g1,
   22939         IRExpr_ITE(
   22940            binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
   22941            binop(op, mkexpr(g0), mkexpr(amt8)),
   22942            mkV128(0x0000)
   22943         )
   22944      );
   22945    } else
   22946    if (sar) {
   22947      assign(
   22948         g1,
   22949         IRExpr_ITE(
   22950            binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
   22951            binop(op, mkexpr(g0), mkexpr(amt8)),
   22952            binop(op, mkexpr(g0), mkU8(size-1))
   22953         )
   22954      );
   22955    } else {
   22956       vassert(0);
   22957    }
   22958 
   22959    putYMMRegLoAndZU( rG, mkexpr(g1) );
   22960    return delta;
   22961 }
   22962 
   22963 
   22964 /* Vector by scalar shift of V by the amount specified at the bottom
   22965    of E. */
   22966 static ULong dis_AVX256_shiftV_byE ( const VexAbiInfo* vbi,
   22967                                      Prefix pfx, Long delta,
   22968                                      const HChar* opname, IROp op )
   22969 {
   22970    HChar   dis_buf[50];
   22971    Int     alen, size;
   22972    IRTemp  addr;
   22973    Bool    shl, shr, sar;
   22974    UChar   modrm = getUChar(delta);
   22975    UInt    rG    = gregOfRexRM(pfx,modrm);
   22976    UInt    rV    = getVexNvvvv(pfx);;
   22977    IRTemp  g0    = newTemp(Ity_V256);
   22978    IRTemp  g1    = newTemp(Ity_V256);
   22979    IRTemp  amt   = newTemp(Ity_I64);
   22980    IRTemp  amt8  = newTemp(Ity_I8);
   22981    if (epartIsReg(modrm)) {
   22982       UInt rE = eregOfRexRM(pfx,modrm);
   22983       assign( amt, getXMMRegLane64(rE, 0) );
   22984       DIP("%s %s,%s,%s\n", opname, nameXMMReg(rE),
   22985           nameYMMReg(rV), nameYMMReg(rG) );
   22986       delta++;
   22987    } else {
   22988       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   22989       assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
   22990       DIP("%s %s,%s,%s\n", opname, dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
   22991       delta += alen;
   22992    }
   22993    assign( g0, getYMMReg(rV) );
   22994    assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
   22995 
   22996    shl = shr = sar = False;
   22997    size = 0;
   22998    switch (op) {
   22999       case Iop_ShlN16x16: shl = True; size = 32; break;
   23000       case Iop_ShlN32x8:  shl = True; size = 32; break;
   23001       case Iop_ShlN64x4:  shl = True; size = 64; break;
   23002       case Iop_SarN16x16: sar = True; size = 16; break;
   23003       case Iop_SarN32x8:  sar = True; size = 32; break;
   23004       case Iop_ShrN16x16: shr = True; size = 16; break;
   23005       case Iop_ShrN32x8:  shr = True; size = 32; break;
   23006       case Iop_ShrN64x4:  shr = True; size = 64; break;
   23007       default: vassert(0);
   23008    }
   23009 
   23010    if (shl || shr) {
   23011      assign(
   23012         g1,
   23013         IRExpr_ITE(
   23014            binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
   23015            binop(op, mkexpr(g0), mkexpr(amt8)),
   23016            binop(Iop_V128HLtoV256, mkV128(0), mkV128(0))
   23017         )
   23018      );
   23019    } else
   23020    if (sar) {
   23021      assign(
   23022         g1,
   23023         IRExpr_ITE(
   23024            binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
   23025            binop(op, mkexpr(g0), mkexpr(amt8)),
   23026            binop(op, mkexpr(g0), mkU8(size-1))
   23027         )
   23028      );
   23029    } else {
   23030       vassert(0);
   23031    }
   23032 
   23033    putYMMReg( rG, mkexpr(g1) );
   23034    return delta;
   23035 }
   23036 
   23037 
   23038 /* Vector by vector shift of V by the amount specified at the bottom
   23039    of E.  Vector by vector shifts are defined for all shift amounts,
   23040    so not using Iop_S*x* here (and SSE2 doesn't support variable shifts
   23041    anyway).  */
/* Per-lane variable shift (VPSLLV/VPSRLV/VPSRAV family): each lane of
   V is shifted by the count in the corresponding lane of E.  'op' is
   a scalar shift (Iop_Shl32/Shl64/Sar32/Shr32/Shr64) applied lane by
   lane after splitting the vectors; 'isYMM' selects 256- vs 128-bit
   operands.  Counts >= the lane width give 0 (logical) or the
   sign-replicated value (Sar32).  Returns the updated offset. */
static ULong dis_AVX_var_shiftV_byE ( const VexAbiInfo* vbi,
                                      Prefix pfx, Long delta,
                                      const HChar* opname, IROp op, Bool isYMM )
{
   HChar   dis_buf[50];
   Int     alen, size, i;
   IRTemp  addr;
   UChar   modrm = getUChar(delta);
   UInt    rG    = gregOfRexRM(pfx,modrm);
   UInt    rV    = getVexNvvvv(pfx);;
   IRTemp  sV    = isYMM ? newTemp(Ity_V256) : newTemp(Ity_V128);
   IRTemp  amt   = isYMM ? newTemp(Ity_V256) : newTemp(Ity_V128);
   /* Per-lane pieces; only the first 8 (32-bit YMM), 4 (32-bit XMM /
      64-bit YMM) or 2 (64-bit XMM) entries get used. */
   IRTemp  amts[8], sVs[8], res[8];
   /* Fetch the per-lane shift counts from E (reg or memory). */
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( amt, isYMM ? getYMMReg(rE) : getXMMReg(rE) );
      if (isYMM) {
         DIP("%s %s,%s,%s\n", opname, nameYMMReg(rE),
             nameYMMReg(rV), nameYMMReg(rG) );
      } else {
         DIP("%s %s,%s,%s\n", opname, nameXMMReg(rE),
             nameXMMReg(rV), nameXMMReg(rG) );
      }
      delta++;
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( amt, loadLE(isYMM ? Ity_V256 : Ity_V128, mkexpr(addr)) );
      if (isYMM) {
         DIP("%s %s,%s,%s\n", opname, dis_buf, nameYMMReg(rV),
             nameYMMReg(rG) );
      } else {
         DIP("%s %s,%s,%s\n", opname, dis_buf, nameXMMReg(rV),
             nameXMMReg(rG) );
      }
      delta += alen;
   }
   assign( sV, isYMM ? getYMMReg(rV) : getXMMReg(rV) );

   /* Lane width implied by the scalar shift op. */
   size = 0;
   switch (op) {
      case Iop_Shl32: size = 32; break;
      case Iop_Shl64: size = 64; break;
      case Iop_Sar32: size = 32; break;
      case Iop_Shr32: size = 32; break;
      case Iop_Shr64: size = 64; break;
      default: vassert(0);
   }

   /* Mark all lane slots unused, then split source and counts into
      lanes; unused slots stay IRTemp_INVALID. */
   for (i = 0; i < 8; i++) {
      sVs[i] = IRTemp_INVALID;
      amts[i] = IRTemp_INVALID;
   }
   switch (size) {
      case 32:
         if (isYMM) {
            breakupV256to32s( sV, &sVs[7], &sVs[6], &sVs[5], &sVs[4],
                                  &sVs[3], &sVs[2], &sVs[1], &sVs[0] );
            breakupV256to32s( amt, &amts[7], &amts[6], &amts[5], &amts[4],
                                   &amts[3], &amts[2], &amts[1], &amts[0] );
         } else {
            breakupV128to32s( sV, &sVs[3], &sVs[2], &sVs[1], &sVs[0] );
            breakupV128to32s( amt, &amts[3], &amts[2], &amts[1], &amts[0] );
        }
         break;
      case 64:
         if (isYMM) {
            breakupV256to64s( sV, &sVs[3], &sVs[2], &sVs[1], &sVs[0] );
            breakupV256to64s( amt, &amts[3], &amts[2], &amts[1], &amts[0] );
         } else {
            breakupV128to64s( sV, &sVs[1], &sVs[0] );
            breakupV128to64s( amt, &amts[1], &amts[0] );
         }
         break;
      default: vassert(0);
   }
   /* Shift each used lane; out-of-range counts select 0, or the
      all-sign-bits value for Sar32 (shift by size-1). */
   for (i = 0; i < 8; i++)
      if (sVs[i] != IRTemp_INVALID) {
         res[i] = size == 32 ? newTemp(Ity_I32) : newTemp(Ity_I64);
         assign( res[i],
                 IRExpr_ITE(
                    binop(size == 32 ? Iop_CmpLT32U : Iop_CmpLT64U,
                          mkexpr(amts[i]),
                          size == 32 ? mkU32(size) : mkU64(size)),
                    binop(op, mkexpr(sVs[i]),
                               unop(size == 32 ? Iop_32to8 : Iop_64to8,
                                    mkexpr(amts[i]))),
                    op == Iop_Sar32 ? binop(op, mkexpr(sVs[i]), mkU8(size-1))
                                    : size == 32 ? mkU32(0) : mkU64(0)
         ));
      }
   /* Write results lane-wise; for the 128-bit form the upper YMM
      lanes are zeroed (res[i] is not consulted there, since the
      conditional only evaluates the chosen arm). */
   switch (size) {
      case 32:
         for (i = 0; i < 8; i++)
            putYMMRegLane32( rG, i, (i < 4 || isYMM)
                                    ? mkexpr(res[i]) : mkU32(0) );
         break;
      case 64:
         for (i = 0; i < 4; i++)
            putYMMRegLane64( rG, i, (i < 2 || isYMM)
                                    ? mkexpr(res[i]) : mkU64(0) );
         break;
      default: vassert(0);
   }

   return delta;
}
   23148 
   23149 
   23150 /* Vector by scalar shift of E into V, by an immediate byte.  Modified
   23151    version of dis_SSE_shiftE_imm. */
   23152 static
   23153 Long dis_AVX128_shiftE_to_V_imm( Prefix pfx,
   23154                                  Long delta, const HChar* opname, IROp op )
   23155 {
   23156    Bool    shl, shr, sar;
   23157    UChar   rm   = getUChar(delta);
   23158    IRTemp  e0   = newTemp(Ity_V128);
   23159    IRTemp  e1   = newTemp(Ity_V128);
   23160    UInt    rD   = getVexNvvvv(pfx);
   23161    UChar   amt, size;
   23162    vassert(epartIsReg(rm));
   23163    vassert(gregLO3ofRM(rm) == 2
   23164            || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
   23165    amt = getUChar(delta+1);
   23166    delta += 2;
   23167    DIP("%s $%d,%s,%s\n", opname,
   23168                          (Int)amt,
   23169                          nameXMMReg(eregOfRexRM(pfx,rm)),
   23170                          nameXMMReg(rD));
   23171    assign( e0, getXMMReg(eregOfRexRM(pfx,rm)) );
   23172 
   23173    shl = shr = sar = False;
   23174    size = 0;
   23175    switch (op) {
   23176       case Iop_ShlN16x8: shl = True; size = 16; break;
   23177       case Iop_ShlN32x4: shl = True; size = 32; break;
   23178       case Iop_ShlN64x2: shl = True; size = 64; break;
   23179       case Iop_SarN16x8: sar = True; size = 16; break;
   23180       case Iop_SarN32x4: sar = True; size = 32; break;
   23181       case Iop_ShrN16x8: shr = True; size = 16; break;
   23182       case Iop_ShrN32x4: shr = True; size = 32; break;
   23183       case Iop_ShrN64x2: shr = True; size = 64; break;
   23184       default: vassert(0);
   23185    }
   23186 
   23187    if (shl || shr) {
   23188      assign( e1, amt >= size
   23189                     ? mkV128(0x0000)
   23190                     : binop(op, mkexpr(e0), mkU8(amt))
   23191      );
   23192    } else
   23193    if (sar) {
   23194      assign( e1, amt >= size
   23195                     ? binop(op, mkexpr(e0), mkU8(size-1))
   23196                     : binop(op, mkexpr(e0), mkU8(amt))
   23197      );
   23198    } else {
   23199       vassert(0);
   23200    }
   23201 
   23202    putYMMRegLoAndZU( rD, mkexpr(e1) );
   23203    return delta;
   23204 }
   23205 
   23206 
   23207 /* Vector by scalar shift of E into V, by an immediate byte.  Modified
   23208    version of dis_AVX128_shiftE_to_V_imm. */
   23209 static
   23210 Long dis_AVX256_shiftE_to_V_imm( Prefix pfx,
   23211                                  Long delta, const HChar* opname, IROp op )
   23212 {
   23213    Bool    shl, shr, sar;
   23214    UChar   rm   = getUChar(delta);
   23215    IRTemp  e0   = newTemp(Ity_V256);
   23216    IRTemp  e1   = newTemp(Ity_V256);
   23217    UInt    rD   = getVexNvvvv(pfx);
   23218    UChar   amt, size;
   23219    vassert(epartIsReg(rm));
   23220    vassert(gregLO3ofRM(rm) == 2
   23221            || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
   23222    amt = getUChar(delta+1);
   23223    delta += 2;
   23224    DIP("%s $%d,%s,%s\n", opname,
   23225                          (Int)amt,
   23226                          nameYMMReg(eregOfRexRM(pfx,rm)),
   23227                          nameYMMReg(rD));
   23228    assign( e0, getYMMReg(eregOfRexRM(pfx,rm)) );
   23229 
   23230    shl = shr = sar = False;
   23231    size = 0;
   23232    switch (op) {
   23233       case Iop_ShlN16x16: shl = True; size = 16; break;
   23234       case Iop_ShlN32x8:  shl = True; size = 32; break;
   23235       case Iop_ShlN64x4:  shl = True; size = 64; break;
   23236       case Iop_SarN16x16: sar = True; size = 16; break;
   23237       case Iop_SarN32x8:  sar = True; size = 32; break;
   23238       case Iop_ShrN16x16: shr = True; size = 16; break;
   23239       case Iop_ShrN32x8:  shr = True; size = 32; break;
   23240       case Iop_ShrN64x4:  shr = True; size = 64; break;
   23241       default: vassert(0);
   23242    }
   23243 
   23244 
   23245    if (shl || shr) {
   23246      assign( e1, amt >= size
   23247                     ? binop(Iop_V128HLtoV256, mkV128(0), mkV128(0))
   23248                     : binop(op, mkexpr(e0), mkU8(amt))
   23249      );
   23250    } else
   23251    if (sar) {
   23252      assign( e1, amt >= size
   23253                     ? binop(op, mkexpr(e0), mkU8(size-1))
   23254                     : binop(op, mkexpr(e0), mkU8(amt))
   23255      );
   23256    } else {
   23257       vassert(0);
   23258    }
   23259 
   23260    putYMMReg( rD, mkexpr(e1) );
   23261    return delta;
   23262 }
   23263 
   23264 
   23265 /* Lower 64-bit lane only AVX128 binary operation:
   23266    G[63:0]    = V[63:0] `op` E[63:0]
   23267    G[127:64]  = V[127:64]
   23268    G[255:128] = 0.
   23269    The specified op must be of the 64F0x2 kind, so that it
   23270    copies the upper half of the left operand to the result.
   23271 */
   23272 static Long dis_AVX128_E_V_to_G_lo64 ( /*OUT*/Bool* uses_vvvv,
   23273                                        const VexAbiInfo* vbi,
   23274                                        Prefix pfx, Long delta,
   23275                                        const HChar* opname, IROp op )
   23276 {
   23277    HChar   dis_buf[50];
   23278    Int     alen;
   23279    IRTemp  addr;
   23280    UChar   rm    = getUChar(delta);
   23281    UInt    rG    = gregOfRexRM(pfx,rm);
   23282    UInt    rV    = getVexNvvvv(pfx);
   23283    IRExpr* vpart = getXMMReg(rV);
   23284    if (epartIsReg(rm)) {
   23285       UInt rE = eregOfRexRM(pfx,rm);
   23286       putXMMReg( rG, binop(op, vpart, getXMMReg(rE)) );
   23287       DIP("%s %s,%s,%s\n", opname,
   23288           nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   23289       delta = delta+1;
   23290    } else {
   23291       /* We can only do a 64-bit memory read, so the upper half of the
   23292          E operand needs to be made simply of zeroes. */
   23293       IRTemp epart = newTemp(Ity_V128);
   23294       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23295       assign( epart, unop( Iop_64UtoV128,
   23296                            loadLE(Ity_I64, mkexpr(addr))) );
   23297       putXMMReg( rG, binop(op, vpart, mkexpr(epart)) );
   23298       DIP("%s %s,%s,%s\n", opname,
   23299           dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   23300       delta = delta+alen;
   23301    }
   23302    putYMMRegLane128( rG, 1, mkV128(0) );
   23303    *uses_vvvv = True;
   23304    return delta;
   23305 }
   23306 
   23307 
   23308 /* Lower 64-bit lane only AVX128 unary operation:
   23309    G[63:0]    = op(E[63:0])
   23310    G[127:64]  = V[127:64]
   23311    G[255:128] = 0
   23312    The specified op must be of the 64F0x2 kind, so that it
   23313    copies the upper half of the operand to the result.
   23314 */
   23315 static Long dis_AVX128_E_V_to_G_lo64_unary ( /*OUT*/Bool* uses_vvvv,
   23316                                              const VexAbiInfo* vbi,
   23317                                              Prefix pfx, Long delta,
   23318                                              const HChar* opname, IROp op )
   23319 {
   23320    HChar   dis_buf[50];
   23321    Int     alen;
   23322    IRTemp  addr;
   23323    UChar   rm  = getUChar(delta);
   23324    UInt    rG  = gregOfRexRM(pfx,rm);
   23325    UInt    rV  = getVexNvvvv(pfx);
   23326    IRTemp  e64 = newTemp(Ity_I64);
   23327 
   23328    /* Fetch E[63:0] */
   23329    if (epartIsReg(rm)) {
   23330       UInt rE = eregOfRexRM(pfx,rm);
   23331       assign(e64, getXMMRegLane64(rE, 0));
   23332       DIP("%s %s,%s,%s\n", opname,
   23333           nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   23334       delta += 1;
   23335    } else {
   23336       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23337       assign(e64, loadLE(Ity_I64, mkexpr(addr)));
   23338       DIP("%s %s,%s,%s\n", opname,
   23339           dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   23340       delta += alen;
   23341    }
   23342 
   23343    /* Create a value 'arg' as V[127:64]++E[63:0] */
   23344    IRTemp arg = newTemp(Ity_V128);
   23345    assign(arg,
   23346           binop(Iop_SetV128lo64,
   23347                 getXMMReg(rV), mkexpr(e64)));
   23348    /* and apply op to it */
   23349    putYMMRegLoAndZU( rG, unop(op, mkexpr(arg)) );
   23350    *uses_vvvv = True;
   23351    return delta;
   23352 }
   23353 
   23354 
   23355 /* Lower 32-bit lane only AVX128 unary operation:
   23356    G[31:0]    = op(E[31:0])
   23357    G[127:32]  = V[127:32]
   23358    G[255:128] = 0
   23359    The specified op must be of the 32F0x4 kind, so that it
   23360    copies the upper 3/4 of the operand to the result.
   23361 */
   23362 static Long dis_AVX128_E_V_to_G_lo32_unary ( /*OUT*/Bool* uses_vvvv,
   23363                                              const VexAbiInfo* vbi,
   23364                                              Prefix pfx, Long delta,
   23365                                              const HChar* opname, IROp op )
   23366 {
   23367    HChar   dis_buf[50];
   23368    Int     alen;
   23369    IRTemp  addr;
   23370    UChar   rm  = getUChar(delta);
   23371    UInt    rG  = gregOfRexRM(pfx,rm);
   23372    UInt    rV  = getVexNvvvv(pfx);
   23373    IRTemp  e32 = newTemp(Ity_I32);
   23374 
   23375    /* Fetch E[31:0] */
   23376    if (epartIsReg(rm)) {
   23377       UInt rE = eregOfRexRM(pfx,rm);
   23378       assign(e32, getXMMRegLane32(rE, 0));
   23379       DIP("%s %s,%s,%s\n", opname,
   23380           nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   23381       delta += 1;
   23382    } else {
   23383       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23384       assign(e32, loadLE(Ity_I32, mkexpr(addr)));
   23385       DIP("%s %s,%s,%s\n", opname,
   23386           dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   23387       delta += alen;
   23388    }
   23389 
   23390    /* Create a value 'arg' as V[127:32]++E[31:0] */
   23391    IRTemp arg = newTemp(Ity_V128);
   23392    assign(arg,
   23393           binop(Iop_SetV128lo32,
   23394                 getXMMReg(rV), mkexpr(e32)));
   23395    /* and apply op to it */
   23396    putYMMRegLoAndZU( rG, unop(op, mkexpr(arg)) );
   23397    *uses_vvvv = True;
   23398    return delta;
   23399 }
   23400 
   23401 
   23402 /* Lower 32-bit lane only AVX128 binary operation:
   23403    G[31:0]    = V[31:0] `op` E[31:0]
   23404    G[127:32]  = V[127:32]
   23405    G[255:128] = 0.
   23406    The specified op must be of the 32F0x4 kind, so that it
   23407    copies the upper 3/4 of the left operand to the result.
   23408 */
   23409 static Long dis_AVX128_E_V_to_G_lo32 ( /*OUT*/Bool* uses_vvvv,
   23410                                        const VexAbiInfo* vbi,
   23411                                        Prefix pfx, Long delta,
   23412                                        const HChar* opname, IROp op )
   23413 {
   23414    HChar   dis_buf[50];
   23415    Int     alen;
   23416    IRTemp  addr;
   23417    UChar   rm    = getUChar(delta);
   23418    UInt    rG    = gregOfRexRM(pfx,rm);
   23419    UInt    rV    = getVexNvvvv(pfx);
   23420    IRExpr* vpart = getXMMReg(rV);
   23421    if (epartIsReg(rm)) {
   23422       UInt rE = eregOfRexRM(pfx,rm);
   23423       putXMMReg( rG, binop(op, vpart, getXMMReg(rE)) );
   23424       DIP("%s %s,%s,%s\n", opname,
   23425           nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   23426       delta = delta+1;
   23427    } else {
   23428       /* We can only do a 32-bit memory read, so the upper 3/4 of the
   23429          E operand needs to be made simply of zeroes. */
   23430       IRTemp epart = newTemp(Ity_V128);
   23431       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23432       assign( epart, unop( Iop_32UtoV128,
   23433                            loadLE(Ity_I32, mkexpr(addr))) );
   23434       putXMMReg( rG, binop(op, vpart, mkexpr(epart)) );
   23435       DIP("%s %s,%s,%s\n", opname,
   23436           dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   23437       delta = delta+alen;
   23438    }
   23439    putYMMRegLane128( rG, 1, mkV128(0) );
   23440    *uses_vvvv = True;
   23441    return delta;
   23442 }
   23443 
   23444 
   23445 /* All-lanes AVX128 binary operation:
   23446    G[127:0]   = V[127:0] `op` E[127:0]
   23447    G[255:128] = 0.
   23448 */
   23449 static Long dis_AVX128_E_V_to_G ( /*OUT*/Bool* uses_vvvv,
   23450                                   const VexAbiInfo* vbi,
   23451                                   Prefix pfx, Long delta,
   23452                                   const HChar* opname, IROp op )
   23453 {
   23454    return dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   23455              uses_vvvv, vbi, pfx, delta, opname, op,
   23456              NULL, False/*!invertLeftArg*/, False/*!swapArgs*/
   23457    );
   23458 }
   23459 
   23460 
   23461 /* Handles AVX128 32F/64F comparisons.  A derivative of
   23462    dis_SSEcmp_E_to_G.  It can fail, in which case it returns the
   23463    original delta to indicate failure. */
static
Long dis_AVX128_cmp_V_E_to_G ( /*OUT*/Bool* uses_vvvv,
                               const VexAbiInfo* vbi,
                               Prefix pfx, Long delta,
                               const HChar* opname, Bool all_lanes, Int sz )
{
   /* sz is the element size in bytes: 4 for 32F, 8 for 64F. */
   vassert(sz == 4 || sz == 8);
   Long    deltaIN = delta;   /* kept to signal decode failure */
   HChar   dis_buf[50];
   Int     alen;
   UInt    imm8;
   IRTemp  addr;
   /* findSSECmpOp maps imm8 to: preSwap (swap args before the op),
      op (the IR comparison), postNot (invert the result after). */
   Bool    preSwap = False;
   IROp    op      = Iop_INVALID;
   Bool    postNot = False;
   IRTemp  plain   = newTemp(Ity_V128);
   UChar   rm      = getUChar(delta);
   UInt    rG      = gregOfRexRM(pfx, rm);
   UInt    rV      = getVexNvvvv(pfx);
   IRTemp argL     = newTemp(Ity_V128);
   IRTemp argR     = newTemp(Ity_V128);

   assign(argL, getXMMReg(rV));
   if (epartIsReg(rm)) {
      imm8 = getUChar(delta+1);
      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
      if (!ok) return deltaIN; /* FAIL */
      UInt rE = eregOfRexRM(pfx,rm);
      assign(argR, getXMMReg(rE));
      delta += 1+1;
      DIP("%s $%u,%s,%s,%s\n",
          opname, imm8,
          nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
      imm8 = getUChar(delta+alen);
      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
      if (!ok) return deltaIN; /* FAIL */
      /* Scalar (one-lane) forms read only 4 or 8 bytes from memory,
         so zero-extend the load into a V128. */
      assign(argR,
             all_lanes   ? loadLE(Ity_V128, mkexpr(addr))
             : sz == 8   ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
             : /*sz==4*/   unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr))));
      delta += alen+1;
      DIP("%s $%u,%s,%s,%s\n",
          opname, imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   }

   /* The raw comparison, with args possibly pre-swapped. */
   assign(plain, preSwap ? binop(op, mkexpr(argR), mkexpr(argL))
                         : binop(op, mkexpr(argL), mkexpr(argR)));

   if (all_lanes) {
      /* This is simple: just invert the result, if necessary, and
         have done. */
      if (postNot) {
         putYMMRegLoAndZU( rG, unop(Iop_NotV128, mkexpr(plain)) );
      } else {
         putYMMRegLoAndZU( rG, mkexpr(plain) );
      }
   }
   else
   if (!preSwap) {
      /* More complex.  It's a one-lane-only, hence need to possibly
         invert only that one lane.  But at least the other lanes are
         correctly "in" the result, having been copied from the left
         operand (argL). */
      if (postNot) {
         /* XOR with a low-lane-only mask inverts just lane 0. */
         IRExpr* mask = mkV128(sz==4 ? 0x000F : 0x00FF);
         putYMMRegLoAndZU( rG, binop(Iop_XorV128, mkexpr(plain),
                                                  mask) );
      } else {
         putYMMRegLoAndZU( rG, mkexpr(plain) );
      }
   }
   else {
      /* This is the most complex case.  One-lane-only, but the args
         were swapped.  So we have to possibly invert the bottom lane,
         and (definitely) we have to copy the upper lane(s) from argL
         since, due to the swapping, what's currently there is from
         argR, which is not correct. */
      IRTemp res     = newTemp(Ity_V128);
      IRTemp mask    = newTemp(Ity_V128);
      IRTemp notMask = newTemp(Ity_V128);
      /* mask selects the low lane; notMask the rest. */
      assign(mask,    mkV128(sz==4 ? 0x000F : 0x00FF));
      assign(notMask, mkV128(sz==4 ? 0xFFF0 : 0xFF00));
      if (postNot) {
         assign(res,
                binop(Iop_OrV128,
                      binop(Iop_AndV128,
                            unop(Iop_NotV128, mkexpr(plain)),
                            mkexpr(mask)),
                      binop(Iop_AndV128, mkexpr(argL), mkexpr(notMask))));
      } else {
         assign(res,
                binop(Iop_OrV128,
                      binop(Iop_AndV128,
                            mkexpr(plain),
                            mkexpr(mask)),
                      binop(Iop_AndV128, mkexpr(argL), mkexpr(notMask))));
      }
      putYMMRegLoAndZU( rG, mkexpr(res) );
   }

   *uses_vvvv = True;
   return delta;
}
   23569 
   23570 
   23571 /* Handles AVX256 32F/64F comparisons.  A derivative of
   23572    dis_SSEcmp_E_to_G.  It can fail, in which case it returns the
   23573    original delta to indicate failure. */
static
Long dis_AVX256_cmp_V_E_to_G ( /*OUT*/Bool* uses_vvvv,
                               const VexAbiInfo* vbi,
                               Prefix pfx, Long delta,
                               const HChar* opname, Int sz )
{
   /* Translate VCMPPS/VCMPPD (256-bit form): ymmG = ymmV `cmp` E,
      where E is a ymm register or a 256-bit memory operand and the
      comparison predicate comes from a trailing imm8.  'sz' is the
      lane size in bytes: 4 => 32F lanes, 8 => 64F lanes.  On decode
      failure, returns the original delta. */
   vassert(sz == 4 || sz == 8);
   Long    deltaIN = delta;
   HChar   dis_buf[50];
   Int     alen;
   UInt    imm8;
   IRTemp  addr;
   Bool    preSwap = False;
   IROp    op      = Iop_INVALID;
   Bool    postNot = False;
   IRTemp  plain   = newTemp(Ity_V256);
   UChar   rm      = getUChar(delta);
   UInt    rG      = gregOfRexRM(pfx, rm);
   UInt    rV      = getVexNvvvv(pfx);
   IRTemp argL     = newTemp(Ity_V256);
   IRTemp argR     = newTemp(Ity_V256);
   IRTemp argLhi   = IRTemp_INVALID;
   IRTemp argLlo   = IRTemp_INVALID;
   IRTemp argRhi   = IRTemp_INVALID;
   IRTemp argRlo   = IRTemp_INVALID;

   assign(argL, getYMMReg(rV));
   if (epartIsReg(rm)) {
      /* Register form: imm8 immediately follows the modrm byte. */
      imm8 = getUChar(delta+1);
      /* Map imm8 to an IR comparison op; findSSECmpOp may also
         require the operands to be swapped first (preSwap) and/or
         the result to be inverted afterwards (postNot). */
      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8,
                             True/*all_lanes*/, sz);
      if (!ok) return deltaIN; /* FAIL */
      UInt rE = eregOfRexRM(pfx,rm);
      assign(argR, getYMMReg(rE));
      delta += 1+1;
      DIP("%s $%u,%s,%s,%s\n",
          opname, imm8,
          nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
   } else {
      /* Memory form: imm8 follows the addressing bytes, hence the
         extra-byte hint (1) passed to disAMode. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
      imm8 = getUChar(delta+alen);
      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8,
                             True/*all_lanes*/, sz);
      if (!ok) return deltaIN; /* FAIL */
      assign(argR, loadLE(Ity_V256, mkexpr(addr)) );
      delta += alen+1;
      DIP("%s $%u,%s,%s,%s\n",
          opname, imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
   }

   /* There are no native 256-bit compare IROps, so split both
      operands into 128-bit halves, compare each half, and glue the
      results back together.  preSwap is honoured simply by choosing
      which operand feeds each side. */
   breakupV256toV128s( preSwap ? argR : argL, &argLhi, &argLlo );
   breakupV256toV128s( preSwap ? argL : argR, &argRhi, &argRlo );
   assign(plain, binop( Iop_V128HLtoV256,
                        binop(op, mkexpr(argLhi), mkexpr(argRhi)),
                        binop(op, mkexpr(argLlo), mkexpr(argRlo)) ) );

   /* This is simple: just invert the result, if necessary, and
      have done. */
   if (postNot) {
      putYMMReg( rG, unop(Iop_NotV256, mkexpr(plain)) );
   } else {
      putYMMReg( rG, mkexpr(plain) );
   }

   *uses_vvvv = True;
   return delta;
}
   23641 
   23642 
/* Handles AVX128 unary E-to-G operations, computed by a supplied IR
   generator function. */
   23644 static
   23645 Long dis_AVX128_E_to_G_unary ( /*OUT*/Bool* uses_vvvv,
   23646                                const VexAbiInfo* vbi,
   23647                                Prefix pfx, Long delta,
   23648                                const HChar* opname,
   23649                                IRTemp (*opFn)(IRTemp) )
   23650 {
   23651    HChar  dis_buf[50];
   23652    Int    alen;
   23653    IRTemp addr;
   23654    IRTemp res  = newTemp(Ity_V128);
   23655    IRTemp arg  = newTemp(Ity_V128);
   23656    UChar  rm   = getUChar(delta);
   23657    UInt   rG   = gregOfRexRM(pfx, rm);
   23658    if (epartIsReg(rm)) {
   23659       UInt rE = eregOfRexRM(pfx,rm);
   23660       assign(arg, getXMMReg(rE));
   23661       delta += 1;
   23662       DIP("%s %s,%s\n", opname, nameXMMReg(rE), nameXMMReg(rG));
   23663    } else {
   23664       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23665       assign(arg, loadLE(Ity_V128, mkexpr(addr)));
   23666       delta += alen;
   23667       DIP("%s %s,%s\n", opname, dis_buf, nameXMMReg(rG));
   23668    }
   23669    res = opFn(arg);
   23670    putYMMRegLoAndZU( rG, mkexpr(res) );
   23671    *uses_vvvv = False;
   23672    return delta;
   23673 }
   23674 
   23675 
   23676 /* Handles AVX128 unary E-to-G all-lanes operations. */
   23677 static
   23678 Long dis_AVX128_E_to_G_unary_all ( /*OUT*/Bool* uses_vvvv,
   23679                                    const VexAbiInfo* vbi,
   23680                                    Prefix pfx, Long delta,
   23681                                    const HChar* opname, IROp op )
   23682 {
   23683    HChar  dis_buf[50];
   23684    Int    alen;
   23685    IRTemp addr;
   23686    IRTemp arg  = newTemp(Ity_V128);
   23687    UChar  rm   = getUChar(delta);
   23688    UInt   rG   = gregOfRexRM(pfx, rm);
   23689    if (epartIsReg(rm)) {
   23690       UInt rE = eregOfRexRM(pfx,rm);
   23691       assign(arg, getXMMReg(rE));
   23692       delta += 1;
   23693       DIP("%s %s,%s\n", opname, nameXMMReg(rE), nameXMMReg(rG));
   23694    } else {
   23695       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23696       assign(arg, loadLE(Ity_V128, mkexpr(addr)));
   23697       delta += alen;
   23698       DIP("%s %s,%s\n", opname, dis_buf, nameXMMReg(rG));
   23699    }
   23700    // Sqrt32Fx4 and Sqrt64Fx2 take a rounding mode, which is faked
   23701    // up in the usual way.
   23702    Bool needsIRRM = op == Iop_Sqrt32Fx4 || op == Iop_Sqrt64Fx2;
   23703    /* XXXROUNDINGFIXME */
   23704    IRExpr* res = needsIRRM ? binop(op, get_FAKE_roundingmode(), mkexpr(arg))
   23705                            : unop(op, mkexpr(arg));
   23706    putYMMRegLoAndZU( rG, res );
   23707    *uses_vvvv = False;
   23708    return delta;
   23709 }
   23710 
   23711 
   23712 /* FIXME: common up with the _128_ version above? */
static
Long dis_VEX_NDS_256_AnySimdPfx_0F_WIG (
        /*OUT*/Bool* uses_vvvv, const VexAbiInfo* vbi,
        Prefix pfx, Long delta, const HChar* name,
        /* The actual operation.  Use either 'op' or 'opfn',
           but not both. */
        IROp op, IRTemp(*opFn)(IRTemp,IRTemp),
        Bool invertLeftArg,
        Bool swapArgs
     )
{
   /* Generic 3-operand 256-bit handler: rD = rSL `op` E, where rSL
      comes from the VEX.vvvv field and E (rSR) is a ymm register or
      a 256-bit memory operand.  The operation is given either as a
      plain IROp ('op') or as an IR generator ('opFn') -- exactly one
      of the two.  The left arg may optionally be bitwise-inverted
      first, and the args may optionally be swapped. */
   UChar  modrm = getUChar(delta);
   UInt   rD    = gregOfRexRM(pfx, modrm);
   UInt   rSL   = getVexNvvvv(pfx);
   IRTemp tSL   = newTemp(Ity_V256);
   IRTemp tSR   = newTemp(Ity_V256);
   IRTemp addr  = IRTemp_INVALID;
   HChar  dis_buf[50];
   Int    alen  = 0;
   vassert(1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*WIG?*/);

   assign(tSL, invertLeftArg ? unop(Iop_NotV256, getYMMReg(rSL))
                             : getYMMReg(rSL));

   if (epartIsReg(modrm)) {
      UInt rSR = eregOfRexRM(pfx, modrm);
      delta += 1;
      assign(tSR, getYMMReg(rSR));
      DIP("%s %s,%s,%s\n",
          name, nameYMMReg(rSR), nameYMMReg(rSL), nameYMMReg(rD));
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      delta += alen;
      assign(tSR, loadLE(Ity_V256, mkexpr(addr)));
      DIP("%s %s,%s,%s\n",
          name, dis_buf, nameYMMReg(rSL), nameYMMReg(rD));
   }

   IRTemp res = IRTemp_INVALID;
   if (op != Iop_INVALID) {
      vassert(opFn == NULL);
      res = newTemp(Ity_V256);
      if (requiresRMode(op)) {
         /* Op needs a rounding-mode operand; fake one up. */
         IRTemp rm = newTemp(Ity_I32);
         assign(rm, get_FAKE_roundingmode()); /* XXXROUNDINGFIXME */
         assign(res, swapArgs
                        ? triop(op, mkexpr(rm), mkexpr(tSR), mkexpr(tSL))
                        : triop(op, mkexpr(rm), mkexpr(tSL), mkexpr(tSR)));
      } else {
         assign(res, swapArgs
                        ? binop(op, mkexpr(tSR), mkexpr(tSL))
                        : binop(op, mkexpr(tSL), mkexpr(tSR)));
      }
   } else {
      vassert(opFn != NULL);
      res = swapArgs ? opFn(tSR, tSL) : opFn(tSL, tSR);
   }

   putYMMReg(rD, mkexpr(res));

   *uses_vvvv = True;
   return delta;
}
   23776 
   23777 
   23778 /* All-lanes AVX256 binary operation:
   23779    G[255:0] = V[255:0] `op` E[255:0]
   23780 */
   23781 static Long dis_AVX256_E_V_to_G ( /*OUT*/Bool* uses_vvvv,
   23782                                   const VexAbiInfo* vbi,
   23783                                   Prefix pfx, Long delta,
   23784                                   const HChar* opname, IROp op )
   23785 {
   23786    return dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
   23787              uses_vvvv, vbi, pfx, delta, opname, op,
   23788              NULL, False/*!invertLeftArg*/, False/*!swapArgs*/
   23789    );
   23790 }
   23791 
   23792 
   23793 /* Handle a VEX_NDS_256_66_0F_WIG (3-addr) insn, with a simple IROp
   23794    for the operation, no inversion of the left arg, and no swapping of
   23795    args. */
   23796 static
   23797 Long dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple (
   23798         /*OUT*/Bool* uses_vvvv, const VexAbiInfo* vbi,
   23799         Prefix pfx, Long delta, const HChar* name,
   23800         IROp op
   23801      )
   23802 {
   23803    return dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
   23804              uses_vvvv, vbi, pfx, delta, name, op, NULL, False, False);
   23805 }
   23806 
   23807 
   23808 /* Handle a VEX_NDS_256_66_0F_WIG (3-addr) insn, using the given IR
   23809    generator to compute the result, no inversion of the left
   23810    arg, and no swapping of args. */
   23811 static
   23812 Long dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex (
   23813         /*OUT*/Bool* uses_vvvv, const VexAbiInfo* vbi,
   23814         Prefix pfx, Long delta, const HChar* name,
   23815         IRTemp(*opFn)(IRTemp,IRTemp)
   23816      )
   23817 {
   23818    return dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
   23819              uses_vvvv, vbi, pfx, delta, name,
   23820              Iop_INVALID, opFn, False, False );
   23821 }
   23822 
   23823 
/* Handles AVX256 unary E-to-G operations, computed by a supplied IR
   generator function. */
   23825 static
   23826 Long dis_AVX256_E_to_G_unary ( /*OUT*/Bool* uses_vvvv,
   23827                                const VexAbiInfo* vbi,
   23828                                Prefix pfx, Long delta,
   23829                                const HChar* opname,
   23830                                IRTemp (*opFn)(IRTemp) )
   23831 {
   23832    HChar  dis_buf[50];
   23833    Int    alen;
   23834    IRTemp addr;
   23835    IRTemp res  = newTemp(Ity_V256);
   23836    IRTemp arg  = newTemp(Ity_V256);
   23837    UChar  rm   = getUChar(delta);
   23838    UInt   rG   = gregOfRexRM(pfx, rm);
   23839    if (epartIsReg(rm)) {
   23840       UInt rE = eregOfRexRM(pfx,rm);
   23841       assign(arg, getYMMReg(rE));
   23842       delta += 1;
   23843       DIP("%s %s,%s\n", opname, nameYMMReg(rE), nameYMMReg(rG));
   23844    } else {
   23845       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23846       assign(arg, loadLE(Ity_V256, mkexpr(addr)));
   23847       delta += alen;
   23848       DIP("%s %s,%s\n", opname, dis_buf, nameYMMReg(rG));
   23849    }
   23850    res = opFn(arg);
   23851    putYMMReg( rG, mkexpr(res) );
   23852    *uses_vvvv = False;
   23853    return delta;
   23854 }
   23855 
   23856 
   23857 /* Handles AVX256 unary E-to-G all-lanes operations. */
   23858 static
   23859 Long dis_AVX256_E_to_G_unary_all ( /*OUT*/Bool* uses_vvvv,
   23860                                    const VexAbiInfo* vbi,
   23861                                    Prefix pfx, Long delta,
   23862                                    const HChar* opname, IROp op )
   23863 {
   23864    HChar  dis_buf[50];
   23865    Int    alen;
   23866    IRTemp addr;
   23867    IRTemp arg  = newTemp(Ity_V256);
   23868    UChar  rm   = getUChar(delta);
   23869    UInt   rG   = gregOfRexRM(pfx, rm);
   23870    if (epartIsReg(rm)) {
   23871       UInt rE = eregOfRexRM(pfx,rm);
   23872       assign(arg, getYMMReg(rE));
   23873       delta += 1;
   23874       DIP("%s %s,%s\n", opname, nameYMMReg(rE), nameYMMReg(rG));
   23875    } else {
   23876       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23877       assign(arg, loadLE(Ity_V256, mkexpr(addr)));
   23878       delta += alen;
   23879       DIP("%s %s,%s\n", opname, dis_buf, nameYMMReg(rG));
   23880    }
   23881    putYMMReg( rG, unop(op, mkexpr(arg)) );
   23882    *uses_vvvv = False;
   23883    return delta;
   23884 }
   23885 
   23886 
   23887 /* The use of ReinterpF64asI64 is ugly.  Surely could do better if we
   23888    had a variant of Iop_64x4toV256 that took F64s as args instead. */
static Long dis_CVTDQ2PD_256 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta )
{
   /* VCVTDQ2PD xmm/m128 -> ymm: widen four signed 32-bit ints to
      four F64s.  No rounding mode is involved since each I32
      converts to F64 via Iop_I32StoF64. */
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp sV    = newTemp(Ity_V128);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( sV, getXMMReg(rE) );
      delta += 1;
      DIP("vcvtdq2pd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      DIP("vcvtdq2pd %s,%s\n", dis_buf, nameYMMReg(rG) );
   }
   /* Split the source into its four 32-bit lanes, convert each to
      F64, and reassemble.  The ReinterpF64asI64 wrappers exist only
      because Iop_64x4toV256 takes I64 args (see comment above). */
   IRTemp s3, s2, s1, s0;
   s3 = s2 = s1 = s0 = IRTemp_INVALID;
   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   IRExpr* res
      = IRExpr_Qop(
           Iop_64x4toV256,
           unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s3))),
           unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s2))),
           unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s1))),
           unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s0)))
        );
   putYMMReg(rG, res);
   return delta;
}
   23923 
   23924 
static Long dis_CVTPD2PS_256 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta )
{
   /* VCVTPD2PS ymm/m256 -> xmm: narrow four F64s to four F32s using
      the current SSE rounding mode.  The upper 128 bits of the
      destination YMM register are zeroed. */
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp argV  = newTemp(Ity_V256);
   IRTemp rmode = newTemp(Ity_I32);
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getYMMReg(rE) );
      delta += 1;
      DIP("vcvtpd2psy %s,%s\n", nameYMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
      delta += alen;
      DIP("vcvtpd2psy %s,%s\n", dis_buf, nameXMMReg(rG) );
   }

   assign( rmode, get_sse_roundingmode() );
   /* Break the source into four 64-bit lanes; each holds an F64 bit
      pattern which is reinterpreted and narrowed to F32. */
   IRTemp t3, t2, t1, t0;
   t3 = t2 = t1 = t0 = IRTemp_INVALID;
   breakupV256to64s( argV, &t3, &t2, &t1, &t0 );
#  define CVT(_t)  binop( Iop_F64toF32, mkexpr(rmode), \
                          unop(Iop_ReinterpI64asF64, mkexpr(_t)) )
   putXMMRegLane32F( rG, 3, CVT(t3) );
   putXMMRegLane32F( rG, 2, CVT(t2) );
   putXMMRegLane32F( rG, 1, CVT(t1) );
   putXMMRegLane32F( rG, 0, CVT(t0) );
#  undef CVT
   /* Zero the upper half of the destination YMM register. */
   putYMMRegLane128( rG, 1, mkV128(0) );
   return delta;
}
   23961 
   23962 
   23963 static IRTemp math_VPUNPCK_YMM ( IRTemp tL, IRType tR, IROp op )
   23964 {
   23965    IRTemp tLhi, tLlo, tRhi, tRlo;
   23966    tLhi = tLlo = tRhi = tRlo = IRTemp_INVALID;
   23967    IRTemp res = newTemp(Ity_V256);
   23968    breakupV256toV128s( tL, &tLhi, &tLlo );
   23969    breakupV256toV128s( tR, &tRhi, &tRlo );
   23970    assign( res, binop( Iop_V128HLtoV256,
   23971                        binop( op, mkexpr(tRhi), mkexpr(tLhi) ),
   23972                        binop( op, mkexpr(tRlo), mkexpr(tLlo) ) ) );
   23973    return res;
   23974 }
   23975 
   23976 
   23977 static IRTemp math_VPUNPCKLBW_YMM ( IRTemp tL, IRTemp tR )
   23978 {
   23979    return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO8x16 );
   23980 }
   23981 
   23982 
   23983 static IRTemp math_VPUNPCKLWD_YMM ( IRTemp tL, IRTemp tR )
   23984 {
   23985    return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO16x8 );
   23986 }
   23987 
   23988 
   23989 static IRTemp math_VPUNPCKLDQ_YMM ( IRTemp tL, IRTemp tR )
   23990 {
   23991    return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO32x4 );
   23992 }
   23993 
   23994 
   23995 static IRTemp math_VPUNPCKLQDQ_YMM ( IRTemp tL, IRTemp tR )
   23996 {
   23997    return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO64x2 );
   23998 }
   23999 
   24000 
   24001 static IRTemp math_VPUNPCKHBW_YMM ( IRTemp tL, IRTemp tR )
   24002 {
   24003    return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI8x16 );
   24004 }
   24005 
   24006 
   24007 static IRTemp math_VPUNPCKHWD_YMM ( IRTemp tL, IRTemp tR )
   24008 {
   24009    return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI16x8 );
   24010 }
   24011 
   24012 
   24013 static IRTemp math_VPUNPCKHDQ_YMM ( IRTemp tL, IRTemp tR )
   24014 {
   24015    return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI32x4 );
   24016 }
   24017 
   24018 
   24019 static IRTemp math_VPUNPCKHQDQ_YMM ( IRTemp tL, IRTemp tR )
   24020 {
   24021    return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI64x2 );
   24022 }
   24023 
   24024 
   24025 static IRTemp math_VPACKSSWB_YMM ( IRTemp tL, IRTemp tR )
   24026 {
   24027    return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin16Sto8Sx16 );
   24028 }
   24029 
   24030 
   24031 static IRTemp math_VPACKUSWB_YMM ( IRTemp tL, IRTemp tR )
   24032 {
   24033    return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin16Sto8Ux16 );
   24034 }
   24035 
   24036 
   24037 static IRTemp math_VPACKSSDW_YMM ( IRTemp tL, IRTemp tR )
   24038 {
   24039    return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin32Sto16Sx8 );
   24040 }
   24041 
   24042 
   24043 static IRTemp math_VPACKUSDW_YMM ( IRTemp tL, IRTemp tR )
   24044 {
   24045    return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin32Sto16Ux8 );
   24046 }
   24047 
   24048 
   24049 __attribute__((noinline))
   24050 static
   24051 Long dis_ESC_0F__VEX (
   24052         /*MB_OUT*/DisResult* dres,
   24053         /*OUT*/   Bool*      uses_vvvv,
   24054         Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
   24055         Bool         resteerCisOk,
   24056         void*        callback_opaque,
   24057         const VexArchInfo* archinfo,
   24058         const VexAbiInfo*  vbi,
   24059         Prefix pfx, Int sz, Long deltaIN
   24060      )
   24061 {
   24062    IRTemp addr  = IRTemp_INVALID;
   24063    Int    alen  = 0;
   24064    HChar  dis_buf[50];
   24065    Long   delta = deltaIN;
   24066    UChar  opc   = getUChar(delta);
   24067    delta++;
   24068    *uses_vvvv = False;
   24069 
   24070    switch (opc) {
   24071 
   24072    case 0x10:
   24073       /* VMOVSD m64, xmm1 = VEX.LIG.F2.0F.WIG 10 /r */
   24074       /* Move 64 bits from E (mem only) to G (lo half xmm).
   24075          Bits 255-64 of the dest are zeroed out. */
   24076       if (haveF2no66noF3(pfx) && !epartIsReg(getUChar(delta))) {
   24077          UChar modrm = getUChar(delta);
   24078          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24079          UInt   rG   = gregOfRexRM(pfx,modrm);
   24080          IRTemp z128 = newTemp(Ity_V128);
   24081          assign(z128, mkV128(0));
   24082          putXMMReg( rG, mkexpr(z128) );
   24083          /* FIXME: ALIGNMENT CHECK? */
   24084          putXMMRegLane64( rG, 0, loadLE(Ity_I64, mkexpr(addr)) );
   24085          putYMMRegLane128( rG, 1, mkexpr(z128) );
   24086          DIP("vmovsd %s,%s\n", dis_buf, nameXMMReg(rG));
   24087          delta += alen;
   24088          goto decode_success;
   24089       }
   24090       /* VMOVSD xmm3, xmm2, xmm1 = VEX.LIG.F2.0F.WIG 10 /r */
   24091       /* Reg form. */
   24092       if (haveF2no66noF3(pfx) && epartIsReg(getUChar(delta))) {
   24093          UChar modrm = getUChar(delta);
   24094          UInt  rG    = gregOfRexRM(pfx, modrm);
   24095          UInt  rE    = eregOfRexRM(pfx, modrm);
   24096          UInt  rV    = getVexNvvvv(pfx);
   24097          delta++;
   24098          DIP("vmovsd %s,%s,%s\n",
   24099              nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   24100          IRTemp res = newTemp(Ity_V128);
   24101          assign(res, binop(Iop_64HLtoV128,
   24102                            getXMMRegLane64(rV, 1),
   24103                            getXMMRegLane64(rE, 0)));
   24104          putYMMRegLoAndZU(rG, mkexpr(res));
   24105          *uses_vvvv = True;
   24106          goto decode_success;
   24107       }
   24108       /* VMOVSS m32, xmm1 = VEX.LIG.F3.0F.WIG 10 /r */
   24109       /* Move 32 bits from E (mem only) to G (lo half xmm).
   24110          Bits 255-32 of the dest are zeroed out. */
   24111       if (haveF3no66noF2(pfx) && !epartIsReg(getUChar(delta))) {
   24112          UChar modrm = getUChar(delta);
   24113          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24114          UInt   rG   = gregOfRexRM(pfx,modrm);
   24115          IRTemp z128 = newTemp(Ity_V128);
   24116          assign(z128, mkV128(0));
   24117          putXMMReg( rG, mkexpr(z128) );
   24118          /* FIXME: ALIGNMENT CHECK? */
   24119          putXMMRegLane32( rG, 0, loadLE(Ity_I32, mkexpr(addr)) );
   24120          putYMMRegLane128( rG, 1, mkexpr(z128) );
   24121          DIP("vmovss %s,%s\n", dis_buf, nameXMMReg(rG));
   24122          delta += alen;
   24123          goto decode_success;
   24124       }
   24125       /* VMOVSS xmm3, xmm2, xmm1 = VEX.LIG.F3.0F.WIG 10 /r */
   24126       /* Reg form. */
   24127       if (haveF3no66noF2(pfx) && epartIsReg(getUChar(delta))) {
   24128          UChar modrm = getUChar(delta);
   24129          UInt  rG    = gregOfRexRM(pfx, modrm);
   24130          UInt  rE    = eregOfRexRM(pfx, modrm);
   24131          UInt  rV    = getVexNvvvv(pfx);
   24132          delta++;
   24133          DIP("vmovss %s,%s,%s\n",
   24134              nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   24135          IRTemp res = newTemp(Ity_V128);
   24136          assign( res, binop( Iop_64HLtoV128,
   24137                              getXMMRegLane64(rV, 1),
   24138                              binop(Iop_32HLto64,
   24139                                    getXMMRegLane32(rV, 1),
   24140                                    getXMMRegLane32(rE, 0)) ) );
   24141          putYMMRegLoAndZU(rG, mkexpr(res));
   24142          *uses_vvvv = True;
   24143          goto decode_success;
   24144       }
   24145       /* VMOVUPD xmm2/m128, xmm1 = VEX.128.66.0F.WIG 10 /r */
   24146       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24147          UChar modrm = getUChar(delta);
   24148          UInt  rG    = gregOfRexRM(pfx, modrm);
   24149          if (epartIsReg(modrm)) {
   24150             UInt rE = eregOfRexRM(pfx,modrm);
   24151             putYMMRegLoAndZU( rG, getXMMReg( rE ));
   24152             DIP("vmovupd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   24153             delta += 1;
   24154          } else {
   24155             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24156             putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
   24157             DIP("vmovupd %s,%s\n", dis_buf, nameXMMReg(rG));
   24158             delta += alen;
   24159          }
   24160          goto decode_success;
   24161       }
   24162       /* VMOVUPD ymm2/m256, ymm1 = VEX.256.66.0F.WIG 10 /r */
   24163       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24164          UChar modrm = getUChar(delta);
   24165          UInt  rG    = gregOfRexRM(pfx, modrm);
   24166          if (epartIsReg(modrm)) {
   24167             UInt rE = eregOfRexRM(pfx,modrm);
   24168             putYMMReg( rG, getYMMReg( rE ));
   24169             DIP("vmovupd %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
   24170             delta += 1;
   24171          } else {
   24172             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24173             putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
   24174             DIP("vmovupd %s,%s\n", dis_buf, nameYMMReg(rG));
   24175             delta += alen;
   24176          }
   24177          goto decode_success;
   24178       }
   24179       /* VMOVUPS xmm2/m128, xmm1 = VEX.128.0F.WIG 10 /r */
   24180       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24181          UChar modrm = getUChar(delta);
   24182          UInt  rG    = gregOfRexRM(pfx, modrm);
   24183          if (epartIsReg(modrm)) {
   24184             UInt rE = eregOfRexRM(pfx,modrm);
   24185             putYMMRegLoAndZU( rG, getXMMReg( rE ));
   24186             DIP("vmovups %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   24187             delta += 1;
   24188          } else {
   24189             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24190             putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
   24191             DIP("vmovups %s,%s\n", dis_buf, nameXMMReg(rG));
   24192             delta += alen;
   24193          }
   24194          goto decode_success;
   24195       }
   24196       /* VMOVUPS ymm2/m256, ymm1 = VEX.256.0F.WIG 10 /r */
   24197       if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24198          UChar modrm = getUChar(delta);
   24199          UInt  rG    = gregOfRexRM(pfx, modrm);
   24200          if (epartIsReg(modrm)) {
   24201             UInt rE = eregOfRexRM(pfx,modrm);
   24202             putYMMReg( rG, getYMMReg( rE ));
   24203             DIP("vmovups %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
   24204             delta += 1;
   24205          } else {
   24206             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24207             putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
   24208             DIP("vmovups %s,%s\n", dis_buf, nameYMMReg(rG));
   24209             delta += alen;
   24210          }
   24211          goto decode_success;
   24212       }
   24213       break;
   24214 
   24215    case 0x11:
   24216       /* VMOVSD xmm1, m64 = VEX.LIG.F2.0F.WIG 11 /r */
   24217       /* Move 64 bits from G (low half xmm) to mem only. */
   24218       if (haveF2no66noF3(pfx) && !epartIsReg(getUChar(delta))) {
   24219          UChar modrm = getUChar(delta);
   24220          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24221          UInt   rG   = gregOfRexRM(pfx,modrm);
   24222          /* FIXME: ALIGNMENT CHECK? */
   24223          storeLE( mkexpr(addr), getXMMRegLane64(rG, 0));
   24224          DIP("vmovsd %s,%s\n", nameXMMReg(rG), dis_buf);
   24225          delta += alen;
   24226          goto decode_success;
   24227       }
   24228       /* VMOVSD xmm3, xmm2, xmm1 = VEX.LIG.F2.0F.WIG 11 /r */
   24229       /* Reg form. */
   24230       if (haveF2no66noF3(pfx) && epartIsReg(getUChar(delta))) {
   24231          UChar modrm = getUChar(delta);
   24232          UInt  rG    = gregOfRexRM(pfx, modrm);
   24233          UInt  rE    = eregOfRexRM(pfx, modrm);
   24234          UInt  rV    = getVexNvvvv(pfx);
   24235          delta++;
   24236          DIP("vmovsd %s,%s,%s\n",
   24237              nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   24238          IRTemp res = newTemp(Ity_V128);
   24239          assign(res, binop(Iop_64HLtoV128,
   24240                            getXMMRegLane64(rV, 1),
   24241                            getXMMRegLane64(rE, 0)));
   24242          putYMMRegLoAndZU(rG, mkexpr(res));
   24243          *uses_vvvv = True;
   24244          goto decode_success;
   24245       }
      /* VMOVSS xmm1, m32 = VEX.LIG.F3.0F.WIG 11 /r */
   24247       /* Move 32 bits from G (low 1/4 xmm) to mem only. */
   24248       if (haveF3no66noF2(pfx) && !epartIsReg(getUChar(delta))) {
   24249          UChar modrm = getUChar(delta);
   24250          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24251          UInt   rG   = gregOfRexRM(pfx,modrm);
   24252          /* FIXME: ALIGNMENT CHECK? */
   24253          storeLE( mkexpr(addr), getXMMRegLane32(rG, 0));
   24254          DIP("vmovss %s,%s\n", nameXMMReg(rG), dis_buf);
   24255          delta += alen;
   24256          goto decode_success;
   24257       }
   24258       /* VMOVSS xmm3, xmm2, xmm1 = VEX.LIG.F3.0F.WIG 11 /r */
   24259       /* Reg form. */
   24260       if (haveF3no66noF2(pfx) && epartIsReg(getUChar(delta))) {
   24261          UChar modrm = getUChar(delta);
   24262          UInt  rG    = gregOfRexRM(pfx, modrm);
   24263          UInt  rE    = eregOfRexRM(pfx, modrm);
   24264          UInt  rV    = getVexNvvvv(pfx);
   24265          delta++;
   24266          DIP("vmovss %s,%s,%s\n",
   24267              nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   24268          IRTemp res = newTemp(Ity_V128);
   24269          assign( res, binop( Iop_64HLtoV128,
   24270                              getXMMRegLane64(rV, 1),
   24271                              binop(Iop_32HLto64,
   24272                                    getXMMRegLane32(rV, 1),
   24273                                    getXMMRegLane32(rE, 0)) ) );
   24274          putYMMRegLoAndZU(rG, mkexpr(res));
   24275          *uses_vvvv = True;
   24276          goto decode_success;
   24277       }
   24278       /* VMOVUPD xmm1, xmm2/m128 = VEX.128.66.0F.WIG 11 /r */
   24279       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24280          UChar modrm = getUChar(delta);
   24281          UInt  rG    = gregOfRexRM(pfx,modrm);
   24282          if (epartIsReg(modrm)) {
   24283             UInt rE = eregOfRexRM(pfx,modrm);
   24284             putYMMRegLoAndZU( rE, getXMMReg(rG) );
   24285             DIP("vmovupd %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
   24286             delta += 1;
   24287          } else {
   24288             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24289             storeLE( mkexpr(addr), getXMMReg(rG) );
   24290             DIP("vmovupd %s,%s\n", nameXMMReg(rG), dis_buf);
   24291             delta += alen;
   24292          }
   24293          goto decode_success;
   24294       }
   24295       /* VMOVUPD ymm1, ymm2/m256 = VEX.256.66.0F.WIG 11 /r */
   24296       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24297          UChar modrm = getUChar(delta);
   24298          UInt  rG    = gregOfRexRM(pfx,modrm);
   24299          if (epartIsReg(modrm)) {
   24300             UInt rE = eregOfRexRM(pfx,modrm);
   24301             putYMMReg( rE, getYMMReg(rG) );
   24302             DIP("vmovupd %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
   24303             delta += 1;
   24304          } else {
   24305             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24306             storeLE( mkexpr(addr), getYMMReg(rG) );
   24307             DIP("vmovupd %s,%s\n", nameYMMReg(rG), dis_buf);
   24308             delta += alen;
   24309          }
   24310          goto decode_success;
   24311       }
   24312       /* VMOVUPS xmm1, xmm2/m128 = VEX.128.0F.WIG 11 /r */
   24313       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24314          UChar modrm = getUChar(delta);
   24315          UInt  rG    = gregOfRexRM(pfx,modrm);
   24316          if (epartIsReg(modrm)) {
   24317             UInt rE = eregOfRexRM(pfx,modrm);
   24318             putYMMRegLoAndZU( rE, getXMMReg(rG) );
   24319             DIP("vmovups %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
   24320             delta += 1;
   24321          } else {
   24322             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24323             storeLE( mkexpr(addr), getXMMReg(rG) );
   24324             DIP("vmovups %s,%s\n", nameXMMReg(rG), dis_buf);
   24325             delta += alen;
   24326          }
   24327          goto decode_success;
   24328       }
   24329       /* VMOVUPS ymm1, ymm2/m256 = VEX.256.0F.WIG 11 /r */
   24330       if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24331          UChar modrm = getUChar(delta);
   24332          UInt  rG    = gregOfRexRM(pfx,modrm);
   24333          if (epartIsReg(modrm)) {
   24334             UInt rE = eregOfRexRM(pfx,modrm);
   24335             putYMMReg( rE, getYMMReg(rG) );
   24336             DIP("vmovups %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
   24337             delta += 1;
   24338          } else {
   24339             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24340             storeLE( mkexpr(addr), getYMMReg(rG) );
   24341             DIP("vmovups %s,%s\n", nameYMMReg(rG), dis_buf);
   24342             delta += alen;
   24343          }
   24344          goto decode_success;
   24345       }
   24346       break;
   24347 
   24348    case 0x12:
   24349       /* VMOVDDUP xmm2/m64, xmm1 = VEX.128.F2.0F.WIG /12 r */
   24350       if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24351          delta = dis_MOVDDUP_128( vbi, pfx, delta, True/*isAvx*/ );
   24352          goto decode_success;
   24353       }
   24354       /* VMOVDDUP ymm2/m256, ymm1 = VEX.256.F2.0F.WIG /12 r */
   24355       if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24356          delta = dis_MOVDDUP_256( vbi, pfx, delta );
   24357          goto decode_success;
   24358       }
   24359       /* VMOVHLPS xmm3, xmm2, xmm1 = VEX.NDS.128.0F.WIG 12 /r */
   24360       /* Insn only exists in reg form */
   24361       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
   24362           && epartIsReg(getUChar(delta))) {
   24363          UChar modrm = getUChar(delta);
   24364          UInt  rG    = gregOfRexRM(pfx, modrm);
   24365          UInt  rE    = eregOfRexRM(pfx, modrm);
   24366          UInt  rV    = getVexNvvvv(pfx);
   24367          delta++;
   24368          DIP("vmovhlps %s,%s,%s\n",
   24369              nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   24370          IRTemp res = newTemp(Ity_V128);
   24371          assign(res, binop(Iop_64HLtoV128,
   24372                            getXMMRegLane64(rV, 1),
   24373                            getXMMRegLane64(rE, 1)));
   24374          putYMMRegLoAndZU(rG, mkexpr(res));
   24375          *uses_vvvv = True;
   24376          goto decode_success;
   24377       }
   24378       /* VMOVLPS m64, xmm1, xmm2 = VEX.NDS.128.0F.WIG 12 /r */
   24379       /* Insn exists only in mem form, it appears. */
   24380       /* VMOVLPD m64, xmm1, xmm2 = VEX.NDS.128.66.0F.WIG 12 /r */
   24381       /* Insn exists only in mem form, it appears. */
   24382       if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
   24383           && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
   24384          UChar modrm = getUChar(delta);
   24385          UInt  rG    = gregOfRexRM(pfx, modrm);
   24386          UInt  rV    = getVexNvvvv(pfx);
   24387          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24388          delta += alen;
   24389          DIP("vmovlpd %s,%s,%s\n",
   24390              dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   24391          IRTemp res = newTemp(Ity_V128);
   24392          assign(res, binop(Iop_64HLtoV128,
   24393                            getXMMRegLane64(rV, 1),
   24394                            loadLE(Ity_I64, mkexpr(addr))));
   24395          putYMMRegLoAndZU(rG, mkexpr(res));
   24396          *uses_vvvv = True;
   24397          goto decode_success;
   24398       }
   24399       /* VMOVSLDUP xmm2/m128, xmm1 = VEX.NDS.128.F3.0F.WIG 12 /r */
   24400       if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
   24401          delta = dis_MOVSxDUP_128( vbi, pfx, delta, True/*isAvx*/,
   24402                                    True/*isL*/ );
   24403          goto decode_success;
   24404       }
   24405       /* VMOVSLDUP ymm2/m256, ymm1 = VEX.NDS.256.F3.0F.WIG 12 /r */
   24406       if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
   24407          delta = dis_MOVSxDUP_256( vbi, pfx, delta, True/*isL*/ );
   24408          goto decode_success;
   24409       }
   24410       break;
   24411 
   24412    case 0x13:
   24413       /* VMOVLPS xmm1, m64 = VEX.128.0F.WIG 13 /r */
   24414       /* Insn exists only in mem form, it appears. */
   24415       /* VMOVLPD xmm1, m64 = VEX.128.66.0F.WIG 13 /r */
   24416       /* Insn exists only in mem form, it appears. */
   24417       if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
   24418           && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
   24419          UChar modrm = getUChar(delta);
   24420          UInt  rG    = gregOfRexRM(pfx, modrm);
   24421          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24422          delta += alen;
   24423          storeLE( mkexpr(addr), getXMMRegLane64( rG, 0));
   24424          DIP("vmovlpd %s,%s\n", nameXMMReg(rG), dis_buf);
   24425          goto decode_success;
   24426       }
   24427       break;
   24428 
   case 0x14:
   case 0x15:
      /* Packed-FP interleave (unpack).  All four variants share a
         shape: fetch E (reg or mem), fetch V from vvvv, combine via a
         math_UNPCKxP{S,D}_{128,256} helper, write the result to G. */
      /* VUNPCKLPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 14 /r */
      /* VUNPCKHPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 15 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Bool   hi    = opc == 0x15;   /* 0x15 = "high" unpack */
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx,modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp eV    = newTemp(Ity_V128);
         IRTemp vV    = newTemp(Ity_V128);
         assign( vV, getXMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getXMMReg(rE) );
            delta += 1;
            DIP("vunpck%sps %s,%s\n", hi ? "h" : "l",
                nameXMMReg(rE), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
            delta += alen;
            DIP("vunpck%sps %s,%s\n", hi ? "h" : "l",
                dis_buf, nameXMMReg(rG));
         }
         IRTemp res = math_UNPCKxPS_128( eV, vV, hi );
         /* 128-bit result; the upper half of the YMM dest is zeroed. */
         putYMMRegLoAndZU( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VUNPCKLPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 14 /r */
      /* VUNPCKHPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 15 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         Bool   hi    = opc == 0x15;
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx,modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp eV    = newTemp(Ity_V256);
         IRTemp vV    = newTemp(Ity_V256);
         assign( vV, getYMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getYMMReg(rE) );
            delta += 1;
            DIP("vunpck%sps %s,%s\n", hi ? "h" : "l",
                nameYMMReg(rE), nameYMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
            delta += alen;
            DIP("vunpck%sps %s,%s\n", hi ? "h" : "l",
                dis_buf, nameYMMReg(rG));
         }
         IRTemp res = math_UNPCKxPS_256( eV, vV, hi );
         putYMMReg( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VUNPCKLPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 14 /r */
      /* VUNPCKHPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 15 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Bool   hi    = opc == 0x15;
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx,modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp eV    = newTemp(Ity_V128);
         IRTemp vV    = newTemp(Ity_V128);
         assign( vV, getXMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getXMMReg(rE) );
            delta += 1;
            DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
                nameXMMReg(rE), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
            delta += alen;
            DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
                dis_buf, nameXMMReg(rG));
         }
         IRTemp res = math_UNPCKxPD_128( eV, vV, hi );
         putYMMRegLoAndZU( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VUNPCKLPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 14 /r */
      /* VUNPCKHPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 15 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         Bool   hi    = opc == 0x15;
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx,modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp eV    = newTemp(Ity_V256);
         IRTemp vV    = newTemp(Ity_V256);
         assign( vV, getYMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getYMMReg(rE) );
            delta += 1;
            DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
                nameYMMReg(rE), nameYMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
            delta += alen;
            DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
                dis_buf, nameYMMReg(rG));
         }
         IRTemp res = math_UNPCKxPD_256( eV, vV, hi );
         putYMMReg( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   24544 
   case 0x16:
      /* VMOVLHPS xmm3, xmm2, xmm1 = VEX.NDS.128.0F.WIG 16 /r */
      /* Insn only exists in reg form */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         UInt  rE    = eregOfRexRM(pfx, modrm);
         UInt  rV    = getVexNvvvv(pfx);
         delta++;
         DIP("vmovlhps %s,%s,%s\n",
             nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
         /* Result: hi64 = lo64 of rE, lo64 = lo64 of rV; upper 128 bits
            of the destination YMM are zeroed. */
         IRTemp res = newTemp(Ity_V128);
         assign(res, binop(Iop_64HLtoV128,
                           getXMMRegLane64(rE, 0),
                           getXMMRegLane64(rV, 0)));
         putYMMRegLoAndZU(rG, mkexpr(res));
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VMOVHPS m64, xmm1, xmm2 = VEX.NDS.128.0F.WIG 16 /r */
      /* Insn exists only in mem form, it appears. */
      /* VMOVHPD m64, xmm1, xmm2 = VEX.NDS.128.66.0F.WIG 16 /r */
      /* Insn exists only in mem form, it appears. */
      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
          && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         UInt  rV    = getVexNvvvv(pfx);
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         /* 'd' for the 66-prefixed (PD) form, else 's'. */
         DIP("vmovhp%c %s,%s,%s\n", have66(pfx) ? 'd' : 's',
             dis_buf, nameXMMReg(rV), nameXMMReg(rG));
         /* Result: hi64 = loaded m64, lo64 = lo64 of rV. */
         IRTemp res = newTemp(Ity_V128);
         assign(res, binop(Iop_64HLtoV128,
                           loadLE(Ity_I64, mkexpr(addr)),
                           getXMMRegLane64(rV, 0)));
         putYMMRegLoAndZU(rG, mkexpr(res));
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VMOVSHDUP xmm2/m128, xmm1 = VEX.NDS.128.F3.0F.WIG 16 /r */
      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_MOVSxDUP_128( vbi, pfx, delta, True/*isAvx*/,
                                   False/*!isL*/ );
         goto decode_success;
      }
      /* VMOVSHDUP ymm2/m256, ymm1 = VEX.NDS.256.F3.0F.WIG 16 /r */
      if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_MOVSxDUP_256( vbi, pfx, delta, False/*!isL*/ );
         goto decode_success;
      }
      break;
   24598 
   case 0x17:
      /* VMOVHPS xmm1, m64 = VEX.128.0F.WIG 17 /r */
      /* Insn exists only in mem form, it appears. */
      /* VMOVHPD xmm1, m64 = VEX.128.66.0F.WIG 17 /r */
      /* Insn exists only in mem form, it appears. */
      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
          && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         /* Store the high 64 bits of the source XMM register. */
         storeLE( mkexpr(addr), getXMMRegLane64( rG, 1));
         DIP("vmovhp%c %s,%s\n", have66(pfx) ? 'd' : 's',
             nameXMMReg(rG), dis_buf);
         goto decode_success;
      }
      break;
   24616 
   case 0x28:
      /* Aligned packed-FP loads/moves into a register.  The memory
         forms enforce 16- or 32-byte alignment with a #GP-style SEGV,
         as the hardware does for MOVAPS/MOVAPD. */
      /* VMOVAPD xmm2/m128, xmm1 = VEX.128.66.0F.WIG 28 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            putYMMRegLoAndZU( rG, getXMMReg( rE ));
            DIP("vmovapd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
            DIP("vmovapd %s,%s\n", dis_buf, nameXMMReg(rG));
            delta += alen;
         }
         goto decode_success;
      }
      /* VMOVAPD ymm2/m256, ymm1 = VEX.256.66.0F.WIG 28 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            putYMMReg( rG, getYMMReg( rE ));
            DIP("vmovapd %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_32_aligned( addr );
            putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
            DIP("vmovapd %s,%s\n", dis_buf, nameYMMReg(rG));
            delta += alen;
         }
         goto decode_success;
      }
      /* VMOVAPS xmm2/m128, xmm1 = VEX.128.0F.WIG 28 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            putYMMRegLoAndZU( rG, getXMMReg( rE ));
            DIP("vmovaps %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
            DIP("vmovaps %s,%s\n", dis_buf, nameXMMReg(rG));
            delta += alen;
         }
         goto decode_success;
      }
      /* VMOVAPS ymm2/m256, ymm1 = VEX.256.0F.WIG 28 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            putYMMReg( rG, getYMMReg( rE ));
            DIP("vmovaps %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_32_aligned( addr );
            putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
            DIP("vmovaps %s,%s\n", dis_buf, nameYMMReg(rG));
            delta += alen;
         }
         goto decode_success;
      }
      break;
   24691 
   24692    case 0x29:
   24693       /* VMOVAPD xmm1, xmm2/m128 = VEX.128.66.0F.WIG 29 /r */
   24694       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24695          UChar modrm = getUChar(delta);
   24696          UInt  rG    = gregOfRexRM(pfx,modrm);
   24697          if (epartIsReg(modrm)) {
   24698             UInt rE = eregOfRexRM(pfx,modrm);
   24699             putYMMRegLoAndZU( rE, getXMMReg(rG) );
   24700             DIP("vmovapd %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
   24701             delta += 1;
   24702          } else {
   24703             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24704             gen_SEGV_if_not_16_aligned( addr );
   24705             storeLE( mkexpr(addr), getXMMReg(rG) );
   24706             DIP("vmovapd %s,%s\n", nameXMMReg(rG), dis_buf );
   24707             delta += alen;
   24708          }
   24709          goto decode_success;
   24710       }
   24711       /* VMOVAPD ymm1, ymm2/m256 = VEX.256.66.0F.WIG 29 /r */
   24712       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24713          UChar modrm = getUChar(delta);
   24714          UInt  rG    = gregOfRexRM(pfx,modrm);
   24715          if (epartIsReg(modrm)) {
   24716             UInt rE = eregOfRexRM(pfx,modrm);
   24717             putYMMReg( rE, getYMMReg(rG) );
   24718             DIP("vmovapd %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
   24719             delta += 1;
   24720          } else {
   24721             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24722             gen_SEGV_if_not_32_aligned( addr );
   24723             storeLE( mkexpr(addr), getYMMReg(rG) );
   24724             DIP("vmovapd %s,%s\n", nameYMMReg(rG), dis_buf );
   24725             delta += alen;
   24726          }
   24727          goto decode_success;
   24728       }
   24729       /* VMOVAPS xmm1, xmm2/m128 = VEX.128.0F.WIG 29 /r */
   24730       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24731          UChar modrm = getUChar(delta);
   24732          UInt  rG    = gregOfRexRM(pfx,modrm);
   24733          if (epartIsReg(modrm)) {
   24734             UInt rE = eregOfRexRM(pfx,modrm);
   24735             putYMMRegLoAndZU( rE, getXMMReg(rG) );
   24736             DIP("vmovaps %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
   24737             delta += 1;
   24738             goto decode_success;
   24739          } else {
   24740             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24741             gen_SEGV_if_not_16_aligned( addr );
   24742             storeLE( mkexpr(addr), getXMMReg(rG) );
   24743             DIP("vmovaps %s,%s\n", nameXMMReg(rG), dis_buf );
   24744             delta += alen;
   24745             goto decode_success;
   24746          }
   24747       }
   24748       /* VMOVAPS ymm1, ymm2/m256 = VEX.256.0F.WIG 29 /r */
   24749       if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24750          UChar modrm = getUChar(delta);
   24751          UInt  rG    = gregOfRexRM(pfx,modrm);
   24752          if (epartIsReg(modrm)) {
   24753             UInt rE = eregOfRexRM(pfx,modrm);
   24754             putYMMReg( rE, getYMMReg(rG) );
   24755             DIP("vmovaps %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
   24756             delta += 1;
   24757             goto decode_success;
   24758          } else {
   24759             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24760             gen_SEGV_if_not_32_aligned( addr );
   24761             storeLE( mkexpr(addr), getYMMReg(rG) );
   24762             DIP("vmovaps %s,%s\n", nameYMMReg(rG), dis_buf );
   24763             delta += alen;
   24764             goto decode_success;
   24765          }
   24766       }
   24767       break;
   24768 
   24769    case 0x2A: {
   24770       IRTemp rmode = newTemp(Ity_I32);
   24771       assign( rmode, get_sse_roundingmode() );
   24772       /* VCVTSI2SD r/m32, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.W0 2A /r */
   24773       if (haveF2no66noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
   24774          UChar  modrm = getUChar(delta);
   24775          UInt   rV    = getVexNvvvv(pfx);
   24776          UInt   rD    = gregOfRexRM(pfx, modrm);
   24777          IRTemp arg32 = newTemp(Ity_I32);
   24778          if (epartIsReg(modrm)) {
   24779             UInt rS = eregOfRexRM(pfx,modrm);
   24780             assign( arg32, getIReg32(rS) );
   24781             delta += 1;
   24782             DIP("vcvtsi2sdl %s,%s,%s\n",
   24783                 nameIReg32(rS), nameXMMReg(rV), nameXMMReg(rD));
   24784          } else {
   24785             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24786             assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
   24787             delta += alen;
   24788             DIP("vcvtsi2sdl %s,%s,%s\n",
   24789                 dis_buf, nameXMMReg(rV), nameXMMReg(rD));
   24790          }
   24791          putXMMRegLane64F( rD, 0,
   24792                            unop(Iop_I32StoF64, mkexpr(arg32)));
   24793          putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
   24794          putYMMRegLane128( rD, 1, mkV128(0) );
   24795          *uses_vvvv = True;
   24796          goto decode_success;
   24797       }
   24798       /* VCVTSI2SD r/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.W1 2A /r */
   24799       if (haveF2no66noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
   24800          UChar  modrm = getUChar(delta);
   24801          UInt   rV    = getVexNvvvv(pfx);
   24802          UInt   rD    = gregOfRexRM(pfx, modrm);
   24803          IRTemp arg64 = newTemp(Ity_I64);
   24804          if (epartIsReg(modrm)) {
   24805             UInt rS = eregOfRexRM(pfx,modrm);
   24806             assign( arg64, getIReg64(rS) );
   24807             delta += 1;
   24808             DIP("vcvtsi2sdq %s,%s,%s\n",
   24809                 nameIReg64(rS), nameXMMReg(rV), nameXMMReg(rD));
   24810          } else {
   24811             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24812             assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   24813             delta += alen;
   24814             DIP("vcvtsi2sdq %s,%s,%s\n",
   24815                 dis_buf, nameXMMReg(rV), nameXMMReg(rD));
   24816          }
   24817          putXMMRegLane64F( rD, 0,
   24818                            binop( Iop_I64StoF64,
   24819                                   get_sse_roundingmode(),
   24820                                   mkexpr(arg64)) );
   24821          putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
   24822          putYMMRegLane128( rD, 1, mkV128(0) );
   24823          *uses_vvvv = True;
   24824          goto decode_success;
   24825       }
   24826       /* VCVTSI2SS r/m64, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.W1 2A /r */
   24827       if (haveF3no66noF2(pfx) && 1==getRexW(pfx)/*W1*/) {
   24828          UChar  modrm = getUChar(delta);
   24829          UInt   rV    = getVexNvvvv(pfx);
   24830          UInt   rD    = gregOfRexRM(pfx, modrm);
   24831          IRTemp arg64 = newTemp(Ity_I64);
   24832          if (epartIsReg(modrm)) {
   24833             UInt rS = eregOfRexRM(pfx,modrm);
   24834             assign( arg64, getIReg64(rS) );
   24835             delta += 1;
   24836             DIP("vcvtsi2ssq %s,%s,%s\n",
   24837                 nameIReg64(rS), nameXMMReg(rV), nameXMMReg(rD));
   24838          } else {
   24839             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24840             assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   24841             delta += alen;
   24842             DIP("vcvtsi2ssq %s,%s,%s\n",
   24843                 dis_buf, nameXMMReg(rV), nameXMMReg(rD));
   24844          }
   24845          putXMMRegLane32F( rD, 0,
   24846                            binop(Iop_F64toF32,
   24847                                  mkexpr(rmode),
   24848                                  binop(Iop_I64StoF64, mkexpr(rmode),
   24849                                                       mkexpr(arg64)) ) );
   24850          putXMMRegLane32( rD, 1, getXMMRegLane32( rV, 1 ));
   24851          putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
   24852          putYMMRegLane128( rD, 1, mkV128(0) );
   24853          *uses_vvvv = True;
   24854          goto decode_success;
   24855       }
   24856       /* VCVTSI2SS r/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.W0 2A /r */
   24857       if (haveF3no66noF2(pfx) && 0==getRexW(pfx)/*W0*/) {
   24858          UChar  modrm = getUChar(delta);
   24859          UInt   rV    = getVexNvvvv(pfx);
   24860          UInt   rD    = gregOfRexRM(pfx, modrm);
   24861          IRTemp arg32 = newTemp(Ity_I32);
   24862          if (epartIsReg(modrm)) {
   24863             UInt rS = eregOfRexRM(pfx,modrm);
   24864             assign( arg32, getIReg32(rS) );
   24865             delta += 1;
   24866             DIP("vcvtsi2ssl %s,%s,%s\n",
   24867                 nameIReg32(rS), nameXMMReg(rV), nameXMMReg(rD));
   24868          } else {
   24869             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24870             assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
   24871             delta += alen;
   24872             DIP("vcvtsi2ssl %s,%s,%s\n",
   24873                 dis_buf, nameXMMReg(rV), nameXMMReg(rD));
   24874          }
   24875          putXMMRegLane32F( rD, 0,
   24876                            binop(Iop_F64toF32,
   24877                                  mkexpr(rmode),
   24878                                  unop(Iop_I32StoF64, mkexpr(arg32)) ) );
   24879          putXMMRegLane32( rD, 1, getXMMRegLane32( rV, 1 ));
   24880          putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
   24881          putYMMRegLane128( rD, 1, mkV128(0) );
   24882          *uses_vvvv = True;
   24883          goto decode_success;
   24884       }
   24885       break;
   24886    }
   24887 
   case 0x2B:
      /* Non-temporal aligned stores.  The non-temporal (cache-bypass)
         hint is not modelled; the store and the alignment check are. */
      /* VMOVNTPD xmm1, m128 = VEX.128.66.0F.WIG 2B /r */
      /* VMOVNTPS xmm1, m128 = VEX.128.0F.WIG 2B /r */
      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
          && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
         UChar  modrm = getUChar(delta);
         UInt   rS    = gregOfRexRM(pfx, modrm);
         IRTemp tS    = newTemp(Ity_V128);
         assign(tS, getXMMReg(rS));
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         gen_SEGV_if_not_16_aligned(addr);
         storeLE(mkexpr(addr), mkexpr(tS));
         DIP("vmovntp%c %s,%s\n", have66(pfx) ? 'd' : 's',
             nameXMMReg(rS), dis_buf);
         goto decode_success;
      }
      /* VMOVNTPD ymm1, m256 = VEX.256.66.0F.WIG 2B /r */
      /* VMOVNTPS ymm1, m256 = VEX.256.0F.WIG 2B /r */
      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
          && 1==getVexL(pfx)/*256*/ && !epartIsReg(getUChar(delta))) {
         UChar  modrm = getUChar(delta);
         UInt   rS    = gregOfRexRM(pfx, modrm);
         IRTemp tS    = newTemp(Ity_V256);
         assign(tS, getYMMReg(rS));
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         gen_SEGV_if_not_32_aligned(addr);
         storeLE(mkexpr(addr), mkexpr(tS));
         DIP("vmovntp%c %s,%s\n", have66(pfx) ? 'd' : 's',
             nameYMMReg(rS), dis_buf);
         goto decode_success;
      }
      break;
   24922 
   case 0x2C:
      /* Truncating scalar FP -> integer conversions.  The final
         argument is the integer destination size (4 or 8 bytes),
         selected by VEX.W.  Per the Intel SDM, the memory operand is
         always m64 for the SD forms and m32 for the SS forms,
         irrespective of W. */
      /* VCVTTSD2SI xmm1/m64, r32 = VEX.LIG.F2.0F.W0 2C /r */
      if (haveF2no66noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
         delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
         goto decode_success;
      }
      /* VCVTTSD2SI xmm1/m64, r64 = VEX.LIG.F2.0F.W1 2C /r */
      if (haveF2no66noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
         delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
         goto decode_success;
      }
      /* VCVTTSS2SI xmm1/m32, r32 = VEX.LIG.F3.0F.W0 2C /r */
      if (haveF3no66noF2(pfx) && 0==getRexW(pfx)/*W0*/) {
         delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
         goto decode_success;
      }
      /* VCVTTSS2SI xmm1/m32, r64 = VEX.LIG.F3.0F.W1 2C /r */
      if (haveF3no66noF2(pfx) && 1==getRexW(pfx)/*W1*/) {
         delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
         goto decode_success;
      }
      break;
   24945 
   case 0x2D:
      /* Rounding (non-truncating) float->int conversions; same helper
         as 0x2C, which dispatches on opc.  As for 0x2C, the 4/8 is
         the destination GPR width and the source is m64 (SD) / m32
         (SS) per the Intel SDM. */
      /* VCVTSD2SI xmm1/m64, r32 = VEX.LIG.F2.0F.W0 2D /r */
      if (haveF2no66noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
         delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
         goto decode_success;
      }
      /* VCVTSD2SI xmm1/m64, r64 = VEX.LIG.F2.0F.W1 2D /r */
      if (haveF2no66noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
         delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
         goto decode_success;
      }
      /* VCVTSS2SI xmm1/m32, r32 = VEX.LIG.F3.0F.W0 2D /r */
      if (haveF3no66noF2(pfx) && 0==getRexW(pfx)/*W0*/) {
         delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
         goto decode_success;
      }
      /* VCVTSS2SI xmm1/m32, r64 = VEX.LIG.F3.0F.W1 2D /r */
      if (haveF3no66noF2(pfx) && 1==getRexW(pfx)/*W1*/) {
         delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
         goto decode_success;
      }
      break;
   24968 
   case 0x2E:
   case 0x2F:
      /* 2E = unordered compare (VUCOMISx), 2F = ordered (VCOMISx).
         Both opcodes share one helper per element size; the helper
         receives opc to distinguish them. */
      /* VUCOMISD xmm2/m64, xmm1 = VEX.LIG.66.0F.WIG 2E /r */
      /* VCOMISD  xmm2/m64, xmm1 = VEX.LIG.66.0F.WIG 2F /r */
      if (have66noF2noF3(pfx)) {
         delta = dis_COMISD( vbi, pfx, delta, True/*isAvx*/, opc );
         goto decode_success;
      }
      /* VUCOMISS xmm2/m32, xmm1 = VEX.LIG.0F.WIG 2E /r */
      /* VCOMISS xmm2/m32, xmm1  = VEX.LIG.0F.WIG 2F /r */
      if (haveNo66noF2noF3(pfx)) {
         delta = dis_COMISS( vbi, pfx, delta, True/*isAvx*/, opc );
         goto decode_success;
      }
      break;
   24984 
   case 0x50:
      /* Move packed FP sign masks to a GPR.  The 66 prefix selects
         pd vs ps and VEX.L selects 128 vs 256 — four variants, four
         helpers. */
      /* VMOVMSKPD xmm2, r32 = VEX.128.66.0F.WIG 50 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_MOVMSKPD_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VMOVMSKPD ymm2, r32 = VEX.256.66.0F.WIG 50 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_MOVMSKPD_256( vbi, pfx, delta );
         goto decode_success;
      }
      /* VMOVMSKPS xmm2, r32 = VEX.128.0F.WIG 50 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_MOVMSKPS_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VMOVMSKPS ymm2, r32 = VEX.256.0F.WIG 50 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_MOVMSKPS_256( vbi, pfx, delta );
         goto decode_success;
      }
      break;
   25007 
   case 0x51:
      /* Square root.  Scalar forms (F2/F3, any VEX.L) are checked
         before the packed forms. */
      /* VSQRTSS xmm3/m32(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F3.0F.WIG 51 /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32_unary(
                    uses_vvvv, vbi, pfx, delta, "vsqrtss", Iop_Sqrt32F0x4 );
         goto decode_success;
      }
      /* VSQRTPS xmm2/m128(E), xmm1(G) = VEX.NDS.128.0F.WIG 51 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vsqrtps", Iop_Sqrt32Fx4 );
         goto decode_success;
      }
      /* VSQRTPS ymm2/m256(E), ymm1(G) = VEX.NDS.256.0F.WIG 51 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vsqrtps", Iop_Sqrt32Fx8 );
         goto decode_success;
      }
      /* VSQRTSD xmm3/m64(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F2.0F.WIG 51 /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64_unary(
                    uses_vvvv, vbi, pfx, delta, "vsqrtsd", Iop_Sqrt64F0x2 );
         goto decode_success;
      }
      /* VSQRTPD xmm2/m128(E), xmm1(G) = VEX.NDS.128.66.0F.WIG 51 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vsqrtpd", Iop_Sqrt64Fx2 );
         goto decode_success;
      }
      /* VSQRTPD ymm2/m256(E), ymm1(G) = VEX.NDS.256.66.0F.WIG 51 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vsqrtpd", Iop_Sqrt64Fx4 );
         goto decode_success;
      }
      break;
   25046 
   case 0x52:
      /* Reciprocal square root estimate (single precision only). */
      /* VRSQRTSS xmm3/m32(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F3.0F.WIG 52 /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32_unary(
                    uses_vvvv, vbi, pfx, delta, "vrsqrtss",
                    Iop_RSqrtEst32F0x4 );
         goto decode_success;
      }
      /* VRSQRTPS xmm2/m128(E), xmm1(G) = VEX.NDS.128.0F.WIG 52 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vrsqrtps", Iop_RSqrtEst32Fx4 );
         goto decode_success;
      }
      /* VRSQRTPS ymm2/m256(E), ymm1(G) = VEX.NDS.256.0F.WIG 52 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vrsqrtps", Iop_RSqrtEst32Fx8 );
         goto decode_success;
      }
      break;
   25068 
   case 0x53:
      /* Reciprocal estimate (single precision only). */
      /* VRCPSS xmm3/m32(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F3.0F.WIG 53 /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32_unary(
                    uses_vvvv, vbi, pfx, delta, "vrcpss", Iop_RecipEst32F0x4 );
         goto decode_success;
      }
      /* VRCPPS xmm2/m128(E), xmm1(G) = VEX.NDS.128.0F.WIG 53 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vrcpps", Iop_RecipEst32Fx4 );
         goto decode_success;
      }
      /* VRCPPS ymm2/m256(E), ymm1(G) = VEX.NDS.256.0F.WIG 53 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vrcpps", Iop_RecipEst32Fx8 );
         goto decode_success;
      }
      break;
   25089 
   case 0x54:
      /* Bitwise AND.  The pd and ps forms are operationally identical
         here — both map onto the same whole-vector IROp per width. */
      /* VANDPD r/m, rV, r ::: r = rV & r/m */
      /* VANDPD = VEX.NDS.128.66.0F.WIG 54 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV128 );
         goto decode_success;
      }
      /* VANDPD r/m, rV, r ::: r = rV & r/m */
      /* VANDPD = VEX.NDS.256.66.0F.WIG 54 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV256 );
         goto decode_success;
      }
      /* VANDPS = VEX.NDS.128.0F.WIG 54 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV128 );
         goto decode_success;
      }
      /* VANDPS = VEX.NDS.256.0F.WIG 54 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV256 );
         goto decode_success;
      }
      break;
   25118 
   25119    case 0x55:
   25120       /* VANDNPD r/m, rV, r ::: r = (not rV) & r/m */
   25121       /* VANDNPD = VEX.NDS.128.66.0F.WIG 55 /r */
   25122       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25123          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   25124                     uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV128,
   25125                     NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
   25126          goto decode_success;
   25127       }
   25128       /* VANDNPD = VEX.NDS.256.66.0F.WIG 55 /r */
   25129       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25130          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
   25131                     uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV256,
   25132                     NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
   25133          goto decode_success;
   25134       }
   25135       /* VANDNPS = VEX.NDS.128.0F.WIG 55 /r */
   25136       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25137          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   25138                     uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV128,
   25139                     NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
   25140          goto decode_success;
   25141       }
   25142       /* VANDNPS = VEX.NDS.256.0F.WIG 55 /r */
   25143       if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25144          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
   25145                     uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV256,
   25146                     NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
   25147          goto decode_success;
   25148       }
   25149       break;
   25150 
   case 0x56:
      /* Bitwise OR; like 0x54, pd and ps share one IROp per width. */
      /* VORPD r/m, rV, r ::: r = rV | r/m */
      /* VORPD = VEX.NDS.128.66.0F.WIG 56 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vorpd", Iop_OrV128 );
         goto decode_success;
      }
      /* VORPD r/m, rV, r ::: r = rV | r/m */
      /* VORPD = VEX.NDS.256.66.0F.WIG 56 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vorpd", Iop_OrV256 );
         goto decode_success;
      }
      /* VORPS r/m, rV, r ::: r = rV | r/m */
      /* VORPS = VEX.NDS.128.0F.WIG 56 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vorps", Iop_OrV128 );
         goto decode_success;
      }
      /* VORPS r/m, rV, r ::: r = rV | r/m */
      /* VORPS = VEX.NDS.256.0F.WIG 56 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vorps", Iop_OrV256 );
         goto decode_success;
      }
      break;
   25181 
   case 0x57:
      /* Bitwise XOR; like 0x54/0x56, pd and ps share one IROp. */
      /* VXORPD r/m, rV, r ::: r = rV ^ r/m */
      /* VXORPD = VEX.NDS.128.66.0F.WIG 57 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vxorpd", Iop_XorV128 );
         goto decode_success;
      }
      /* VXORPD r/m, rV, r ::: r = rV ^ r/m */
      /* VXORPD = VEX.NDS.256.66.0F.WIG 57 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vxorpd", Iop_XorV256 );
         goto decode_success;
      }
      /* VXORPS r/m, rV, r ::: r = rV ^ r/m */
      /* VXORPS = VEX.NDS.128.0F.WIG 57 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vxorps", Iop_XorV128 );
         goto decode_success;
      }
      /* VXORPS r/m, rV, r ::: r = rV ^ r/m */
      /* VXORPS = VEX.NDS.256.0F.WIG 57 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vxorps", Iop_XorV256 );
         goto decode_success;
      }
      break;
   25212 
   case 0x58:
      /* FP add.  The prefix selects the variant: F2 = scalar double,
         F3 = scalar single, none = packed single, 66 = packed double;
         VEX.L splits the packed forms into 128/256. */
      /* VADDSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 58 /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vaddsd", Iop_Add64F0x2 );
         goto decode_success;
      }
      /* VADDSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 58 /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vaddss", Iop_Add32F0x4 );
         goto decode_success;
      }
      /* VADDPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 58 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vaddps", Iop_Add32Fx4 );
         goto decode_success;
      }
      /* VADDPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 58 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vaddps", Iop_Add32Fx8 );
         goto decode_success;
      }
      /* VADDPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 58 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vaddpd", Iop_Add64Fx2 );
         goto decode_success;
      }
      /* VADDPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 58 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vaddpd", Iop_Add64Fx4 );
         goto decode_success;
      }
      break;
   25251 
   case 0x59:
      /* FP multiply; same prefix/VEX.L decode pattern as 0x58. */
      /* VMULSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 59 /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vmulsd", Iop_Mul64F0x2 );
         goto decode_success;
      }
      /* VMULSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 59 /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vmulss", Iop_Mul32F0x4 );
         goto decode_success;
      }
      /* VMULPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 59 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmulps", Iop_Mul32Fx4 );
         goto decode_success;
      }
      /* VMULPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 59 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmulps", Iop_Mul32Fx8 );
         goto decode_success;
      }
      /* VMULPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 59 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmulpd", Iop_Mul64Fx2 );
         goto decode_success;
      }
      /* VMULPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 59 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmulpd", Iop_Mul64Fx4 );
         goto decode_success;
      }
      break;
   25290 
   case 0x5A:
      /* FP precision conversions: packed PS<->PD via helpers, and the
         scalar SD<->SS forms handled inline below. */
      /* VCVTPS2PD xmm2/m64, xmm1 = VEX.128.0F.WIG 5A /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTPS2PD_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VCVTPS2PD xmm2/m128, ymm1 = VEX.256.0F.WIG 5A /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTPS2PD_256( vbi, pfx, delta );
         goto decode_success;
      }
      /* VCVTPD2PS xmm2/m128, xmm1 = VEX.128.66.0F.WIG 5A /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTPD2PS_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VCVTPD2PS ymm2/m256, xmm1 = VEX.256.66.0F.WIG 5A /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTPD2PS_256( vbi, pfx, delta );
         goto decode_success;
      }
      /* VCVTSD2SS xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5A /r */
      if (haveF2no66noF3(pfx)) {
         UChar  modrm = getUChar(delta);
         UInt   rV    = getVexNvvvv(pfx);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp f64lo = newTemp(Ity_F64);
         IRTemp rmode = newTemp(Ity_I32);
         /* Narrowing F64->F32 can be inexact, so it needs the guest's
            current SSE rounding mode. */
         assign( rmode, get_sse_roundingmode() );
         if (epartIsReg(modrm)) {
            UInt rS = eregOfRexRM(pfx,modrm);
            assign(f64lo, getXMMRegLane64F(rS, 0));
            delta += 1;
            DIP("vcvtsd2ss %s,%s,%s\n",
                nameXMMReg(rS), nameXMMReg(rV), nameXMMReg(rD));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign(f64lo, loadLE(Ity_F64, mkexpr(addr)) );
            delta += alen;
            DIP("vcvtsd2ss %s,%s,%s\n",
                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
         }
         /* Result in lane 0; the remaining xmm lanes come from vV and
            the upper ymm half is zeroed (VEX scalar convention). */
         putXMMRegLane32F( rD, 0,
                           binop( Iop_F64toF32, mkexpr(rmode),
                                                mkexpr(f64lo)) );
         putXMMRegLane32( rD, 1, getXMMRegLane32( rV, 1 ));
         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
         putYMMRegLane128( rD, 1, mkV128(0) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VCVTSS2SD xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5A /r */
      if (haveF3no66noF2(pfx)) {
         UChar  modrm = getUChar(delta);
         UInt   rV    = getVexNvvvv(pfx);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp f32lo = newTemp(Ity_F32);
         if (epartIsReg(modrm)) {
            UInt rS = eregOfRexRM(pfx,modrm);
            assign(f32lo, getXMMRegLane32F(rS, 0));
            delta += 1;
            DIP("vcvtss2sd %s,%s,%s\n",
                nameXMMReg(rS), nameXMMReg(rV), nameXMMReg(rD));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign(f32lo, loadLE(Ity_F32, mkexpr(addr)) );
            delta += alen;
            DIP("vcvtss2sd %s,%s,%s\n",
                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
         }
         /* Widening F32->F64 is exact, hence no rounding mode. */
         putXMMRegLane64F( rD, 0,
                           unop( Iop_F32toF64, mkexpr(f32lo)) );
         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
         putYMMRegLane128( rD, 1, mkV128(0) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   25369 
   case 0x5B:
      /* Three conversions share opcode 5B, distinguished by prefix:
         66 = CVTPS2DQ (round), F3 = CVTTPS2DQ (truncate, r2zero=True),
         none = CVTDQ2PS. */
      /* VCVTPS2DQ xmm2/m128, xmm1 = VEX.128.66.0F.WIG 5B /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTxPS2DQ_128( vbi, pfx, delta,
                                    True/*isAvx*/, False/*!r2zero*/ );
         goto decode_success;
      }
      /* VCVTPS2DQ ymm2/m256, ymm1 = VEX.256.66.0F.WIG 5B /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTxPS2DQ_256( vbi, pfx, delta,
                                    False/*!r2zero*/ );
         goto decode_success;
      }
      /* VCVTTPS2DQ xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 5B /r */
      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTxPS2DQ_128( vbi, pfx, delta,
                                    True/*isAvx*/, True/*r2zero*/ );
         goto decode_success;
      }
      /* VCVTTPS2DQ ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 5B /r */
      if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTxPS2DQ_256( vbi, pfx, delta,
                                    True/*r2zero*/ );
         goto decode_success;
      }
      /* VCVTDQ2PS xmm2/m128, xmm1 = VEX.128.0F.WIG 5B /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTDQ2PS_128 ( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VCVTDQ2PS ymm2/m256, ymm1 = VEX.256.0F.WIG 5B /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTDQ2PS_256 ( vbi, pfx, delta );
         goto decode_success;
      }
      break;
   25406 
   case 0x5C:
      /* FP subtract; same prefix/VEX.L decode pattern as 0x58. */
      /* VSUBSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5C /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vsubsd", Iop_Sub64F0x2 );
         goto decode_success;
      }
      /* VSUBSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5C /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vsubss", Iop_Sub32F0x4 );
         goto decode_success;
      }
      /* VSUBPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5C /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vsubps", Iop_Sub32Fx4 );
         goto decode_success;
      }
      /* VSUBPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5C /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vsubps", Iop_Sub32Fx8 );
         goto decode_success;
      }
      /* VSUBPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5C /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vsubpd", Iop_Sub64Fx2 );
         goto decode_success;
      }
      /* VSUBPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5C /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vsubpd", Iop_Sub64Fx4 );
         goto decode_success;
      }
      break;
   25445 
   case 0x5D:
      /* FP minimum; same prefix/VEX.L decode pattern as 0x58. */
      /* VMINSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5D /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vminsd", Iop_Min64F0x2 );
         goto decode_success;
      }
      /* VMINSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5D /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vminss", Iop_Min32F0x4 );
         goto decode_success;
      }
      /* VMINPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5D /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vminps", Iop_Min32Fx4 );
         goto decode_success;
      }
      /* VMINPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5D /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vminps", Iop_Min32Fx8 );
         goto decode_success;
      }
      /* VMINPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5D /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vminpd", Iop_Min64Fx2 );
         goto decode_success;
      }
      /* VMINPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5D /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vminpd", Iop_Min64Fx4 );
         goto decode_success;
      }
      break;
   25484 
   case 0x5E:
      /* FP divide; same prefix/VEX.L decode pattern as 0x58. */
      /* VDIVSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5E /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vdivsd", Iop_Div64F0x2 );
         goto decode_success;
      }
      /* VDIVSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5E /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vdivss", Iop_Div32F0x4 );
         goto decode_success;
      }
      /* VDIVPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5E /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vdivps", Iop_Div32Fx4 );
         goto decode_success;
      }
      /* VDIVPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5E /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vdivps", Iop_Div32Fx8 );
         goto decode_success;
      }
      /* VDIVPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5E /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vdivpd", Iop_Div64Fx2 );
         goto decode_success;
      }
      /* VDIVPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5E /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vdivpd", Iop_Div64Fx4 );
         goto decode_success;
      }
      break;
   25523 
   case 0x5F:
      /* FP maximum; same prefix/VEX.L decode pattern as 0x58. */
      /* VMAXSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5F /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vmaxsd", Iop_Max64F0x2 );
         goto decode_success;
      }
      /* VMAXSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5F /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vmaxss", Iop_Max32F0x4 );
         goto decode_success;
      }
      /* VMAXPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5F /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmaxps", Iop_Max32Fx4 );
         goto decode_success;
      }
      /* VMAXPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5F /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmaxps", Iop_Max32Fx8 );
         goto decode_success;
      }
      /* VMAXPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5F /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmaxpd", Iop_Max64Fx2 );
         goto decode_success;
      }
      /* VMAXPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5F /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmaxpd", Iop_Max64Fx4 );
         goto decode_success;
      }
      break;
   25562 
   case 0x60:
      /* Cases 0x60..0x62: low-half interleaves at byte/word/dword
         granularity.  128-bit forms use the generic WIG helper with an
         InterleaveLO primop; swapArgs=True passes the operands to the
         primop in the reverse of the (r/m, rV) decode order, matching
         the "interleave-lo(rV, r/m)" semantics stated in the :::
         comments.  256-bit forms need the _complex helper with a
         math_VPUNPCKL* function -- presumably because the 256-bit op
         works per 128-bit lane; confirm against the math_* helpers. */
      /* VPUNPCKLBW r/m, rV, r ::: r = interleave-lo-bytes(rV, r/m) */
      /* VPUNPCKLBW = VEX.NDS.128.66.0F.WIG 60 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpcklbw",
                    Iop_InterleaveLO8x16, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPUNPCKLBW r/m, rV, r ::: r = interleave-lo-bytes(rV, r/m) */
      /* VPUNPCKLBW = VEX.NDS.256.66.0F.WIG 60 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpunpcklbw",
                    math_VPUNPCKLBW_YMM );
         goto decode_success;
      }
      break;

   case 0x61:
      /* VPUNPCKLWD r/m, rV, r ::: r = interleave-lo-words(rV, r/m) */
      /* VPUNPCKLWD = VEX.NDS.128.66.0F.WIG 61 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpcklwd",
                    Iop_InterleaveLO16x8, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPUNPCKLWD r/m, rV, r ::: r = interleave-lo-words(rV, r/m) */
      /* VPUNPCKLWD = VEX.NDS.256.66.0F.WIG 61 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpunpcklwd",
                    math_VPUNPCKLWD_YMM );
         goto decode_success;
      }
      break;

   case 0x62:
      /* VPUNPCKLDQ r/m, rV, r ::: r = interleave-lo-dwords(rV, r/m) */
      /* VPUNPCKLDQ = VEX.NDS.128.66.0F.WIG 62 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpckldq",
                    Iop_InterleaveLO32x4, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPUNPCKLDQ r/m, rV, r ::: r = interleave-lo-dwords(rV, r/m) */
      /* VPUNPCKLDQ = VEX.NDS.256.66.0F.WIG 62 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpunpckldq",
                    math_VPUNPCKLDQ_YMM );
         goto decode_success;
      }
      break;
   25622 
   case 0x63:
      /* Pack with signed saturation, 16-bit -> 8-bit lanes
         (Iop_QNarrowBin16Sto8Sx16). */
      /* VPACKSSWB r/m, rV, r ::: r = QNarrowBin16Sto8Sx16(rV, r/m) */
      /* VPACKSSWB = VEX.NDS.128.66.0F.WIG 63 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpacksswb",
                    Iop_QNarrowBin16Sto8Sx16, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPACKSSWB r/m, rV, r ::: r = QNarrowBin16Sto8Sx16(rV, r/m) */
      /* VPACKSSWB = VEX.NDS.256.66.0F.WIG 63 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpacksswb",
                    math_VPACKSSWB_YMM );
         goto decode_success;
      }
      break;
   25642 
   case 0x64:
      /* Cases 0x64..0x66: signed greater-than compares at 8/16/32-bit
         lane width.  The _simple helper variant is used since the
         compare needs no argument swap or inversion flags. */
      /* VPCMPGTB r/m, rV, r ::: r = rV `>s-by-8s` r/m */
      /* VPCMPGTB = VEX.NDS.128.66.0F.WIG 64 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpgtb", Iop_CmpGT8Sx16 );
         goto decode_success;
      }
      /* VPCMPGTB r/m, rV, r ::: r = rV `>s-by-8s` r/m */
      /* VPCMPGTB = VEX.NDS.256.66.0F.WIG 64 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpgtb", Iop_CmpGT8Sx32 );
         goto decode_success;
      }
      break;

   case 0x65:
      /* VPCMPGTW r/m, rV, r ::: r = rV `>s-by-16s` r/m */
      /* VPCMPGTW = VEX.NDS.128.66.0F.WIG 65 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpgtw", Iop_CmpGT16Sx8 );
         goto decode_success;
      }
      /* VPCMPGTW r/m, rV, r ::: r = rV `>s-by-16s` r/m */
      /* VPCMPGTW = VEX.NDS.256.66.0F.WIG 65 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpgtw", Iop_CmpGT16Sx16 );
         goto decode_success;
      }
      break;

   case 0x66:
      /* VPCMPGTD r/m, rV, r ::: r = rV `>s-by-32s` r/m */
      /* VPCMPGTD = VEX.NDS.128.66.0F.WIG 66 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpgtd", Iop_CmpGT32Sx4 );
         goto decode_success;
      }
      /* VPCMPGTD r/m, rV, r ::: r = rV `>s-by-32s` r/m */
      /* VPCMPGTD = VEX.NDS.256.66.0F.WIG 66 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpgtd", Iop_CmpGT32Sx8 );
         goto decode_success;
      }
      break;
   25693 
   case 0x67:
      /* Pack with unsigned saturation, 16-bit signed -> 8-bit unsigned
         lanes (Iop_QNarrowBin16Sto8Ux16). */
      /* VPACKUSWB r/m, rV, r ::: r = QNarrowBin16Sto8Ux16(rV, r/m) */
      /* VPACKUSWB = VEX.NDS.128.66.0F.WIG 67 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpackuswb",
                    Iop_QNarrowBin16Sto8Ux16, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPACKUSWB r/m, rV, r ::: r = QNarrowBin16Sto8Ux16(rV, r/m) */
      /* VPACKUSWB = VEX.NDS.256.66.0F.WIG 67 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpackuswb",
                    math_VPACKUSWB_YMM );
         goto decode_success;
      }
      break;
   25713 
   case 0x68:
      /* Cases 0x68..0x69: high-half interleaves at byte/word
         granularity; mirror images of cases 0x60..0x61.
         NOTE: the encoding comments previously read
         "VEX.NDS.128.0F.WIG", omitting the mandatory 66 prefix; the
         guards below (have66noF2noF3) correctly require it, matching
         the Intel SDM encoding.  Comments fixed accordingly. */
      /* VPUNPCKHBW r/m, rV, r ::: r = interleave-hi-bytes(rV, r/m) */
      /* VPUNPCKHBW = VEX.NDS.128.66.0F.WIG 68 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpckhbw",
                    Iop_InterleaveHI8x16, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPUNPCKHBW r/m, rV, r ::: r = interleave-hi-bytes(rV, r/m) */
      /* VPUNPCKHBW = VEX.NDS.256.66.0F.WIG 68 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpunpckhbw",
                    math_VPUNPCKHBW_YMM );
         goto decode_success;
      }
      break;

   case 0x69:
      /* VPUNPCKHWD r/m, rV, r ::: r = interleave-hi-words(rV, r/m) */
      /* VPUNPCKHWD = VEX.NDS.128.66.0F.WIG 69 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpckhwd",
                    Iop_InterleaveHI16x8, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPUNPCKHWD r/m, rV, r ::: r = interleave-hi-words(rV, r/m) */
      /* VPUNPCKHWD = VEX.NDS.256.66.0F.WIG 69 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpunpckhwd",
                    math_VPUNPCKHWD_YMM );
         goto decode_success;
      }
      break;
   25753 
   case 0x6A:
      /* High-half interleave at dword granularity; mirror of 0x62. */
      /* VPUNPCKHDQ r/m, rV, r ::: r = interleave-hi-dwords(rV, r/m) */
      /* VPUNPCKHDQ = VEX.NDS.128.66.0F.WIG 6A /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpckhdq",
                    Iop_InterleaveHI32x4, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPUNPCKHDQ r/m, rV, r ::: r = interleave-hi-dwords(rV, r/m) */
      /* VPUNPCKHDQ = VEX.NDS.256.66.0F.WIG 6A /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpunpckhdq",
                    math_VPUNPCKHDQ_YMM );
         goto decode_success;
      }
      break;
   25773 
   case 0x6B:
      /* Pack with signed saturation, 32-bit -> 16-bit lanes
         (Iop_QNarrowBin32Sto16Sx8). */
      /* VPACKSSDW r/m, rV, r ::: r = QNarrowBin32Sto16Sx8(rV, r/m) */
      /* VPACKSSDW = VEX.NDS.128.66.0F.WIG 6B /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpackssdw",
                    Iop_QNarrowBin32Sto16Sx8, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPACKSSDW r/m, rV, r ::: r = QNarrowBin32Sto16Sx8(rV, r/m) */
      /* VPACKSSDW = VEX.NDS.256.66.0F.WIG 6B /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpackssdw",
                    math_VPACKSSDW_YMM );
         goto decode_success;
      }
      break;
   25793 
   case 0x6C:
      /* Cases 0x6C..0x6D: lo/hi interleave at 64-bit granularity.
         NOTE: the encoding comments previously read
         "VEX.NDS.128.0F.WIG", omitting the mandatory 66 prefix; the
         guards below (have66noF2noF3) correctly require it, matching
         the Intel SDM encoding.  Comments fixed accordingly. */
      /* VPUNPCKLQDQ r/m, rV, r ::: r = interleave-lo-64bitses(rV, r/m) */
      /* VPUNPCKLQDQ = VEX.NDS.128.66.0F.WIG 6C /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpcklqdq",
                    Iop_InterleaveLO64x2, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPUNPCKLQDQ r/m, rV, r ::: r = interleave-lo-64bitses(rV, r/m) */
      /* VPUNPCKLQDQ = VEX.NDS.256.66.0F.WIG 6C /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpunpcklqdq",
                    math_VPUNPCKLQDQ_YMM );
         goto decode_success;
      }
      break;

   case 0x6D:
      /* VPUNPCKHQDQ r/m, rV, r ::: r = interleave-hi-64bitses(rV, r/m) */
      /* VPUNPCKHQDQ = VEX.NDS.128.66.0F.WIG 6D /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpckhqdq",
                    Iop_InterleaveHI64x2, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPUNPCKHQDQ r/m, rV, r ::: r = interleave-hi-64bitses(rV, r/m) */
      /* VPUNPCKHQDQ = VEX.NDS.256.66.0F.WIG 6D /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpunpckhqdq",
                    math_VPUNPCKHQDQ_YMM );
         goto decode_success;
      }
      break;
   25833 
   case 0x6E:
      /* GPR/memory -> xmm moves; VEX.W selects 32- vs 64-bit source.
         putYMMRegLoAndZU writes the low lane and zeroes the upper part
         of the YMM register ("Lo And Zero Upper" -- per the helper's
         name; the usual VEX.128 upper-zeroing behaviour). */
      /* VMOVD r32/m32, xmm1 = VEX.128.66.0F.W0 6E */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         vassert(sz == 2); /* even tho we are transferring 4, not 2. */
         UChar modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            /* register source: consume just the modrm byte */
            delta += 1;
            putYMMRegLoAndZU(
               gregOfRexRM(pfx,modrm),
               unop( Iop_32UtoV128, getIReg32(eregOfRexRM(pfx,modrm)) )
            );
            DIP("vmovd %s, %s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
        } else {
            /* memory source: decode the addressing mode, then load */
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            putYMMRegLoAndZU(
               gregOfRexRM(pfx,modrm),
               unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)))
                             );
            DIP("vmovd %s, %s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
         }
         goto decode_success;
      }
      /* VMOVQ r64/m64, xmm1 = VEX.128.66.0F.W1 6E */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
         vassert(sz == 2); /* even tho we are transferring 8, not 2. */
         UChar modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta += 1;
            putYMMRegLoAndZU(
               gregOfRexRM(pfx,modrm),
               unop( Iop_64UtoV128, getIReg64(eregOfRexRM(pfx,modrm)) )
            );
            DIP("vmovq %s, %s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
        } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            putYMMRegLoAndZU(
               gregOfRexRM(pfx,modrm),
               unop( Iop_64UtoV128,loadLE(Ity_I64, mkexpr(addr)))
                             );
            DIP("vmovq %s, %s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
         }
         goto decode_success;
      }
      break;
   25886 
   case 0x6F:
      /* Vector register/memory loads.  66 prefix = VMOVDQA (aligned:
         a #GP-style SEGV is generated for a misaligned memory source),
         F3 = VMOVDQU (unaligned, no check).  256-bit form first, then
         128-bit with upper-lane zeroing. */
      /* VMOVDQA ymm2/m256, ymm1 = VEX.256.66.0F.WIG 6F */
      /* VMOVDQU ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 6F */
      if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
          && 1==getVexL(pfx)/*256*/) {
         UChar  modrm = getUChar(delta);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp tD    = newTemp(Ity_V256);
         Bool   isA   = have66noF2noF3(pfx);    /* aligned variant? */
         HChar  ch    = isA ? 'a' : 'u';        /* mnemonic suffix */
         if (epartIsReg(modrm)) {
            UInt rS = eregOfRexRM(pfx, modrm);
            delta += 1;
            assign(tD, getYMMReg(rS));
            DIP("vmovdq%c %s,%s\n", ch, nameYMMReg(rS), nameYMMReg(rD));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            if (isA)
               gen_SEGV_if_not_32_aligned(addr);
            assign(tD, loadLE(Ity_V256, mkexpr(addr)));
            DIP("vmovdq%c %s,%s\n", ch, dis_buf, nameYMMReg(rD));
         }
         putYMMReg(rD, mkexpr(tD));
         goto decode_success;
      }
      /* VMOVDQA xmm2/m128, xmm1 = VEX.128.66.0F.WIG 6F */
      /* VMOVDQU xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 6F */
      if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
          && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp tD    = newTemp(Ity_V128);
         Bool   isA   = have66noF2noF3(pfx);
         HChar  ch    = isA ? 'a' : 'u';
         if (epartIsReg(modrm)) {
            UInt rS = eregOfRexRM(pfx, modrm);
            delta += 1;
            assign(tD, getXMMReg(rS));
            DIP("vmovdq%c %s,%s\n", ch, nameXMMReg(rS), nameXMMReg(rD));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            if (isA)
               gen_SEGV_if_not_16_aligned(addr);
            assign(tD, loadLE(Ity_V128, mkexpr(addr)));
            DIP("vmovdq%c %s,%s\n", ch, dis_buf, nameXMMReg(rD));
         }
         putYMMRegLoAndZU(rD, mkexpr(tD));
         goto decode_success;
      }
      break;
   25939 
   case 0x70:
      /* Shuffle with imm8 control byte.  SIMD prefix selects the
         flavour: 66 = VPSHUFD (dwords), F2 = VPSHUFLW (low words),
         F3 = VPSHUFHW (high words); VEX.L selects 128 vs 256 bit. */
      /* VPSHUFD imm8, xmm2/m128, xmm1 = VEX.128.66.0F.WIG 70 /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PSHUFD_32x4( vbi, pfx, delta, True/*writesYmm*/);
         goto decode_success;
      }
      /* VPSHUFD imm8, ymm2/m256, ymm1 = VEX.256.66.0F.WIG 70 /r ib */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PSHUFD_32x8( vbi, pfx, delta);
         goto decode_success;
      }
      /* VPSHUFLW imm8, xmm2/m128, xmm1 = VEX.128.F2.0F.WIG 70 /r ib */
      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PSHUFxW_128( vbi, pfx, delta,
                                  True/*isAvx*/, False/*!xIsH*/ );
         goto decode_success;
      }
      /* VPSHUFLW imm8, ymm2/m256, ymm1 = VEX.256.F2.0F.WIG 70 /r ib */
      if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PSHUFxW_256( vbi, pfx, delta, False/*!xIsH*/ );
         goto decode_success;
      }
      /* VPSHUFHW imm8, xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 70 /r ib */
      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PSHUFxW_128( vbi, pfx, delta,
                                  True/*isAvx*/, True/*xIsH*/ );
         goto decode_success;
      }
      /* VPSHUFHW imm8, ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 70 /r ib */
      if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PSHUFxW_256( vbi, pfx, delta, True/*xIsH*/ );
         goto decode_success;
      }
      break;
   25974 
   case 0x71:
      /* Group-12 immediate shifts on 16-bit lanes.  These are NDD
         (vvvv is the destination), reg-form only: the modrm reg field
         (gregLO3ofRM) selects /2 = logical right, /4 = arithmetic
         right, /6 = left.  Other /r values fall through to 'break'
         (decode failure). */
      /* VPSRLW imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 71 /2 ib */
      /* VPSRAW imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 71 /4 ib */
      /* VPSLLW imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 71 /6 ib */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/
          && epartIsReg(getUChar(delta))) {
         if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpsrlw", Iop_ShrN16x8 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpsraw", Iop_SarN16x8 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpsllw", Iop_ShlN16x8 );
            *uses_vvvv = True;
            goto decode_success;
         }
         /* else fall through */
      }
      /* VPSRLW imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 71 /2 ib */
      /* VPSRAW imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 71 /4 ib */
      /* VPSLLW imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 71 /6 ib */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/
          && epartIsReg(getUChar(delta))) {
         if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
                                                "vpsrlw", Iop_ShrN16x16 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
                                                "vpsraw", Iop_SarN16x16 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
                                                "vpsllw", Iop_ShlN16x16 );
            *uses_vvvv = True;
            goto decode_success;
         }
         /* else fall through */
      }
      break;
   26029 
   case 0x72:
      /* Group-13 immediate shifts on 32-bit lanes; same structure as
         case 0x71 (NDD, reg-form only, /2=SRL /4=SRA /6=SLL, other /r
         values fall through to decode failure). */
      /* VPSRLD imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 72 /2 ib */
      /* VPSRAD imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 72 /4 ib */
      /* VPSLLD imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 72 /6 ib */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/
          && epartIsReg(getUChar(delta))) {
         if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpsrld", Iop_ShrN32x4 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpsrad", Iop_SarN32x4 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpslld", Iop_ShlN32x4 );
            *uses_vvvv = True;
            goto decode_success;
         }
         /* else fall through */
      }
      /* VPSRLD imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 72 /2 ib */
      /* VPSRAD imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 72 /4 ib */
      /* VPSLLD imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 72 /6 ib */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/
          && epartIsReg(getUChar(delta))) {
         if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
                                                "vpsrld", Iop_ShrN32x8 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
                                                "vpsrad", Iop_SarN32x8 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
                                                "vpslld", Iop_ShlN32x8 );
            *uses_vvvv = True;
            goto decode_success;
         }
         /* else fall through */
      }
      break;
   26084 
   case 0x73:
      /* Group-14 immediate shifts.  /2 and /6 are 64-bit lane shifts
         (via the generic imm-shift helper); /3 and /7 are whole-vector
         byte shifts (VPSRLDQ/VPSLLDQ) handled inline via math_PSRLDQ /
         math_PSLLDQ.  NDD, reg-form only; for /3 and /7 the two extra
         consumed bytes are modrm + imm8.  The 256-bit byte shifts are
         applied independently to each 128-bit lane. */
      /* VPSRLDQ imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /3 ib */
      /* VPSLLDQ imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /7 ib */
      /* VPSRLQ  imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /2 ib */
      /* VPSLLQ  imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /6 ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && epartIsReg(getUChar(delta))) {
         Int    rS   = eregOfRexRM(pfx,getUChar(delta));
         Int    rD   = getVexNvvvv(pfx);          /* NDD destination */
         IRTemp vecS = newTemp(Ity_V128);
         if (gregLO3ofRM(getUChar(delta)) == 3) {
            Int imm = (Int)getUChar(delta+1);
            DIP("vpsrldq $%d,%s,%s\n", imm, nameXMMReg(rS), nameXMMReg(rD));
            delta += 2;
            assign( vecS, getXMMReg(rS) );
            putYMMRegLoAndZU(rD, mkexpr(math_PSRLDQ( vecS, imm )));
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 7) {
            Int imm = (Int)getUChar(delta+1);
            DIP("vpslldq $%d,%s,%s\n", imm, nameXMMReg(rS), nameXMMReg(rD));
            delta += 2;
            assign( vecS, getXMMReg(rS) );
            putYMMRegLoAndZU(rD, mkexpr(math_PSLLDQ( vecS, imm )));
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 2) {
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpsrlq", Iop_ShrN64x2 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 6) {
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpsllq", Iop_ShlN64x2 );
            *uses_vvvv = True;
            goto decode_success;
         }
         /* else fall through */
      }
      /* VPSRLDQ imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /3 ib */
      /* VPSLLDQ imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /7 ib */
      /* VPSRLQ  imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /2 ib */
      /* VPSLLQ  imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /6 ib */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && epartIsReg(getUChar(delta))) {
         Int    rS   = eregOfRexRM(pfx,getUChar(delta));
         Int    rD   = getVexNvvvv(pfx);
         if (gregLO3ofRM(getUChar(delta)) == 3) {
            IRTemp vecS0 = newTemp(Ity_V128);
            IRTemp vecS1 = newTemp(Ity_V128);
            Int imm = (Int)getUChar(delta+1);
            DIP("vpsrldq $%d,%s,%s\n", imm, nameYMMReg(rS), nameYMMReg(rD));
            delta += 2;
            assign( vecS0, getYMMRegLane128(rS, 0));
            assign( vecS1, getYMMRegLane128(rS, 1));
            putYMMRegLane128(rD, 0, mkexpr(math_PSRLDQ( vecS0, imm )));
            putYMMRegLane128(rD, 1, mkexpr(math_PSRLDQ( vecS1, imm )));
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 7) {
            IRTemp vecS0 = newTemp(Ity_V128);
            IRTemp vecS1 = newTemp(Ity_V128);
            Int imm = (Int)getUChar(delta+1);
            DIP("vpslldq $%d,%s,%s\n", imm, nameYMMReg(rS), nameYMMReg(rD));
            delta += 2;
            assign( vecS0, getYMMRegLane128(rS, 0));
            assign( vecS1, getYMMRegLane128(rS, 1));
            putYMMRegLane128(rD, 0, mkexpr(math_PSLLDQ( vecS0, imm )));
            putYMMRegLane128(rD, 1, mkexpr(math_PSLLDQ( vecS1, imm )));
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 2) {
            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
                                                "vpsrlq", Iop_ShrN64x4 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 6) {
            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
                                                "vpsllq", Iop_ShlN64x4 );
            *uses_vvvv = True;
            goto decode_success;
         }
         /* else fall through */
      }
      break;
   26176 
   case 0x74:
      /* Cases 0x74..0x75: equality compares at 8/16-bit lane width;
         same shape as the VPCMPGT cases at 0x64..0x65. */
      /* VPCMPEQB r/m, rV, r ::: r = rV `eq-by-8s` r/m */
      /* VPCMPEQB = VEX.NDS.128.66.0F.WIG 74 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpeqb", Iop_CmpEQ8x16 );
         goto decode_success;
      }
      /* VPCMPEQB r/m, rV, r ::: r = rV `eq-by-8s` r/m */
      /* VPCMPEQB = VEX.NDS.256.66.0F.WIG 74 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpeqb", Iop_CmpEQ8x32 );
         goto decode_success;
      }
      break;

   case 0x75:
      /* VPCMPEQW r/m, rV, r ::: r = rV `eq-by-16s` r/m */
      /* VPCMPEQW = VEX.NDS.128.66.0F.WIG 75 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpeqw", Iop_CmpEQ16x8 );
         goto decode_success;
      }
      /* VPCMPEQW r/m, rV, r ::: r = rV `eq-by-16s` r/m */
      /* VPCMPEQW = VEX.NDS.256.66.0F.WIG 75 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpeqw", Iop_CmpEQ16x16 );
         goto decode_success;
      }
      break;
   26210 
   26211    case 0x76:
   26212       /* VPCMPEQD r/m, rV, r ::: r = rV `eq-by-32s` r/m */
   26213       /* VPCMPEQD = VEX.NDS.128.66.0F.WIG 76 /r */
   26214       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26215          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   26216                     uses_vvvv, vbi, pfx, delta, "vpcmpeqd", Iop_CmpEQ32x4 );
   26217          goto decode_success;
   26218       }
   26219       /* VPCMPEQD r/m, rV, r ::: r = rV `eq-by-32s` r/m */
   26220       /* VPCMPEQD = VEX.NDS.256.66.0F.WIG 76 /r */
   26221       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26222          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   26223                     uses_vvvv, vbi, pfx, delta, "vpcmpeqd", Iop_CmpEQ32x8 );
   26224          goto decode_success;
   26225       }
   26226       break;
   26227 
   26228    case 0x77:
   26229       /* VZEROUPPER = VEX.128.0F.WIG 77 */
   26230       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26231          Int i;
   26232          IRTemp zero128 = newTemp(Ity_V128);
   26233          assign(zero128, mkV128(0));
   26234          for (i = 0; i < 16; i++) {
   26235             putYMMRegLane128(i, 1, mkexpr(zero128));
   26236          }
   26237          DIP("vzeroupper\n");
   26238          goto decode_success;
   26239       }
   26240       /* VZEROALL = VEX.256.0F.WIG 77 */
   26241       if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26242          Int i;
   26243          IRTemp zero128 = newTemp(Ity_V128);
   26244          assign(zero128, mkV128(0));
   26245          for (i = 0; i < 16; i++) {
   26246             putYMMRegLoAndZU(i, mkexpr(zero128));
   26247          }
   26248          DIP("vzeroall\n");
   26249          goto decode_success;
   26250       }
   26251       break;
   26252 
   26253    case 0x7C:
   26254    case 0x7D:
   26255       /* VHADDPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.F2.0F.WIG 7C /r */
   26256       /* VHSUBPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.F2.0F.WIG 7D /r */
   26257       if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26258          IRTemp sV     = newTemp(Ity_V128);
   26259          IRTemp dV     = newTemp(Ity_V128);
   26260          Bool   isAdd  = opc == 0x7C;
   26261          const HChar* str = isAdd ? "add" : "sub";
   26262          UChar modrm   = getUChar(delta);
   26263          UInt   rG     = gregOfRexRM(pfx,modrm);
   26264          UInt   rV     = getVexNvvvv(pfx);
   26265          if (epartIsReg(modrm)) {
   26266             UInt rE = eregOfRexRM(pfx,modrm);
   26267             assign( sV, getXMMReg(rE) );
   26268             DIP("vh%spd %s,%s,%s\n", str, nameXMMReg(rE),
   26269                 nameXMMReg(rV), nameXMMReg(rG));
   26270             delta += 1;
   26271          } else {
   26272             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   26273             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   26274             DIP("vh%spd %s,%s,%s\n", str, dis_buf,
   26275                 nameXMMReg(rV), nameXMMReg(rG));
   26276             delta += alen;
   26277          }
   26278          assign( dV, getXMMReg(rV) );
   26279          putYMMRegLoAndZU( rG, mkexpr( math_HADDPS_128 ( dV, sV, isAdd ) ) );
   26280          *uses_vvvv = True;
   26281          goto decode_success;
   26282       }
   26283       /* VHADDPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.F2.0F.WIG 7C /r */
   26284       /* VHSUBPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.F2.0F.WIG 7D /r */
   26285       if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26286          IRTemp sV     = newTemp(Ity_V256);
   26287          IRTemp dV     = newTemp(Ity_V256);
   26288          IRTemp s1, s0, d1, d0;
   26289          Bool   isAdd  = opc == 0x7C;
   26290          const HChar* str = isAdd ? "add" : "sub";
   26291          UChar modrm   = getUChar(delta);
   26292          UInt   rG     = gregOfRexRM(pfx,modrm);
   26293          UInt   rV     = getVexNvvvv(pfx);
   26294          s1 = s0 = d1 = d0 = IRTemp_INVALID;
   26295          if (epartIsReg(modrm)) {
   26296             UInt rE = eregOfRexRM(pfx,modrm);
   26297             assign( sV, getYMMReg(rE) );
   26298             DIP("vh%spd %s,%s,%s\n", str, nameYMMReg(rE),
   26299                 nameYMMReg(rV), nameYMMReg(rG));
   26300             delta += 1;
   26301          } else {
   26302             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   26303             assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
   26304             DIP("vh%spd %s,%s,%s\n", str, dis_buf,
   26305                 nameYMMReg(rV), nameYMMReg(rG));
   26306             delta += alen;
   26307          }
   26308          assign( dV, getYMMReg(rV) );
   26309          breakupV256toV128s( dV, &d1, &d0 );
   26310          breakupV256toV128s( sV, &s1, &s0 );
   26311          putYMMReg( rG, binop(Iop_V128HLtoV256,
   26312                               mkexpr( math_HADDPS_128 ( d1, s1, isAdd ) ),
   26313                               mkexpr( math_HADDPS_128 ( d0, s0, isAdd ) ) ) );
   26314          *uses_vvvv = True;
   26315          goto decode_success;
   26316       }
   26317       /* VHADDPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 7C /r */
   26318       /* VHSUBPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 7D /r */
   26319       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26320          IRTemp sV     = newTemp(Ity_V128);
   26321          IRTemp dV     = newTemp(Ity_V128);
   26322          Bool   isAdd  = opc == 0x7C;
   26323          const HChar* str = isAdd ? "add" : "sub";
   26324          UChar modrm   = getUChar(delta);
   26325          UInt   rG     = gregOfRexRM(pfx,modrm);
   26326          UInt   rV     = getVexNvvvv(pfx);
   26327          if (epartIsReg(modrm)) {
   26328             UInt rE = eregOfRexRM(pfx,modrm);
   26329             assign( sV, getXMMReg(rE) );
   26330             DIP("vh%spd %s,%s,%s\n", str, nameXMMReg(rE),
   26331                 nameXMMReg(rV), nameXMMReg(rG));
   26332             delta += 1;
   26333          } else {
   26334             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   26335             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   26336             DIP("vh%spd %s,%s,%s\n", str, dis_buf,
   26337                 nameXMMReg(rV), nameXMMReg(rG));
   26338             delta += alen;
   26339          }
   26340          assign( dV, getXMMReg(rV) );
   26341          putYMMRegLoAndZU( rG, mkexpr( math_HADDPD_128 ( dV, sV, isAdd ) ) );
   26342          *uses_vvvv = True;
   26343          goto decode_success;
   26344       }
   26345       /* VHADDPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 7C /r */
   26346       /* VHSUBPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 7D /r */
   26347       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26348          IRTemp sV     = newTemp(Ity_V256);
   26349          IRTemp dV     = newTemp(Ity_V256);
   26350          IRTemp s1, s0, d1, d0;
   26351          Bool   isAdd  = opc == 0x7C;
   26352          const HChar* str = isAdd ? "add" : "sub";
   26353          UChar modrm   = getUChar(delta);
   26354          UInt   rG     = gregOfRexRM(pfx,modrm);
   26355          UInt   rV     = getVexNvvvv(pfx);
   26356          s1 = s0 = d1 = d0 = IRTemp_INVALID;
   26357          if (epartIsReg(modrm)) {
   26358             UInt rE = eregOfRexRM(pfx,modrm);
   26359             assign( sV, getYMMReg(rE) );
   26360             DIP("vh%spd %s,%s,%s\n", str, nameYMMReg(rE),
   26361                 nameYMMReg(rV), nameYMMReg(rG));
   26362             delta += 1;
   26363          } else {
   26364             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   26365             assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
   26366             DIP("vh%spd %s,%s,%s\n", str, dis_buf,
   26367                 nameYMMReg(rV), nameYMMReg(rG));
   26368             delta += alen;
   26369          }
   26370          assign( dV, getYMMReg(rV) );
   26371          breakupV256toV128s( dV, &d1, &d0 );
   26372          breakupV256toV128s( sV, &s1, &s0 );
   26373          putYMMReg( rG, binop(Iop_V128HLtoV256,
   26374                               mkexpr( math_HADDPD_128 ( d1, s1, isAdd ) ),
   26375                               mkexpr( math_HADDPD_128 ( d0, s0, isAdd ) ) ) );
   26376          *uses_vvvv = True;
   26377          goto decode_success;
   26378       }
   26379       break;
   26380 
   26381    case 0x7E:
   26382       /* Note the Intel docs don't make sense for this.  I think they
   26383          are wrong.  They seem to imply it is a store when in fact I
   26384          think it is a load.  Also it's unclear whether this is W0, W1
   26385          or WIG. */
   26386       /* VMOVQ xmm2/m64, xmm1 = VEX.128.F3.0F.W0 7E /r */
   26387       if (haveF3no66noF2(pfx)
   26388           && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
   26389          vassert(sz == 4); /* even tho we are transferring 8, not 4. */
   26390          UChar modrm = getUChar(delta);
   26391          UInt  rG    = gregOfRexRM(pfx,modrm);
   26392          if (epartIsReg(modrm)) {
   26393             UInt rE = eregOfRexRM(pfx,modrm);
   26394             putXMMRegLane64( rG, 0, getXMMRegLane64( rE, 0 ));
   26395             DIP("vmovq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   26396             delta += 1;
   26397          } else {
   26398             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   26399             putXMMRegLane64( rG, 0, loadLE(Ity_I64, mkexpr(addr)) );
   26400             DIP("vmovq %s,%s\n", dis_buf, nameXMMReg(rG));
   26401             delta += alen;
   26402          }
   26403          /* zero bits 255:64 */
   26404          putXMMRegLane64( rG, 1, mkU64(0) );
   26405          putYMMRegLane128( rG, 1, mkV128(0) );
   26406          goto decode_success;
   26407       }
   26408       /* VMOVQ xmm1, r64 = VEX.128.66.0F.W1 7E /r (reg case only) */
   26409       /* Moves from G to E, so is a store-form insn */
   26410       /* Intel docs list this in the VMOVD entry for some reason. */
   26411       if (have66noF2noF3(pfx)
   26412           && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
   26413          UChar modrm = getUChar(delta);
   26414          UInt  rG    = gregOfRexRM(pfx,modrm);
   26415          if (epartIsReg(modrm)) {
   26416             UInt rE = eregOfRexRM(pfx,modrm);
   26417             DIP("vmovq %s,%s\n", nameXMMReg(rG), nameIReg64(rE));
   26418             putIReg64(rE, getXMMRegLane64(rG, 0));
   26419             delta += 1;
   26420          } else {
   26421             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   26422             storeLE( mkexpr(addr), getXMMRegLane64(rG, 0) );
   26423             DIP("vmovq %s,%s\n", dis_buf, nameXMMReg(rG));
   26424             delta += alen;
   26425          }
   26426          goto decode_success;
   26427       }
   26428       /* VMOVD xmm1, m32/r32 = VEX.128.66.0F.W0 7E /r (reg case only) */
   26429       /* Moves from G to E, so is a store-form insn */
   26430       if (have66noF2noF3(pfx)
   26431           && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
   26432          UChar modrm = getUChar(delta);
   26433          UInt  rG    = gregOfRexRM(pfx,modrm);
   26434          if (epartIsReg(modrm)) {
   26435             UInt rE = eregOfRexRM(pfx,modrm);
   26436             DIP("vmovd %s,%s\n", nameXMMReg(rG), nameIReg32(rE));
   26437             putIReg32(rE, getXMMRegLane32(rG, 0));
   26438             delta += 1;
   26439          } else {
   26440             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   26441             storeLE( mkexpr(addr), getXMMRegLane32(rG, 0) );
   26442             DIP("vmovd %s,%s\n", dis_buf, nameXMMReg(rG));
   26443             delta += alen;
   26444          }
   26445          goto decode_success;
   26446       }
   26447       break;
   26448 
   26449    case 0x7F:
   26450       /* VMOVDQA ymm1, ymm2/m256 = VEX.256.66.0F.WIG 7F */
   26451       /* VMOVDQU ymm1, ymm2/m256 = VEX.256.F3.0F.WIG 7F */
   26452       if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
   26453           && 1==getVexL(pfx)/*256*/) {
   26454          UChar  modrm = getUChar(delta);
   26455          UInt   rS    = gregOfRexRM(pfx, modrm);
   26456          IRTemp tS    = newTemp(Ity_V256);
   26457          Bool   isA   = have66noF2noF3(pfx);
   26458          HChar  ch    = isA ? 'a' : 'u';
   26459          assign(tS, getYMMReg(rS));
   26460          if (epartIsReg(modrm)) {
   26461             UInt rD = eregOfRexRM(pfx, modrm);
   26462             delta += 1;
   26463             putYMMReg(rD, mkexpr(tS));
   26464             DIP("vmovdq%c %s,%s\n", ch, nameYMMReg(rS), nameYMMReg(rD));
   26465          } else {
   26466             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   26467             delta += alen;
   26468             if (isA)
   26469                gen_SEGV_if_not_32_aligned(addr);
   26470             storeLE(mkexpr(addr), mkexpr(tS));
   26471             DIP("vmovdq%c %s,%s\n", ch, nameYMMReg(rS), dis_buf);
   26472          }
   26473          goto decode_success;
   26474       }
   26475       /* VMOVDQA xmm1, xmm2/m128 = VEX.128.66.0F.WIG 7F */
   26476       /* VMOVDQU xmm1, xmm2/m128 = VEX.128.F3.0F.WIG 7F */
   26477       if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
   26478           && 0==getVexL(pfx)/*128*/) {
   26479          UChar  modrm = getUChar(delta);
   26480          UInt   rS    = gregOfRexRM(pfx, modrm);
   26481          IRTemp tS    = newTemp(Ity_V128);
   26482          Bool   isA   = have66noF2noF3(pfx);
   26483          HChar  ch    = isA ? 'a' : 'u';
   26484          assign(tS, getXMMReg(rS));
   26485          if (epartIsReg(modrm)) {
   26486             UInt rD = eregOfRexRM(pfx, modrm);
   26487             delta += 1;
   26488             putYMMRegLoAndZU(rD, mkexpr(tS));
   26489             DIP("vmovdq%c %s,%s\n", ch, nameXMMReg(rS), nameXMMReg(rD));
   26490          } else {
   26491             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   26492             delta += alen;
   26493             if (isA)
   26494                gen_SEGV_if_not_16_aligned(addr);
   26495             storeLE(mkexpr(addr), mkexpr(tS));
   26496             DIP("vmovdq%c %s,%s\n", ch, nameXMMReg(rS), dis_buf);
   26497          }
   26498          goto decode_success;
   26499       }
   26500       break;
   26501 
   26502    case 0xAE:
   26503       /* VSTMXCSR m32 = VEX.LZ.0F.WIG AE /3 */
   26504       if (haveNo66noF2noF3(pfx)
   26505           && 0==getVexL(pfx)/*LZ*/
   26506           && 0==getRexW(pfx) /* be paranoid -- Intel docs don't require this */
   26507           && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 3
   26508           && sz == 4) {
   26509          delta = dis_STMXCSR(vbi, pfx, delta, True/*isAvx*/);
   26510          goto decode_success;
   26511       }
   26512       /* VLDMXCSR m32 = VEX.LZ.0F.WIG AE /2 */
   26513       if (haveNo66noF2noF3(pfx)
   26514           && 0==getVexL(pfx)/*LZ*/
   26515           && 0==getRexW(pfx) /* be paranoid -- Intel docs don't require this */
   26516           && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 2
   26517           && sz == 4) {
   26518          delta = dis_LDMXCSR(vbi, pfx, delta, True/*isAvx*/);
   26519          goto decode_success;
   26520       }
   26521       break;
   26522 
   26523    case 0xC2:
   26524       /* VCMPSD xmm3/m64(E=argL), xmm2(V=argR), xmm1(G) */
   26525       /* = VEX.NDS.LIG.F2.0F.WIG C2 /r ib */
   26526       if (haveF2no66noF3(pfx)) {
   26527          Long delta0 = delta;
   26528          delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
   26529                                           "vcmpsd", False/*!all_lanes*/,
   26530                                           8/*sz*/);
   26531          if (delta > delta0) goto decode_success;
   26532          /* else fall through -- decoding has failed */
   26533       }
   26534       /* VCMPSS xmm3/m32(E=argL), xmm2(V=argR), xmm1(G) */
   26535       /* = VEX.NDS.LIG.F3.0F.WIG C2 /r ib */
   26536       if (haveF3no66noF2(pfx)) {
   26537          Long delta0 = delta;
   26538          delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
   26539                                           "vcmpss", False/*!all_lanes*/,
   26540                                           4/*sz*/);
   26541          if (delta > delta0) goto decode_success;
   26542          /* else fall through -- decoding has failed */
   26543       }
   26544       /* VCMPPD xmm3/m128(E=argL), xmm2(V=argR), xmm1(G) */
   26545       /* = VEX.NDS.128.66.0F.WIG C2 /r ib */
   26546       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26547          Long delta0 = delta;
   26548          delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
   26549                                           "vcmppd", True/*all_lanes*/,
   26550                                           8/*sz*/);
   26551          if (delta > delta0) goto decode_success;
   26552          /* else fall through -- decoding has failed */
   26553       }
   26554       /* VCMPPD ymm3/m256(E=argL), ymm2(V=argR), ymm1(G) */
   26555       /* = VEX.NDS.256.66.0F.WIG C2 /r ib */
   26556       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26557          Long delta0 = delta;
   26558          delta = dis_AVX256_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
   26559                                           "vcmppd", 8/*sz*/);
   26560          if (delta > delta0) goto decode_success;
   26561          /* else fall through -- decoding has failed */
   26562       }
   26563       /* VCMPPS xmm3/m128(E=argL), xmm2(V=argR), xmm1(G) */
   26564       /* = VEX.NDS.128.0F.WIG C2 /r ib */
   26565       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26566          Long delta0 = delta;
   26567          delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
   26568                                           "vcmpps", True/*all_lanes*/,
   26569                                           4/*sz*/);
   26570          if (delta > delta0) goto decode_success;
   26571          /* else fall through -- decoding has failed */
   26572       }
   26573       /* VCMPPS ymm3/m256(E=argL), ymm2(V=argR), ymm1(G) */
   26574       /* = VEX.NDS.256.0F.WIG C2 /r ib */
   26575       if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26576          Long delta0 = delta;
   26577          delta = dis_AVX256_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
   26578                                           "vcmpps", 4/*sz*/);
   26579          if (delta > delta0) goto decode_success;
   26580          /* else fall through -- decoding has failed */
   26581       }
   26582       break;
   26583 
   26584    case 0xC4:
   26585       /* VPINSRW r32/m16, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG C4 /r ib */
   26586       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26587          UChar  modrm = getUChar(delta);
   26588          UInt   rG    = gregOfRexRM(pfx, modrm);
   26589          UInt   rV    = getVexNvvvv(pfx);
   26590          Int    imm8;
   26591          IRTemp new16 = newTemp(Ity_I16);
   26592 
   26593          if ( epartIsReg( modrm ) ) {
   26594             imm8 = (Int)(getUChar(delta+1) & 7);
   26595             assign( new16, unop(Iop_32to16,
   26596                                 getIReg32(eregOfRexRM(pfx,modrm))) );
   26597             delta += 1+1;
   26598             DIP( "vpinsrw $%d,%s,%s\n", imm8,
   26599                  nameIReg32( eregOfRexRM(pfx, modrm) ), nameXMMReg(rG) );
   26600          } else {
   26601             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   26602             imm8 = (Int)(getUChar(delta+alen) & 7);
   26603             assign( new16, loadLE( Ity_I16, mkexpr(addr) ));
   26604             delta += alen+1;
   26605             DIP( "vpinsrw $%d,%s,%s\n",
   26606                  imm8, dis_buf, nameXMMReg(rG) );
   26607          }
   26608 
   26609          IRTemp src_vec = newTemp(Ity_V128);
   26610          assign(src_vec, getXMMReg( rV ));
   26611          IRTemp res_vec = math_PINSRW_128( src_vec, new16, imm8 );
   26612          putYMMRegLoAndZU( rG, mkexpr(res_vec) );
   26613          *uses_vvvv = True;
   26614          goto decode_success;
   26615       }
   26616       break;
   26617 
   26618    case 0xC5:
   26619       /* VPEXTRW imm8, xmm1, reg32 = VEX.128.66.0F.W0 C5 /r ib */
   26620       if (have66noF2noF3(pfx)
   26621          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
   26622          Long delta0 = delta;
   26623          delta = dis_PEXTRW_128_EregOnly_toG( vbi, pfx, delta,
   26624                                               True/*isAvx*/ );
   26625          if (delta > delta0) goto decode_success;
   26626          /* else fall through -- decoding has failed */
   26627       }
   26628       break;
   26629 
   case 0xC6:
      /* Shuffle packed floats per an imm8 selector; four variants
         (PS/PD x 128/256).  In every variant E supplies the high part
         of the selection and V the low part; see math_SHUFPS_128 etc. */
      /* VSHUFPS imm8, xmm3/m128, xmm2, xmm1 */
      /* = VEX.NDS.128.0F.WIG C6 /r ib */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Int    imm8 = 0;
         IRTemp eV   = newTemp(Ity_V128);
         IRTemp vV   = newTemp(Ity_V128);
         UInt  modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         UInt  rV    = getVexNvvvv(pfx);
         assign( vV, getXMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getXMMReg(rE) );
            imm8 = (Int)getUChar(delta+1);
            delta += 1+1;
            DIP("vshufps $%d,%s,%s,%s\n",
                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
         } else {
            /* trailing 1: one imm byte follows the amode */
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
            imm8 = (Int)getUChar(delta+alen);
            delta += 1+alen;
            DIP("vshufps $%d,%s,%s,%s\n",
                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
         }
         IRTemp res = math_SHUFPS_128( eV, vV, imm8 );
         /* 128-bit result zeroes the upper ymm lane */
         putYMMRegLoAndZU( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VSHUFPS imm8, ymm3/m256, ymm2, ymm1 */
      /* = VEX.NDS.256.0F.WIG C6 /r ib */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         Int    imm8 = 0;
         IRTemp eV   = newTemp(Ity_V256);
         IRTemp vV   = newTemp(Ity_V256);
         UInt  modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         UInt  rV    = getVexNvvvv(pfx);
         assign( vV, getYMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getYMMReg(rE) );
            imm8 = (Int)getUChar(delta+1);
            delta += 1+1;
            DIP("vshufps $%d,%s,%s,%s\n",
                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
            imm8 = (Int)getUChar(delta+alen);
            delta += 1+alen;
            DIP("vshufps $%d,%s,%s,%s\n",
                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
         }
         IRTemp res = math_SHUFPS_256( eV, vV, imm8 );
         putYMMReg( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VSHUFPD imm8, xmm3/m128, xmm2, xmm1 */
      /* = VEX.NDS.128.66.0F.WIG C6 /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Int    imm8 = 0;
         IRTemp eV   = newTemp(Ity_V128);
         IRTemp vV   = newTemp(Ity_V128);
         UInt  modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         UInt  rV    = getVexNvvvv(pfx);
         assign( vV, getXMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getXMMReg(rE) );
            imm8 = (Int)getUChar(delta+1);
            delta += 1+1;
            DIP("vshufpd $%d,%s,%s,%s\n",
                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
            imm8 = (Int)getUChar(delta+alen);
            delta += 1+alen;
            DIP("vshufpd $%d,%s,%s,%s\n",
                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
         }
         IRTemp res = math_SHUFPD_128( eV, vV, imm8 );
         putYMMRegLoAndZU( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VSHUFPD imm8, ymm3/m256, ymm2, ymm1 */
      /* = VEX.NDS.256.66.0F.WIG C6 /r ib */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         Int    imm8 = 0;
         IRTemp eV   = newTemp(Ity_V256);
         IRTemp vV   = newTemp(Ity_V256);
         UInt  modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         UInt  rV    = getVexNvvvv(pfx);
         assign( vV, getYMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getYMMReg(rE) );
            imm8 = (Int)getUChar(delta+1);
            delta += 1+1;
            DIP("vshufpd $%d,%s,%s,%s\n",
                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
            imm8 = (Int)getUChar(delta+alen);
            delta += 1+alen;
            DIP("vshufpd $%d,%s,%s,%s\n",
                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
         }
         IRTemp res = math_SHUFPD_256( eV, vV, imm8 );
         putYMMReg( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   26752 
   26753    case 0xD0:
   26754       /* VADDSUBPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D0 /r */
   26755       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26756          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
   26757                     uses_vvvv, vbi, pfx, delta,
   26758                     "vaddsubpd", math_ADDSUBPD_128 );
   26759          goto decode_success;
   26760       }
   26761       /* VADDSUBPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D0 /r */
   26762       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26763          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   26764                     uses_vvvv, vbi, pfx, delta,
   26765                     "vaddsubpd", math_ADDSUBPD_256 );
   26766          goto decode_success;
   26767       }
   26768       /* VADDSUBPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.F2.0F.WIG D0 /r */
   26769       if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26770          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
   26771                     uses_vvvv, vbi, pfx, delta,
   26772                     "vaddsubps", math_ADDSUBPS_128 );
   26773          goto decode_success;
   26774       }
   26775       /* VADDSUBPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.F2.0F.WIG D0 /r */
   26776       if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26777          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   26778                     uses_vvvv, vbi, pfx, delta,
   26779                     "vaddsubps", math_ADDSUBPS_256 );
   26780          goto decode_success;
   26781       }
   26782       break;
   26783 
   /* 0xD1..0xD3: VPSRL{W,D,Q} -- lane-wise logical right shift of
      16-, 32- and 64-bit lanes respectively.  The shift count is
      supplied by the E operand (xmm3/m128) -- note the count operand
      stays 128-bit even for the 256-bit destination forms.  The
      shiftV_byE helpers do not take a uses_vvvv out-parameter, so
      *uses_vvvv is set explicitly after each call. */
   case 0xD1:
      /* VPSRLW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D1 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
                                        "vpsrlw", Iop_ShrN16x8 );
         *uses_vvvv = True;
         goto decode_success;

      }
      /* VPSRLW xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D1 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
                                        "vpsrlw", Iop_ShrN16x16 );
         *uses_vvvv = True;
         goto decode_success;

      }
      break;

   case 0xD2:
      /* VPSRLD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D2 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
                                        "vpsrld", Iop_ShrN32x4 );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPSRLD xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D2 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
                                        "vpsrld", Iop_ShrN32x8 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0xD3:
      /* VPSRLQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D3 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
                                        "vpsrlq", Iop_ShrN64x2 );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPSRLQ xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D3 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
                                        "vpsrlq", Iop_ShrN64x4 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   26836 
   /* 0xD4: VPADDQ -- 64-bit lane add.  0xD5: VPMULLW -- 16-bit lane
      multiply, low halves of the products kept. */
   case 0xD4:
      /* VPADDQ r/m, rV, r ::: r = rV + r/m */
      /* VPADDQ = VEX.NDS.128.66.0F.WIG D4 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpaddq", Iop_Add64x2 );
         goto decode_success;
      }
      /* VPADDQ r/m, rV, r ::: r = rV + r/m */
      /* VPADDQ = VEX.NDS.256.66.0F.WIG D4 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpaddq", Iop_Add64x4 );
         goto decode_success;
      }
      break;

   case 0xD5:
      /* VPMULLW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D5 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpmullw", Iop_Mul16x8 );
         goto decode_success;
      }
      /* VPMULLW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D5 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpmullw", Iop_Mul16x16 );
         goto decode_success;
      }
      break;
   26868 
   /* 0xD6: VMOVQ store form.  0xD7: VPMOVMSKB -- gather the sign bits
      of each byte lane into a GPR. */
   case 0xD6:
      /* I can't even find any Intel docs for this one. */
      /* Basically: 66 0F D6 = MOVQ -- move 64 bits from G (lo half
         xmm) to E (mem or lo half xmm).  Looks like L==0(128), W==0
         (WIG, maybe?) */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0==getRexW(pfx)/*this might be redundant, dunno*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         if (epartIsReg(modrm)) {
            /* fall through, awaiting test case */
            /* dst: lo half copied, hi half zeroed */
            /* The register-to-register form is deliberately left
               undecoded until a real-world test case shows up; the
               decoder falls through to the break and the insn is
               treated as unrecognised. */
         } else {
            /* Memory destination: store only the low 64 bits of rG. */
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            storeLE( mkexpr(addr), getXMMRegLane64( rG, 0 ));
            DIP("vmovq %s,%s\n", nameXMMReg(rG), dis_buf );
            delta += alen;
            goto decode_success;
         }
      }
      break;

   case 0xD7:
      /* VEX.128.66.0F.WIG D7 /r = VPMOVMSKB xmm1, r32 */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVMSKB_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VEX.256.66.0F.WIG D7 /r = VPMOVMSKB ymm1, r32 */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVMSKB_256( vbi, pfx, delta );
         goto decode_success;
      }
      break;
   26903 
   /* 0xD8,0xD9: VPSUBUS{B,W} -- unsigned saturating subtract of
      byte/word lanes.  0xDA: VPMINUB -- unsigned byte minimum. */
   case 0xD8:
      /* VPSUBUSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D8 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpsubusb", Iop_QSub8Ux16 );
         goto decode_success;
      }
      /* VPSUBUSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D8 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpsubusb", Iop_QSub8Ux32 );
         goto decode_success;
      }
      break;

   case 0xD9:
      /* VPSUBUSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D9 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpsubusw", Iop_QSub16Ux8 );
         goto decode_success;
      }
      /* VPSUBUSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D9 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpsubusw", Iop_QSub16Ux16 );
         goto decode_success;
      }
      break;

   case 0xDA:
      /* VPMINUB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DA /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpminub", Iop_Min8Ux16 );
         goto decode_success;
      }
      /* VPMINUB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DA /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpminub", Iop_Min8Ux32 );
         goto decode_success;
      }
      break;
   26948 
   /* 0xDB: VPAND -- bitwise AND of the full vector.  0xDC,0xDD:
      VPADDUS{B,W} -- unsigned saturating add of byte/word lanes. */
   case 0xDB:
      /* VPAND r/m, rV, r ::: r = rV & r/m */
      /* VEX.NDS.128.66.0F.WIG DB /r = VPAND xmm3/m128, xmm2, xmm1 */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpand", Iop_AndV128 );
         goto decode_success;
      }
      /* VPAND r/m, rV, r ::: r = rV & r/m */
      /* VEX.NDS.256.66.0F.WIG DB /r = VPAND ymm3/m256, ymm2, ymm1 */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpand", Iop_AndV256 );
         goto decode_success;
      }
      break;

   case 0xDC:
      /* VPADDUSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DC /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpaddusb", Iop_QAdd8Ux16 );
         goto decode_success;
      }
      /* VPADDUSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DC /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpaddusb", Iop_QAdd8Ux32 );
         goto decode_success;
      }
      break;

   case 0xDD:
      /* VPADDUSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DD /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpaddusw", Iop_QAdd16Ux8 );
         goto decode_success;
      }
      /* VPADDUSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DD /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpaddusw", Iop_QAdd16Ux16 );
         goto decode_success;
      }
      break;
   26995 
   /* 0xDE: VPMAXUB -- unsigned byte maximum.  0xDF: VPANDN --
      AND-with-complement. */
   case 0xDE:
      /* VPMAXUB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DE /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpmaxub", Iop_Max8Ux16 );
         goto decode_success;
      }
      /* VPMAXUB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DE /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpmaxub", Iop_Max8Ux32 );
         goto decode_success;
      }
      break;

   case 0xDF:
      /* VPANDN r/m, rV, r ::: r = rV & ~r/m (is that correct, re the ~ ?) */
      /* NOTE(review): per the Intel SDM, VPANDN computes
         (~SRC1) & SRC2 where SRC1 is the vvvv register, i.e.
         r = ~rV & r/m -- the inversion in the comment above looks
         like it is on the wrong operand.  Which argument
         "invertLeftArg" actually inverts is decided inside the
         helper; confirm there before changing anything. */
      /* VEX.NDS.128.66.0F.WIG DF /r = VPANDN xmm3/m128, xmm2, xmm1 */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpandn", Iop_AndV128,
                    NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
         goto decode_success;
      }
      /* VPANDN r/m, rV, r ::: r = rV & ~r/m (is that correct, re the ~ ?) */
      /* VEX.NDS.256.66.0F.WIG DF /r = VPANDN ymm3/m256, ymm2, ymm1 */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpandn", Iop_AndV256,
                    NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
         goto decode_success;
      }
      break;
   27029 
   /* 0xE0: VPAVGB -- unsigned byte rounding average.  0xE1,0xE2:
      VPSRA{W,D} -- lane-wise arithmetic right shift, count from the
      128-bit E operand (as with the 0xD1..0xD3 logical shifts,
      *uses_vvvv is set explicitly after the helper call).  0xE3:
      VPAVGW -- unsigned word rounding average. */
   case 0xE0:
      /* VPAVGB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E0 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpavgb", Iop_Avg8Ux16 );
         goto decode_success;
      }
      /* VPAVGB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E0 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpavgb", Iop_Avg8Ux32 );
         goto decode_success;
      }
      break;

   case 0xE1:
      /* VPSRAW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E1 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
                                        "vpsraw", Iop_SarN16x8 );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPSRAW xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E1 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
                                        "vpsraw", Iop_SarN16x16 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0xE2:
      /* VPSRAD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E2 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
                                        "vpsrad", Iop_SarN32x4 );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPSRAD xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E2 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
                                        "vpsrad", Iop_SarN32x8 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0xE3:
      /* VPAVGW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E3 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpavgw", Iop_Avg16Ux8 );
         goto decode_success;
      }
      /* VPAVGW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E3 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpavgw", Iop_Avg16Ux16 );
         goto decode_success;
      }
      break;
   27093 
   /* 0xE4: VPMULHUW, 0xE5: VPMULHW -- 16-bit lane multiply keeping
      the high halves of the products (unsigned resp. signed). */
   case 0xE4:
      /* VPMULHUW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E4 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpmulhuw", Iop_MulHi16Ux8 );
         goto decode_success;
      }
      /* VPMULHUW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E4 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpmulhuw", Iop_MulHi16Ux16 );
         goto decode_success;
      }
      break;

   case 0xE5:
      /* VPMULHW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E5 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpmulhw", Iop_MulHi16Sx8 );
         goto decode_success;
      }
      /* VPMULHW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E5 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpmulhw", Iop_MulHi16Sx16 );
         goto decode_success;
      }
      break;
   27123 
   /* 0xE6: three different conversions selected by the SIMD prefix:
      F3 = VCVTDQ2PD (int32 -> f64), 66 = VCVTTPD2DQ (f64 -> int32,
      truncating), F2 = VCVTPD2DQ (f64 -> int32, current rounding
      mode).  Each comes in a 128- and a 256-bit form, giving six
      arms in total. */
   case 0xE6:
      /* VCVTDQ2PD xmm2/m64, xmm1 = VEX.128.F3.0F.WIG E6 /r */
      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTDQ2PD_128(vbi, pfx, delta, True/*isAvx*/);
         goto decode_success;
      }
      /* VCVTDQ2PD xmm2/m128, ymm1 = VEX.256.F3.0F.WIG E6 /r */
      if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTDQ2PD_256(vbi, pfx, delta);
         goto decode_success;
      }
      /* VCVTTPD2DQ xmm2/m128, xmm1 = VEX.128.66.0F.WIG E6 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTxPD2DQ_128(vbi, pfx, delta, True/*isAvx*/,
                                   True/*r2zero*/);
         goto decode_success;
      }
      /* VCVTTPD2DQ ymm2/m256, xmm1 = VEX.256.66.0F.WIG E6 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTxPD2DQ_256(vbi, pfx, delta, True/*r2zero*/);
         goto decode_success;
      }
      /* VCVTPD2DQ xmm2/m128, xmm1 = VEX.128.F2.0F.WIG E6 /r */
      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTxPD2DQ_128(vbi, pfx, delta, True/*isAvx*/,
                                   False/*!r2zero*/);
         goto decode_success;
      }
      /* VCVTPD2DQ ymm2/m256, xmm1 = VEX.256.F2.0F.WIG E6 /r */
      if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTxPD2DQ_256(vbi, pfx, delta, False/*!r2zero*/);
         goto decode_success;
      }
      break;
   27158 
   /* 0xE7: VMOVNTDQ -- non-temporal store of a whole xmm/ymm to
      memory.  The memory operand must be 16- (resp. 32-) byte
      aligned, hence the gen_SEGV_if_not_*_aligned calls; the
      register-to-register encoding is invalid and falls through to
      the break. */
   case 0xE7:
      /* VMOVNTDQ xmm1, m128 = VEX.128.66.0F.WIG E7 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar modrm = getUChar(delta);
         UInt rG     = gregOfRexRM(pfx,modrm);
         if (!epartIsReg(modrm)) {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            storeLE( mkexpr(addr), getXMMReg(rG) );
            /* NOTE(review): this prints "mem,reg", but for a store
               the VMOVQ case at 0xD6 prints "reg,mem" (AT&T order).
               Looks inconsistent -- confirm intended DIP operand
               order before changing. */
            DIP("vmovntdq %s,%s\n", dis_buf, nameXMMReg(rG));
            delta += alen;
            goto decode_success;
         }
         /* else fall through */
      }
      /* VMOVNTDQ ymm1, m256 = VEX.256.66.0F.WIG E7 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar modrm = getUChar(delta);
         UInt rG     = gregOfRexRM(pfx,modrm);
         if (!epartIsReg(modrm)) {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_32_aligned( addr );
            storeLE( mkexpr(addr), getYMMReg(rG) );
            DIP("vmovntdq %s,%s\n", dis_buf, nameYMMReg(rG));
            delta += alen;
            goto decode_success;
         }
         /* else fall through */
      }
      break;
   27189 
   /* 0xE8,0xE9: VPSUBS{B,W} -- signed saturating subtract of
      byte/word lanes.  0xEA: VPMINSW -- signed word minimum.
      0xEB: VPOR -- bitwise OR of the full vector. */
   case 0xE8:
      /* VPSUBSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E8 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpsubsb", Iop_QSub8Sx16 );
         goto decode_success;
      }
      /* VPSUBSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E8 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpsubsb", Iop_QSub8Sx32 );
         goto decode_success;
      }
      break;

   case 0xE9:
      /* VPSUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E9 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpsubsw", Iop_QSub16Sx8 );
         goto decode_success;
      }
      /* VPSUBSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E9 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpsubsw", Iop_QSub16Sx16 );
         goto decode_success;
      }
      break;

   case 0xEA:
      /* VPMINSW r/m, rV, r ::: r = min-signed16s(rV, r/m) */
      /* VPMINSW = VEX.NDS.128.66.0F.WIG EA /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpminsw", Iop_Min16Sx8 );
         goto decode_success;
      }
      /* VPMINSW r/m, rV, r ::: r = min-signed16s(rV, r/m) */
      /* VPMINSW = VEX.NDS.256.66.0F.WIG EA /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpminsw", Iop_Min16Sx16 );
         goto decode_success;
      }
      break;

   case 0xEB:
      /* VPOR r/m, rV, r ::: r = rV | r/m */
      /* VPOR = VEX.NDS.128.66.0F.WIG EB /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpor", Iop_OrV128 );
         goto decode_success;
      }
      /* VPOR r/m, rV, r ::: r = rV | r/m */
      /* VPOR = VEX.NDS.256.66.0F.WIG EB /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpor", Iop_OrV256 );
         goto decode_success;
      }
      break;
   27253 
   /* 0xEC,0xED: VPADDS{B,W} -- signed saturating add of byte/word
      lanes.  0xEE: VPMAXSW -- signed word maximum.  0xEF: VPXOR --
      bitwise XOR of the full vector. */
   case 0xEC:
      /* VPADDSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG EC /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpaddsb", Iop_QAdd8Sx16 );
         goto decode_success;
      }
      /* VPADDSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG EC /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpaddsb", Iop_QAdd8Sx32 );
         goto decode_success;
      }
      break;

   case 0xED:
      /* VPADDSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG ED /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpaddsw", Iop_QAdd16Sx8 );
         goto decode_success;
      }
      /* VPADDSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG ED /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpaddsw", Iop_QAdd16Sx16 );
         goto decode_success;
      }
      break;

   case 0xEE:
      /* VPMAXSW r/m, rV, r ::: r = max-signed16s(rV, r/m) */
      /* VPMAXSW = VEX.NDS.128.66.0F.WIG EE /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpmaxsw", Iop_Max16Sx8 );
         goto decode_success;
      }
      /* VPMAXSW r/m, rV, r ::: r = max-signed16s(rV, r/m) */
      /* VPMAXSW = VEX.NDS.256.66.0F.WIG EE /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpmaxsw", Iop_Max16Sx16 );
         goto decode_success;
      }
      break;

   case 0xEF:
      /* VPXOR r/m, rV, r ::: r = rV ^ r/m */
      /* VPXOR = VEX.NDS.128.66.0F.WIG EF /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpxor", Iop_XorV128 );
         goto decode_success;
      }
      /* VPXOR r/m, rV, r ::: r = rV ^ r/m */
      /* VPXOR = VEX.NDS.256.66.0F.WIG EF /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpxor", Iop_XorV256 );
         goto decode_success;
      }
      break;
   27317 
   /* 0xF0: VLDDQU -- unaligned vector load; note there is
      deliberately no alignment check here (that is the point of
      LDDQU).  Memory-source only: a register E operand breaks out
      and leaves the insn undecoded.  The 256-bit form is tested
      first; the 128-bit form zeroes the upper ymm lane via
      putYMMRegLoAndZU. */
   case 0xF0:
      /* VLDDQU m256, ymm1 = VEX.256.F2.0F.WIG F0 /r */
      if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar  modrm = getUChar(delta);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp tD    = newTemp(Ity_V256);
         if (epartIsReg(modrm)) break;
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         assign(tD, loadLE(Ity_V256, mkexpr(addr)));
         DIP("vlddqu %s,%s\n", dis_buf, nameYMMReg(rD));
         putYMMReg(rD, mkexpr(tD));
         goto decode_success;
      }
      /* VLDDQU m128, xmm1 = VEX.128.F2.0F.WIG F0 /r */
      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp tD    = newTemp(Ity_V128);
         if (epartIsReg(modrm)) break;
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         assign(tD, loadLE(Ity_V128, mkexpr(addr)));
         DIP("vlddqu %s,%s\n", dis_buf, nameXMMReg(rD));
         putYMMRegLoAndZU(rD, mkexpr(tD));
         goto decode_success;
      }
      break;
   27346 
   /* 0xF1..0xF3: VPSLL{W,D,Q} -- lane-wise logical left shift of
      16/32/64-bit lanes; the count comes from the 128-bit E operand,
      mirroring the right shifts at 0xD1..0xD3.  *uses_vvvv is again
      set explicitly after each helper call. */
   case 0xF1:
      /* VPSLLW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F1 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
                                        "vpsllw", Iop_ShlN16x8 );
         *uses_vvvv = True;
         goto decode_success;

      }
      /* VPSLLW xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F1 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
                                        "vpsllw", Iop_ShlN16x16 );
         *uses_vvvv = True;
         goto decode_success;

      }
      break;

   case 0xF2:
      /* VPSLLD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F2 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
                                        "vpslld", Iop_ShlN32x4 );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPSLLD xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F2 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
                                        "vpslld", Iop_ShlN32x8 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0xF3:
      /* VPSLLQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F3 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
                                        "vpsllq", Iop_ShlN64x2 );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPSLLQ xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F3 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
                                        "vpsllq", Iop_ShlN64x4 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   27399 
   /* 0xF4: VPMULUDQ -- 32x32 -> 64 unsigned multiply of the even
      lanes.  0xF5: VPMADDWD -- 16-bit multiply, pairwise add to
      32-bit.  0xF6: VPSADBW -- sum of absolute byte differences.
      All three need widening/reduction, hence the _complex helper
      with a math_* worker rather than a single IROp. */
   case 0xF4:
      /* VPMULUDQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F4 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vpmuludq", math_PMULUDQ_128 );
         goto decode_success;
      }
      /* VPMULUDQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F4 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vpmuludq", math_PMULUDQ_256 );
         goto decode_success;
      }
      break;

   case 0xF5:
      /* VPMADDWD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F5 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vpmaddwd", math_PMADDWD_128 );
         goto decode_success;
      }
      /* VPMADDWD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F5 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vpmaddwd", math_PMADDWD_256 );
         goto decode_success;
      }
      break;

   case 0xF6:
      /* VPSADBW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F6 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vpsadbw", math_PSADBW_128 );
         goto decode_success;
      }
      /* VPSADBW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F6 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vpsadbw", math_PSADBW_256 );
         goto decode_success;
      }
      break;
   27450 
   /* 0xF7: VMASKMOVDQU -- byte-masked store to [RDI/EDI].  Only the
      register-register ModRM form is architecturally valid, hence
      the epartIsReg guard; a memory E operand leaves the insn
      undecoded. */
   case 0xF7:
      /* VMASKMOVDQU xmm2, xmm1 = VEX.128.66.0F.WIG F7 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && epartIsReg(getUChar(delta))) {
         delta = dis_MASKMOVDQU( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      break;
   27459 
   /* 0xF8..0xFA: VPSUB{B,W,D} -- lane-wise (non-saturating)
      subtract of 8/16/32-bit lanes: r = rV - r/m. */
   case 0xF8:
      /* VPSUBB r/m, rV, r ::: r = rV - r/m */
      /* VPSUBB = VEX.NDS.128.66.0F.WIG F8 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpsubb", Iop_Sub8x16 );
         goto decode_success;
      }
      /* VPSUBB r/m, rV, r ::: r = rV - r/m */
      /* VPSUBB = VEX.NDS.256.66.0F.WIG F8 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpsubb", Iop_Sub8x32 );
         goto decode_success;
      }
      break;

   case 0xF9:
      /* VPSUBW r/m, rV, r ::: r = rV - r/m */
      /* VPSUBW = VEX.NDS.128.66.0F.WIG F9 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpsubw", Iop_Sub16x8 );
         goto decode_success;
      }
      /* VPSUBW r/m, rV, r ::: r = rV - r/m */
      /* VPSUBW = VEX.NDS.256.66.0F.WIG F9 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpsubw", Iop_Sub16x16 );
         goto decode_success;
      }
      break;

   case 0xFA:
      /* VPSUBD r/m, rV, r ::: r = rV - r/m */
      /* VPSUBD = VEX.NDS.128.66.0F.WIG FA /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpsubd", Iop_Sub32x4 );
         goto decode_success;
      }
      /* VPSUBD r/m, rV, r ::: r = rV - r/m */
      /* VPSUBD = VEX.NDS.256.66.0F.WIG FA /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpsubd", Iop_Sub32x8 );
         goto decode_success;
      }
      break;
   27510 
   27511    case 0xFB:
   27512       /* VPSUBQ r/m, rV, r ::: r = rV - r/m */
   27513       /* VPSUBQ = VEX.NDS.128.66.0F.WIG FB /r */
   27514       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27515          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   27516                     uses_vvvv, vbi, pfx, delta, "vpsubq", Iop_Sub64x2 );
   27517          goto decode_success;
   27518       }
   27519       /* VPSUBQ r/m, rV, r ::: r = rV - r/m */
   27520       /* VPSUBQ = VEX.NDS.256.66.0F.WIG FB /r */
   27521       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27522          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   27523                     uses_vvvv, vbi, pfx, delta, "vpsubq", Iop_Sub64x4 );
   27524          goto decode_success;
   27525       }
   27526       break;
   27527 
   27528    case 0xFC:
   27529       /* VPADDB r/m, rV, r ::: r = rV + r/m */
   27530       /* VPADDB = VEX.NDS.128.66.0F.WIG FC /r */
   27531       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27532          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   27533                     uses_vvvv, vbi, pfx, delta, "vpaddb", Iop_Add8x16 );
   27534          goto decode_success;
   27535       }
   27536       /* VPADDB r/m, rV, r ::: r = rV + r/m */
   27537       /* VPADDB = VEX.NDS.256.66.0F.WIG FC /r */
   27538       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27539          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   27540                     uses_vvvv, vbi, pfx, delta, "vpaddb", Iop_Add8x32 );
   27541          goto decode_success;
   27542       }
   27543       break;
   27544 
   27545    case 0xFD:
   27546       /* VPADDW r/m, rV, r ::: r = rV + r/m */
   27547       /* VPADDW = VEX.NDS.128.66.0F.WIG FD /r */
   27548       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27549          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   27550                     uses_vvvv, vbi, pfx, delta, "vpaddw", Iop_Add16x8 );
   27551          goto decode_success;
   27552       }
   27553       /* VPADDW r/m, rV, r ::: r = rV + r/m */
   27554       /* VPADDW = VEX.NDS.256.66.0F.WIG FD /r */
   27555       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27556          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   27557                     uses_vvvv, vbi, pfx, delta, "vpaddw", Iop_Add16x16 );
   27558          goto decode_success;
   27559       }
   27560       break;
   27561 
   27562    case 0xFE:
   27563       /* VPADDD r/m, rV, r ::: r = rV + r/m */
   27564       /* VPADDD = VEX.NDS.128.66.0F.WIG FE /r */
   27565       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27566          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   27567                     uses_vvvv, vbi, pfx, delta, "vpaddd", Iop_Add32x4 );
   27568          goto decode_success;
   27569       }
   27570       /* VPADDD r/m, rV, r ::: r = rV + r/m */
   27571       /* VPADDD = VEX.NDS.256.66.0F.WIG FE /r */
   27572       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27573          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   27574                     uses_vvvv, vbi, pfx, delta, "vpaddd", Iop_Add32x8 );
   27575          goto decode_success;
   27576       }
   27577       break;
   27578 
   27579    default:
   27580       break;
   27581 
   27582    }
   27583 
   27584   //decode_failure:
   27585    return deltaIN;
   27586 
   27587   decode_success:
   27588    return delta;
   27589 }
   27590 
   27591 
   27592 /*------------------------------------------------------------*/
   27593 /*---                                                      ---*/
   27594 /*--- Top-level post-escape decoders: dis_ESC_0F38__VEX    ---*/
   27595 /*---                                                      ---*/
   27596 /*------------------------------------------------------------*/
   27597 
   27598 static IRTemp math_PERMILPS_VAR_128 ( IRTemp dataV, IRTemp ctrlV )
   27599 {
   27600    /* In the control vector, zero out all but the bottom two bits of
   27601       each 32-bit lane. */
   27602    IRExpr* cv1 = binop(Iop_ShrN32x4,
   27603                        binop(Iop_ShlN32x4, mkexpr(ctrlV), mkU8(30)),
   27604                        mkU8(30));
   27605    /* And use the resulting cleaned-up control vector as steering
   27606       in a Perm operation. */
   27607    IRTemp res = newTemp(Ity_V128);
   27608    assign(res, binop(Iop_Perm32x4, mkexpr(dataV), cv1));
   27609    return res;
   27610 }
   27611 
   27612 static IRTemp math_PERMILPS_VAR_256 ( IRTemp dataV, IRTemp ctrlV )
   27613 {
   27614    IRTemp dHi, dLo, cHi, cLo;
   27615    dHi = dLo = cHi = cLo = IRTemp_INVALID;
   27616    breakupV256toV128s( dataV, &dHi, &dLo );
   27617    breakupV256toV128s( ctrlV, &cHi, &cLo );
   27618    IRTemp rHi = math_PERMILPS_VAR_128( dHi, cHi );
   27619    IRTemp rLo = math_PERMILPS_VAR_128( dLo, cLo );
   27620    IRTemp res = newTemp(Ity_V256);
   27621    assign(res, binop(Iop_V128HLtoV256, mkexpr(rHi), mkexpr(rLo)));
   27622    return res;
   27623 }
   27624 
   27625 static IRTemp math_PERMILPD_VAR_128 ( IRTemp dataV, IRTemp ctrlV )
   27626 {
   27627    /* No cleverness here .. */
   27628    IRTemp dHi, dLo, cHi, cLo;
   27629    dHi = dLo = cHi = cLo = IRTemp_INVALID;
   27630    breakupV128to64s( dataV, &dHi, &dLo );
   27631    breakupV128to64s( ctrlV, &cHi, &cLo );
   27632    IRExpr* rHi
   27633       = IRExpr_ITE( unop(Iop_64to1,
   27634                          binop(Iop_Shr64, mkexpr(cHi), mkU8(1))),
   27635                     mkexpr(dHi), mkexpr(dLo) );
   27636    IRExpr* rLo
   27637       = IRExpr_ITE( unop(Iop_64to1,
   27638                          binop(Iop_Shr64, mkexpr(cLo), mkU8(1))),
   27639                     mkexpr(dHi), mkexpr(dLo) );
   27640    IRTemp res = newTemp(Ity_V128);
   27641    assign(res, binop(Iop_64HLtoV128, rHi, rLo));
   27642    return res;
   27643 }
   27644 
   27645 static IRTemp math_PERMILPD_VAR_256 ( IRTemp dataV, IRTemp ctrlV )
   27646 {
   27647    IRTemp dHi, dLo, cHi, cLo;
   27648    dHi = dLo = cHi = cLo = IRTemp_INVALID;
   27649    breakupV256toV128s( dataV, &dHi, &dLo );
   27650    breakupV256toV128s( ctrlV, &cHi, &cLo );
   27651    IRTemp rHi = math_PERMILPD_VAR_128( dHi, cHi );
   27652    IRTemp rLo = math_PERMILPD_VAR_128( dLo, cLo );
   27653    IRTemp res = newTemp(Ity_V256);
   27654    assign(res, binop(Iop_V128HLtoV256, mkexpr(rHi), mkexpr(rLo)));
   27655    return res;
   27656 }
   27657 
   27658 static IRTemp math_VPERMD ( IRTemp ctrlV, IRTemp dataV )
   27659 {
   27660    /* In the control vector, zero out all but the bottom three bits of
   27661       each 32-bit lane. */
   27662    IRExpr* cv1 = binop(Iop_ShrN32x8,
   27663                        binop(Iop_ShlN32x8, mkexpr(ctrlV), mkU8(29)),
   27664                        mkU8(29));
   27665    /* And use the resulting cleaned-up control vector as steering
   27666       in a Perm operation. */
   27667    IRTemp res = newTemp(Ity_V256);
   27668    assign(res, binop(Iop_Perm32x8, mkexpr(dataV), cv1));
   27669    return res;
   27670 }
   27671 
   27672 static Long dis_SHIFTX ( /*OUT*/Bool* uses_vvvv,
   27673                          const VexAbiInfo* vbi, Prefix pfx, Long delta,
   27674                          const HChar* opname, IROp op8 )
   27675 {
   27676    HChar   dis_buf[50];
   27677    Int     alen;
   27678    Int     size = getRexW(pfx) ? 8 : 4;
   27679    IRType  ty   = szToITy(size);
   27680    IRTemp  src  = newTemp(ty);
   27681    IRTemp  amt  = newTemp(ty);
   27682    UChar   rm   = getUChar(delta);
   27683 
   27684    assign( amt, getIRegV(size,pfx) );
   27685    if (epartIsReg(rm)) {
   27686       assign( src, getIRegE(size,pfx,rm) );
   27687       DIP("%s %s,%s,%s\n", opname, nameIRegV(size,pfx),
   27688                            nameIRegE(size,pfx,rm), nameIRegG(size,pfx,rm));
   27689       delta++;
   27690    } else {
   27691       IRTemp addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   27692       assign( src, loadLE(ty, mkexpr(addr)) );
   27693       DIP("%s %s,%s,%s\n", opname, nameIRegV(size,pfx), dis_buf,
   27694                            nameIRegG(size,pfx,rm));
   27695       delta += alen;
   27696    }
   27697 
   27698    putIRegG( size, pfx, rm,
   27699              binop(mkSizedOp(ty,op8), mkexpr(src),
   27700                    narrowTo(Ity_I8, binop(mkSizedOp(ty,Iop_And8), mkexpr(amt),
   27701                                           mkU(ty,8*size-1)))) );
   27702    /* Flags aren't modified.  */
   27703    *uses_vvvv = True;
   27704    return delta;
   27705 }
   27706 
   27707 
/* Decode the three-operand FMA families
   vf[n]m{add,sub,addsub,subadd}{132,213,231}{ss,sd,ps,pd}.
   |opc| is the second opcode byte: its high nibble (0x90/0xA0/0xB0)
   selects the operand ordering, its low nibble selects the
   add/sub/negate variant and scalar-vs-packed.  Emits IR-level fused
   multiply-add (Iop_MAddF32/F64) per element into register G. */
static Long dis_FMA ( const VexAbiInfo* vbi, Prefix pfx, Long delta, UChar opc )
{
   UChar  modrm   = getUChar(delta);
   UInt   rG      = gregOfRexRM(pfx, modrm);  /* destination (also a source) */
   UInt   rV      = getVexNvvvv(pfx);         /* vvvv-encoded source */
   /* Scalar forms have an odd low nibble >= 9 (0x9/0xB/0xD/0xF). */
   Bool   scalar  = (opc & 0xF) > 7 && (opc & 1);
   IRType ty      = getRexW(pfx) ? Ity_F64 : Ity_F32;  /* element type */
   IRType vty     = scalar ? ty : (getVexL(pfx) ? Ity_V256 : Ity_V128);
   IRTemp addr    = IRTemp_INVALID;
   HChar  dis_buf[50];
   Int    alen    = 0;
   const HChar *name;    /* "add"/"sub"/"addsub"/"subadd", for disassembly */
   const HChar *suffix;  /* "ss"/"sd"/"ps"/"pd" */
   const HChar *order;   /* "132"/"213"/"231" */
   Bool   negateRes   = False;  /* negate final result (VFNM* forms) */
   Bool   negateZeven = False;  /* negate addend Z in even lanes */
   Bool   negateZodd  = False;  /* negate addend Z in odd lanes */
   UInt   count = 0;

   /* Low nibble: which add/sub variant.  The sub/addsub/subadd cases
      are expressed as negations of the Z (addend) operand, and the
      VFNM* cases additionally negate the whole result. */
   switch (opc & 0xF) {
      case 0x6: name = "addsub"; negateZeven = True; break;
      case 0x7: name = "subadd"; negateZodd = True; break;
      case 0x8:
      case 0x9: name = "add"; break;
      case 0xA:
      case 0xB: name = "sub"; negateZeven = True; negateZodd = True;
         break;
      case 0xC:
      case 0xD: name = "add"; negateRes = True; negateZeven = True;
                                                negateZodd = True; break;
      case 0xE:
      case 0xF: name = "sub"; negateRes = True; break;
      default:  vpanic("dis_FMA(amd64)"); break;
   }
   /* High nibble: operand ordering. */
   switch (opc & 0xF0) {
      case 0x90: order = "132"; break;
      case 0xA0: order = "213"; break;
      case 0xB0: order = "231"; break;
      default:   vpanic("dis_FMA(amd64)"); break;
   }
   if (scalar) {
      suffix = ty == Ity_F64 ? "sd" : "ss";
   } else {
      suffix = ty == Ity_F64 ? "pd" : "ps";
   }

   // Figure out |count| (the number of elements) by considering |vty| and |ty|.
   count = sizeofIRType(vty) / sizeofIRType(ty);
   vassert(count == 1 || count == 2 || count == 4 || count == 8);

   // Fetch operands into the first |count| elements of |sX|, |sY| and |sZ|.
   UInt i;
   IRExpr *sX[8], *sY[8], *sZ[8], *res[8];
   for (i = 0; i < 8; i++) sX[i] = sY[i] = sZ[i] = res[i] = NULL;

   /* Lane accessors specialised for the element type. */
   IRExpr* (*getYMMRegLane)(UInt,Int)
      = ty == Ity_F32 ? getYMMRegLane32F : getYMMRegLane64F;
   void (*putYMMRegLane)(UInt,Int,IRExpr*)
      = ty == Ity_F32 ? putYMMRegLane32F : putYMMRegLane64F;

   /* X comes from the G register, Z from the V register; Y comes from
      E (register or memory) below. */
   for (i = 0; i < count; i++) {
      sX[i] = getYMMRegLane(rG, i);
      sZ[i] = getYMMRegLane(rV, i);
   }

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx, modrm);
      delta += 1;
      for (i = 0; i < count; i++) {
         sY[i] = getYMMRegLane(rE, i);
      }
      if (vty == Ity_V256) {
         DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "",
             name, order, suffix, nameYMMReg(rE), nameYMMReg(rV),
             nameYMMReg(rG));
      } else {
         DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "",
             name, order, suffix, nameXMMReg(rE), nameXMMReg(rV),
             nameXMMReg(rG));
      }
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      delta += alen;
      /* Memory operand: load |count| consecutive elements. */
      for (i = 0; i < count; i++) {
         sY[i] = loadLE(ty, binop(Iop_Add64, mkexpr(addr),
                                  mkU64(i * sizeofIRType(ty))));
      }
      if (vty == Ity_V256) {
         DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "",
             name, order, suffix, dis_buf, nameYMMReg(rV),
             nameYMMReg(rG));
      } else {
         DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "",
             name, order, suffix, dis_buf, nameXMMReg(rV),
             nameXMMReg(rG));
      }
   }

   /* vX/vY/vZ are now in 132 order.  If the instruction requires a different
      order, swap them around.  */

#  define COPY_ARR(_dst, _src) \
      do { for (int j = 0; j < 8; j++) { _dst[j] = _src[j]; } } while (0)

   /* 213: rotate (X,Y,Z) -> (Z,X,Y); 231: swap X and Z. */
   if ((opc & 0xF0) != 0x90) {
      IRExpr* temp[8];
      COPY_ARR(temp, sX);
      if ((opc & 0xF0) == 0xA0) {
         COPY_ARR(sX, sZ);
         COPY_ARR(sZ, sY);
         COPY_ARR(sY, temp);
      } else {
         COPY_ARR(sX, sZ);
         COPY_ARR(sZ, temp);
      }
   }

#  undef COPY_ARR

   /* Per element: res = (+/-)(X * Y + (+/-)Z), fused.  The rounding
      mode comes from get_FAKE_roundingmode — presumably a placeholder
      rather than the true MXCSR rounding mode; see its definition. */
   for (i = 0; i < count; i++) {
      IROp opNEG = ty == Ity_F64 ? Iop_NegF64 : Iop_NegF32;
      if ((i & 1) ? negateZodd : negateZeven) {
         sZ[i] = unop(opNEG, sZ[i]);
      }
      res[i] = IRExpr_Qop(ty == Ity_F64 ? Iop_MAddF64 : Iop_MAddF32,
                          get_FAKE_roundingmode(), sX[i], sY[i], sZ[i]);
      if (negateRes) {
         res[i] = unop(opNEG, res[i]);
      }
   }

   for (i = 0; i < count; i++) {
      putYMMRegLane(rG, i, res[i]);
   }

   /* Zero all lanes of G above the computed ones; the fallthroughs
      zero progressively wider upper portions for the narrower forms.
      NOTE(review): for the scalar (Ity_F32/Ity_F64) cases this zeroes
      xmm bits 127:32 (resp. 127:64), whereas the Intel SDM pseudocode
      for scalar FMA says DEST[127:32] is left unchanged — confirm
      against the SDM / hardware before relying on upper-lane values. */
   switch (vty) {
      case Ity_F32:  putYMMRegLane32(rG, 1, mkU32(0)); /*fallthru*/
      case Ity_F64:  putYMMRegLane64(rG, 1, mkU64(0)); /*fallthru*/
      case Ity_V128: putYMMRegLane128(rG, 1, mkV128(0)); /*fallthru*/
      case Ity_V256: break;
      default: vassert(0);
   }

   return delta;
}
   27853 
   27854 
   27855 /* Masked load or masked store. */
   27856 static ULong dis_VMASKMOV ( Bool *uses_vvvv, const VexAbiInfo* vbi,
   27857                             Prefix pfx, Long delta,
   27858                             const HChar* opname, Bool isYMM, IRType ty,
   27859                             Bool isLoad )
   27860 {
   27861    HChar   dis_buf[50];
   27862    Int     alen, i;
   27863    IRTemp  addr;
   27864    UChar   modrm = getUChar(delta);
   27865    UInt    rG    = gregOfRexRM(pfx,modrm);
   27866    UInt    rV    = getVexNvvvv(pfx);
   27867 
   27868    addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   27869    delta += alen;
   27870 
   27871    /**/ if (isLoad && isYMM) {
   27872       DIP("%s %s,%s,%s\n", opname, dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
   27873    }
   27874    else if (isLoad && !isYMM) {
   27875       DIP("%s %s,%s,%s\n", opname, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
   27876    }
   27877 
   27878    else if (!isLoad && isYMM) {
   27879       DIP("%s %s,%s,%s\n", opname, nameYMMReg(rG), nameYMMReg(rV), dis_buf );
   27880    }
   27881    else {
   27882       vassert(!isLoad && !isYMM);
   27883       DIP("%s %s,%s,%s\n", opname, nameXMMReg(rG), nameXMMReg(rV), dis_buf );
   27884    }
   27885 
   27886    vassert(ty == Ity_I32 || ty == Ity_I64);
   27887    Bool laneIs32 = ty == Ity_I32;
   27888 
   27889    Int nLanes = (isYMM ? 2 : 1) * (laneIs32 ? 4 : 2);
   27890 
   27891    for (i = 0; i < nLanes; i++) {
   27892       IRExpr* shAmt = laneIs32 ? mkU8(31)    : mkU8(63);
   27893       IRExpr* one   = laneIs32 ? mkU32(1)    : mkU64(1);
   27894       IROp    opSHR = laneIs32 ? Iop_Shr32   : Iop_Shr64;
   27895       IROp    opEQ  = laneIs32 ? Iop_CmpEQ32 : Iop_CmpEQ64;
   27896       IRExpr* lane  = (laneIs32 ? getYMMRegLane32 : getYMMRegLane64)( rV, i );
   27897 
   27898       IRTemp  cond = newTemp(Ity_I1);
   27899       assign(cond, binop(opEQ, binop(opSHR, lane, shAmt), one));
   27900 
   27901       IRTemp  data = newTemp(ty);
   27902       IRExpr* ea   = binop(Iop_Add64, mkexpr(addr),
   27903                                       mkU64(i * (laneIs32 ? 4 : 8)));
   27904       if (isLoad) {
   27905          stmt(
   27906             IRStmt_LoadG(
   27907                Iend_LE, laneIs32 ? ILGop_Ident32 : ILGop_Ident64,
   27908                data, ea, laneIs32 ? mkU32(0) : mkU64(0), mkexpr(cond)
   27909          ));
   27910          (laneIs32 ? putYMMRegLane32 : putYMMRegLane64)( rG, i, mkexpr(data) );
   27911       } else {
   27912          assign(data, (laneIs32 ? getYMMRegLane32 : getYMMRegLane64)( rG, i ));
   27913          stmt( IRStmt_StoreG(Iend_LE, ea, mkexpr(data), mkexpr(cond)) );
   27914       }
   27915    }
   27916 
   27917    if (isLoad && !isYMM)
   27918       putYMMRegLane128( rG, 1, mkV128(0) );
   27919 
   27920    *uses_vvvv = True;
   27921    return delta;
   27922 }
   27923 
   27924 
   27925 /* Gather.  */
static ULong dis_VGATHER ( Bool *uses_vvvv, const VexAbiInfo* vbi,
                           Prefix pfx, Long delta,
                           const HChar* opname, Bool isYMM,
                           Bool isVM64x, IRType ty )
{
   /* Decode VGATHERDPS/QPS/DPD/QPD and VPGATHERDD/QD/DQ/QQ.
      |ty| (Ity_I32/Ity_I64) is the element width, |isVM64x| says the
      index vector has 64-bit indices, |isYMM| selects the 256-bit
      form.  V is the mask register, G the destination, and the VSIB
      memory operand supplies base + index-vector*scale. */
   HChar  dis_buf[50];
   Int    alen, i, vscale, count1, count2;
   IRTemp addr;
   UChar  modrm = getUChar(delta);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   UInt   rV    = getVexNvvvv(pfx);
   UInt   rI;
   /* Width of the destination and of the index vector follow from the
      element type and index width (e.g. 64-bit elements with 32-bit
      indices only need a 128-bit index vector for a 256-bit dest). */
   IRType dstTy = (isYMM && (ty == Ity_I64 || !isVM64x)) ? Ity_V256 : Ity_V128;
   IRType idxTy = (isYMM && (ty == Ity_I32 || isVM64x)) ? Ity_V256 : Ity_V128;
   IRTemp cond;
   addr = disAVSIBMode ( &alen, vbi, pfx, delta, dis_buf, &rI,
                         idxTy, &vscale );
   /* Decode failure (return the unadvanced delta) if the VSIB operand
      is invalid or if index, mask and destination registers are not
      all distinct, as the ISA requires. */
   if (addr == IRTemp_INVALID || rI == rG || rI == rV || rG == rV)
      return delta;
   if (dstTy == Ity_V256) {
      DIP("%s %s,%s,%s\n", opname, nameYMMReg(rV), dis_buf, nameYMMReg(rG) );
   } else {
      DIP("%s %s,%s,%s\n", opname, nameXMMReg(rV), dis_buf, nameXMMReg(rG) );
   }
   delta += alen;

   /* count1 = number of mask elements to canonicalise below;
      count2 = number of elements actually gathered (fewer when
      64-bit indices address 32-bit elements). */
   if (ty == Ity_I32) {
      count1 = isYMM ? 8 : 4;
      count2 = isVM64x ? count1 / 2 : count1;
   } else {
      count1 = count2 = isYMM ? 4 : 2;
   }

   /* First update the mask register to copies of the sign bit.  */
   if (ty == Ity_I32) {
      if (isYMM)
         putYMMReg( rV, binop(Iop_SarN32x8, getYMMReg( rV ), mkU8(31)) );
      else
         putYMMRegLoAndZU( rV, binop(Iop_SarN32x4, getXMMReg( rV ), mkU8(31)) );
   } else {
      for (i = 0; i < count1; i++) {
         putYMMRegLane64( rV, i, binop(Iop_Sar64, getYMMRegLane64( rV, i ),
                                       mkU8(63)) );
      }
   }

   /* Next gather the individual elements.  If any fault occurs, the
      corresponding mask element will be set and the loop stops.  */
   for (i = 0; i < count2; i++) {
      IRExpr *expr, *addr_expr;
      /* cond <- this element's mask lane is negative (sign bit set). */
      cond = newTemp(Ity_I1);
      assign( cond,
              binop(ty == Ity_I32 ? Iop_CmpLT32S : Iop_CmpLT64S,
                    ty == Ity_I32 ? getYMMRegLane32( rV, i )
                                  : getYMMRegLane64( rV, i ),
                    mkU(ty, 0)) );
      /* |expr| starts as the current dest lane, so an unselected lane
         keeps its old value. */
      expr = ty == Ity_I32 ? getYMMRegLane32( rG, i )
                           : getYMMRegLane64( rG, i );
      /* Element address = base + sign-extended index * vscale. */
      addr_expr = isVM64x ? getYMMRegLane64( rI, i )
                          : unop(Iop_32Sto64, getYMMRegLane32( rI, i ));
      switch (vscale) {
         case 2: addr_expr = binop(Iop_Shl64, addr_expr, mkU8(1)); break;
         case 4: addr_expr = binop(Iop_Shl64, addr_expr, mkU8(2)); break;
         case 8: addr_expr = binop(Iop_Shl64, addr_expr, mkU8(3)); break;
         default: break;
      }
      addr_expr = binop(Iop_Add64, mkexpr(addr), addr_expr);
      addr_expr = handleAddrOverrides(vbi, pfx, addr_expr);
      /* When the mask bit is clear, redirect the (unconditional) load
         to RSP instead — presumably because the stack pointer is a
         known-mappable address, making the dummy load safe. */
      addr_expr = IRExpr_ITE(mkexpr(cond), addr_expr, getIReg64(R_RSP));
      expr = IRExpr_ITE(mkexpr(cond), loadLE(ty, addr_expr), expr);
      if (ty == Ity_I32) {
         putYMMRegLane32( rG, i, expr );
         /* Clear the mask element once its gather has completed. */
         putYMMRegLane32( rV, i, mkU32(0) );
      } else {
         putYMMRegLane64( rG, i, expr);
         putYMMRegLane64( rV, i, mkU64(0) );
      }
   }

   /* Finally, zero the parts of G and V above the gathered elements
      for the forms that don't fill the whole 256-bit register. */
   if (!isYMM || (ty == Ity_I32 && isVM64x)) {
      if (ty == Ity_I64 || isYMM)
         putYMMRegLane128( rV, 1, mkV128(0) );
      else if (ty == Ity_I32 && count2 == 2) {
         putYMMRegLane64( rV, 1, mkU64(0) );
         putYMMRegLane64( rG, 1, mkU64(0) );
      }
      putYMMRegLane128( rG, 1, mkV128(0) );
   }

   *uses_vvvv = True;
   return delta;
}
   28018 
   28019 
   28020 __attribute__((noinline))
   28021 static
   28022 Long dis_ESC_0F38__VEX (
   28023         /*MB_OUT*/DisResult* dres,
   28024         /*OUT*/   Bool*      uses_vvvv,
   28025         Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
   28026         Bool         resteerCisOk,
   28027         void*        callback_opaque,
   28028         const VexArchInfo* archinfo,
   28029         const VexAbiInfo*  vbi,
   28030         Prefix pfx, Int sz, Long deltaIN
   28031      )
   28032 {
   28033    IRTemp addr  = IRTemp_INVALID;
   28034    Int    alen  = 0;
   28035    HChar  dis_buf[50];
   28036    Long   delta = deltaIN;
   28037    UChar  opc   = getUChar(delta);
   28038    delta++;
   28039    *uses_vvvv = False;
   28040 
   28041    switch (opc) {
   28042 
   28043    case 0x00:
   28044       /* VPSHUFB r/m, rV, r ::: r = shuf(rV, r/m) */
   28045       /* VPSHUFB = VEX.NDS.128.66.0F38.WIG 00 /r */
   28046       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   28047          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
   28048                     uses_vvvv, vbi, pfx, delta, "vpshufb", math_PSHUFB_XMM );
   28049          goto decode_success;
   28050       }
   28051       /* VPSHUFB r/m, rV, r ::: r = shuf(rV, r/m) */
   28052       /* VPSHUFB = VEX.NDS.256.66.0F38.WIG 00 /r */
   28053       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   28054          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   28055                     uses_vvvv, vbi, pfx, delta, "vpshufb", math_PSHUFB_YMM );
   28056          goto decode_success;
   28057       }
   28058       break;
   28059 
   28060    case 0x01:
   28061    case 0x02:
   28062    case 0x03:
   28063       /* VPHADDW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 01 /r */
   28064       /* VPHADDD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 02 /r */
   28065       /* VPHADDSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 03 /r */
   28066       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   28067          delta = dis_PHADD_128( vbi, pfx, delta, True/*isAvx*/, opc );
   28068          *uses_vvvv = True;
   28069          goto decode_success;
   28070       }
   28071       /* VPHADDW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 01 /r */
   28072       /* VPHADDD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 02 /r */
   28073       /* VPHADDSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 03 /r */
   28074       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   28075          delta = dis_PHADD_256( vbi, pfx, delta, opc );
   28076          *uses_vvvv = True;
   28077          goto decode_success;
   28078       }
   28079       break;
   28080 
   28081    case 0x04:
   28082       /* VPMADDUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 04 /r */
   28083       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   28084          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
   28085                     uses_vvvv, vbi, pfx, delta, "vpmaddubsw",
   28086                     math_PMADDUBSW_128 );
   28087          goto decode_success;
   28088       }
   28089       /* VPMADDUBSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 04 /r */
   28090       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   28091          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   28092                     uses_vvvv, vbi, pfx, delta, "vpmaddubsw",
   28093                     math_PMADDUBSW_256 );
   28094          goto decode_success;
   28095       }
   28096       break;
   28097 
   28098    case 0x05:
   28099    case 0x06:
   28100    case 0x07:
   28101       /* VPHSUBW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 05 /r */
   28102       /* VPHSUBD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 06 /r */
   28103       /* VPHSUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 07 /r */
   28104       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   28105          delta = dis_PHADD_128( vbi, pfx, delta, True/*isAvx*/, opc );
   28106          *uses_vvvv = True;
   28107          goto decode_success;
   28108       }
   28109       /* VPHSUBW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 05 /r */
   28110       /* VPHSUBD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 06 /r */
   28111       /* VPHSUBSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 07 /r */
   28112       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   28113          delta = dis_PHADD_256( vbi, pfx, delta, opc );
   28114          *uses_vvvv = True;
   28115          goto decode_success;
   28116       }
   28117       break;
   28118 
   28119    case 0x08:
   28120    case 0x09:
   28121    case 0x0A:
   28122       /* VPSIGNB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 08 /r */
   28123       /* VPSIGNW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 09 /r */
   28124       /* VPSIGND xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 0A /r */
   28125       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   28126          IRTemp sV      = newTemp(Ity_V128);
   28127          IRTemp dV      = newTemp(Ity_V128);
   28128          IRTemp sHi, sLo, dHi, dLo;
   28129          sHi = sLo = dHi = dLo = IRTemp_INVALID;
   28130          HChar  ch      = '?';
   28131          Int    laneszB = 0;
   28132          UChar  modrm   = getUChar(delta);
   28133          UInt   rG      = gregOfRexRM(pfx,modrm);
   28134          UInt   rV      = getVexNvvvv(pfx);
   28135 
   28136          switch (opc) {
   28137             case 0x08: laneszB = 1; ch = 'b'; break;
   28138             case 0x09: laneszB = 2; ch = 'w'; break;
   28139             case 0x0A: laneszB = 4; ch = 'd'; break;
   28140             default: vassert(0);
   28141          }
   28142 
   28143          assign( dV, getXMMReg(rV) );
   28144 
   28145          if (epartIsReg(modrm)) {
   28146             UInt rE = eregOfRexRM(pfx,modrm);
   28147             assign( sV, getXMMReg(rE) );
   28148             delta += 1;
   28149             DIP("vpsign%c %s,%s,%s\n", ch, nameXMMReg(rE),
   28150                 nameXMMReg(rV), nameXMMReg(rG));
   28151          } else {
   28152             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   28153             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   28154             delta += alen;
   28155             DIP("vpsign%c %s,%s,%s\n", ch, dis_buf,
   28156                 nameXMMReg(rV), nameXMMReg(rG));
   28157          }
   28158 
   28159          breakupV128to64s( dV, &dHi, &dLo );
   28160          breakupV128to64s( sV, &sHi, &sLo );
   28161 
   28162          putYMMRegLoAndZU(
   28163             rG,
   28164             binop(Iop_64HLtoV128,
   28165                   dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
   28166                   dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
   28167             )
   28168          );
   28169          *uses_vvvv = True;
   28170          goto decode_success;
   28171       }
      /* VPSIGNB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 08 /r */
      /* VPSIGNW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 09 /r */
      /* VPSIGND ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 0A /r */
      /* 256-bit VPSIGN: negate/zero/pass each lane of the vvvv source
         according to the sign of the corresponding lane of the E
         operand, done per 64-bit chunk via dis_PSIGN_helper. */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         IRTemp sV      = newTemp(Ity_V256);
         IRTemp dV      = newTemp(Ity_V256);
         IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
         s3 = s2 = s1 = s0 = IRTemp_INVALID;
         d3 = d2 = d1 = d0 = IRTemp_INVALID;
         UChar  ch      = '?';
         Int    laneszB = 0;
         UChar  modrm   = getUChar(delta);
         UInt   rG      = gregOfRexRM(pfx,modrm);
         UInt   rV      = getVexNvvvv(pfx);

         /* opc selects the lane width (byte/word/dword) and the
            mnemonic suffix used by DIP. */
         switch (opc) {
            case 0x08: laneszB = 1; ch = 'b'; break;
            case 0x09: laneszB = 2; ch = 'w'; break;
            case 0x0A: laneszB = 4; ch = 'd'; break;
            default: vassert(0);
         }

         assign( dV, getYMMReg(rV) );

         /* E operand: either a YMM register (advance past modrm byte)
            or a 256-bit memory load (advance past the whole amode). */
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( sV, getYMMReg(rE) );
            delta += 1;
            DIP("vpsign%c %s,%s,%s\n", ch, nameYMMReg(rE),
                nameYMMReg(rV), nameYMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
            delta += alen;
            DIP("vpsign%c %s,%s,%s\n", ch, dis_buf,
                nameYMMReg(rV), nameYMMReg(rG));
         }

         /* Split both 256-bit values into four I64 chunks, apply the
            PSIGN helper chunkwise, and reassemble into the G reg. */
         breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
         breakupV256to64s( sV, &s3, &s2, &s1, &s0 );

         putYMMReg(
            rG,
            binop( Iop_V128HLtoV256,
                   binop(Iop_64HLtoV128,
                         dis_PSIGN_helper( mkexpr(s3), mkexpr(d3), laneszB ),
                         dis_PSIGN_helper( mkexpr(s2), mkexpr(d2), laneszB )
                   ),
                   binop(Iop_64HLtoV128,
                         dis_PSIGN_helper( mkexpr(s1), mkexpr(d1), laneszB ),
                         dis_PSIGN_helper( mkexpr(s0), mkexpr(d0), laneszB )
                   )
            )
         );
         *uses_vvvv = True;   /* VEX.NDS form: vvvv carries a source reg */
         goto decode_success;
      }
      break;
   28230 
   28231    case 0x0B:
   28232       /* VPMULHRSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 0B /r */
   28233       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   28234          IRTemp sV      = newTemp(Ity_V128);
   28235          IRTemp dV      = newTemp(Ity_V128);
   28236          IRTemp sHi, sLo, dHi, dLo;
   28237          sHi = sLo = dHi = dLo = IRTemp_INVALID;
   28238          UChar  modrm   = getUChar(delta);
   28239          UInt   rG      = gregOfRexRM(pfx,modrm);
   28240          UInt   rV      = getVexNvvvv(pfx);
   28241 
   28242          assign( dV, getXMMReg(rV) );
   28243 
   28244          if (epartIsReg(modrm)) {
   28245             UInt rE = eregOfRexRM(pfx,modrm);
   28246             assign( sV, getXMMReg(rE) );
   28247             delta += 1;
   28248             DIP("vpmulhrsw %s,%s,%s\n", nameXMMReg(rE),
   28249                 nameXMMReg(rV), nameXMMReg(rG));
   28250          } else {
   28251             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   28252             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   28253             delta += alen;
   28254             DIP("vpmulhrsw %s,%s,%s\n", dis_buf,
   28255                 nameXMMReg(rV), nameXMMReg(rG));
   28256          }
   28257 
   28258          breakupV128to64s( dV, &dHi, &dLo );
   28259          breakupV128to64s( sV, &sHi, &sLo );
   28260 
   28261          putYMMRegLoAndZU(
   28262             rG,
   28263             binop(Iop_64HLtoV128,
   28264                   dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
   28265                   dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
   28266             )
   28267          );
   28268          *uses_vvvv = True;
   28269          goto decode_success;
   28270       }
   28271       /* VPMULHRSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 0B /r */
   28272       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   28273          IRTemp sV      = newTemp(Ity_V256);
   28274          IRTemp dV      = newTemp(Ity_V256);
   28275          IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   28276          s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   28277          UChar  modrm   = getUChar(delta);
   28278          UInt   rG      = gregOfRexRM(pfx,modrm);
   28279          UInt   rV      = getVexNvvvv(pfx);
   28280 
   28281          assign( dV, getYMMReg(rV) );
   28282 
   28283          if (epartIsReg(modrm)) {
   28284             UInt rE = eregOfRexRM(pfx,modrm);
   28285             assign( sV, getYMMReg(rE) );
   28286             delta += 1;
   28287             DIP("vpmulhrsw %s,%s,%s\n", nameYMMReg(rE),
   28288                 nameYMMReg(rV), nameYMMReg(rG));
   28289          } else {
   28290             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   28291             assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
   28292             delta += alen;
   28293             DIP("vpmulhrsw %s,%s,%s\n", dis_buf,
   28294                 nameYMMReg(rV), nameYMMReg(rG));
   28295          }
   28296 
   28297          breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
   28298          breakupV256to64s( sV, &s3, &s2, &s1, &s0 );
   28299 
   28300          putYMMReg(
   28301             rG,
   28302             binop(Iop_V128HLtoV256,
   28303                   binop(Iop_64HLtoV128,
   28304                         dis_PMULHRSW_helper( mkexpr(s3), mkexpr(d3) ),
   28305                         dis_PMULHRSW_helper( mkexpr(s2), mkexpr(d2) ) ),
   28306                   binop(Iop_64HLtoV128,
   28307                         dis_PMULHRSW_helper( mkexpr(s1), mkexpr(d1) ),
   28308                         dis_PMULHRSW_helper( mkexpr(s0), mkexpr(d0) ) )
   28309             )
   28310          );
   28311          *uses_vvvv = True;
   28312          dres->hint = Dis_HintVerbose;
   28313          goto decode_success;
   28314       }
   28315       break;
   28316 
   case 0x0C:
      /* VPERMILPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 0C /r */
      /* Variable permute of 32-bit float lanes of the vvvv operand,
         selectors taken from the E operand (math_PERMILPS_VAR_128). */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp ctrlV = newTemp(Ity_V128);
         /* ctrlV = lane selectors from register or memory. */
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            DIP("vpermilps %s,%s,%s\n",
                nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
            assign(ctrlV, getXMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            DIP("vpermilps %s,%s,%s\n",
                dis_buf, nameXMMReg(rV), nameXMMReg(rG));
            assign(ctrlV, loadLE(Ity_V128, mkexpr(addr)));
         }
         IRTemp dataV = newTemp(Ity_V128);
         assign(dataV, getXMMReg(rV));
         IRTemp resV = math_PERMILPS_VAR_128(dataV, ctrlV);
         /* VEX.128: write low lane, zero the YMM upper half. */
         putYMMRegLoAndZU(rG, mkexpr(resV));
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPERMILPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 0C /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp ctrlV = newTemp(Ity_V256);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            DIP("vpermilps %s,%s,%s\n",
                nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
            assign(ctrlV, getYMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            DIP("vpermilps %s,%s,%s\n",
                dis_buf, nameYMMReg(rV), nameYMMReg(rG));
            assign(ctrlV, loadLE(Ity_V256, mkexpr(addr)));
         }
         IRTemp dataV = newTemp(Ity_V256);
         assign(dataV, getYMMReg(rV));
         IRTemp resV = math_PERMILPS_VAR_256(dataV, ctrlV);
         putYMMReg(rG, mkexpr(resV));
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   28373 
   case 0x0D:
      /* VPERMILPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 0D /r */
      /* Variable permute of 64-bit float lanes; same shape as the
         VPERMILPS arm above but using the _PD math helpers. */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp ctrlV = newTemp(Ity_V128);
         /* ctrlV = lane selectors from register or memory. */
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            DIP("vpermilpd %s,%s,%s\n",
                nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
            assign(ctrlV, getXMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            DIP("vpermilpd %s,%s,%s\n",
                dis_buf, nameXMMReg(rV), nameXMMReg(rG));
            assign(ctrlV, loadLE(Ity_V128, mkexpr(addr)));
         }
         IRTemp dataV = newTemp(Ity_V128);
         assign(dataV, getXMMReg(rV));
         IRTemp resV = math_PERMILPD_VAR_128(dataV, ctrlV);
         /* VEX.128: write low lane, zero the YMM upper half. */
         putYMMRegLoAndZU(rG, mkexpr(resV));
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPERMILPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 0D /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp ctrlV = newTemp(Ity_V256);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            DIP("vpermilpd %s,%s,%s\n",
                nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
            assign(ctrlV, getYMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            DIP("vpermilpd %s,%s,%s\n",
                dis_buf, nameYMMReg(rV), nameYMMReg(rG));
            assign(ctrlV, loadLE(Ity_V256, mkexpr(addr)));
         }
         IRTemp dataV = newTemp(Ity_V256);
         assign(dataV, getYMMReg(rV));
         IRTemp resV = math_PERMILPD_VAR_256(dataV, ctrlV);
         putYMMReg(rG, mkexpr(resV));
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   28430 
   case 0x0E:
      /* VTESTPS xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 0E /r */
      /* All four VTEST forms delegate to dis_xTESTy_{128,256}; the
         final argument (32 or 64) selects the sign-bit lane width. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_xTESTy_128( vbi, pfx, delta, True/*isAvx*/, 32 );
         goto decode_success;
      }
      /* VTESTPS ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 0E /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_xTESTy_256( vbi, pfx, delta, 32 );
         goto decode_success;
      }
      break;

   case 0x0F:
      /* VTESTPD xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 0F /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_xTESTy_128( vbi, pfx, delta, True/*isAvx*/, 64 );
         goto decode_success;
      }
      /* VTESTPD ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 0F /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_xTESTy_256( vbi, pfx, delta, 64 );
         goto decode_success;
      }
      break;
   28456 
   case 0x16:
      /* VPERMPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 16 /r */
      /* Full-width dword permute; shares math_VPERMD since VPERMPS and
         VPERMD move the same 32-bit lanes. */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpermps", math_VPERMD );
         goto decode_success;
      }
      break;

   case 0x17:
      /* VPTEST xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 17 /r */
      /* Lane width 0 = whole-vector bitwise test (vs 32/64 for VTEST). */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_xTESTy_128( vbi, pfx, delta, True/*isAvx*/, 0 );
         goto decode_success;
      }
      /* VPTEST ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 17 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_xTESTy_256( vbi, pfx, delta, 0 );
         goto decode_success;
      }
      break;
   28479 
   case 0x18:
      /* VBROADCASTSS m32, xmm1 = VEX.128.66.0F38.WIG 18 /r */
      /* Four forms: {m32, xmm2} source x {xmm1, ymm1} destination.
         The 32-bit scalar is duplicated pairwise into an I64, then
         the I64 is replicated across the destination vector. */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/
          && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         DIP("vbroadcastss %s,%s\n", dis_buf, nameXMMReg(rG));
         IRTemp t32 = newTemp(Ity_I32);
         assign(t32, loadLE(Ity_I32, mkexpr(addr)));
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
         IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
         putYMMRegLoAndZU(rG, res);
         goto decode_success;
      }
      /* VBROADCASTSS m32, ymm1 = VEX.256.66.0F38.WIG 18 /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/
          && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         DIP("vbroadcastss %s,%s\n", dis_buf, nameYMMReg(rG));
         IRTemp t32 = newTemp(Ity_I32);
         assign(t32, loadLE(Ity_I32, mkexpr(addr)));
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
                                                  mkexpr(t64), mkexpr(t64));
         putYMMReg(rG, res);
         goto decode_success;
      }
      /* VBROADCASTSS xmm2, xmm1 = VEX.128.66.0F38.WIG 18 /r */
      /* Register-source forms broadcast lane 0 of the source XMM. */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/
          && epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         UInt  rE    = eregOfRexRM(pfx, modrm);
         DIP("vbroadcastss %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
         IRTemp t32 = newTemp(Ity_I32);
         assign(t32, getXMMRegLane32(rE, 0));
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
         IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
         putYMMRegLoAndZU(rG, res);
         delta++;   /* consume modrm; no amode in register form */
         goto decode_success;
      }
      /* VBROADCASTSS xmm2, ymm1 = VEX.256.66.0F38.WIG 18 /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/
          && epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         UInt  rE    = eregOfRexRM(pfx, modrm);
         DIP("vbroadcastss %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
         IRTemp t32 = newTemp(Ity_I32);
         assign(t32, getXMMRegLane32(rE, 0));
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
                                                  mkexpr(t64), mkexpr(t64));
         putYMMReg(rG, res);
         delta++;
         goto decode_success;
      }
      break;
   28552 
   case 0x19:
      /* VBROADCASTSD m64, ymm1 = VEX.256.66.0F38.WIG 19 /r */
      /* Replicate one 64-bit value into all four qword lanes of ymm1.
         Only 256-bit forms exist for VBROADCASTSD. */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/
          && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         DIP("vbroadcastsd %s,%s\n", dis_buf, nameYMMReg(rG));
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, loadLE(Ity_I64, mkexpr(addr)));
         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
                                                  mkexpr(t64), mkexpr(t64));
         putYMMReg(rG, res);
         goto decode_success;
      }
      /* VBROADCASTSD xmm2, ymm1 = VEX.256.66.0F38.WIG 19 /r */
      /* Register form broadcasts lane 0 (low qword) of xmm2. */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/
          && epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         UInt  rE    = eregOfRexRM(pfx, modrm);
         DIP("vbroadcastsd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, getXMMRegLane64(rE, 0));
         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
                                                  mkexpr(t64), mkexpr(t64));
         putYMMReg(rG, res);
         delta++;   /* consume modrm; no amode in register form */
         goto decode_success;
      }
      break;

   case 0x1A:
      /* VBROADCASTF128 m128, ymm1 = VEX.256.66.0F38.WIG 1A /r */
      /* Duplicate a 128-bit memory value into both halves of ymm1;
         memory-source only (register E part is not decoded here). */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/
          && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         DIP("vbroadcastf128 %s,%s\n", dis_buf, nameYMMReg(rG));
         IRTemp t128 = newTemp(Ity_V128);
         assign(t128, loadLE(Ity_V128, mkexpr(addr)));
         putYMMReg( rG, binop(Iop_V128HLtoV256, mkexpr(t128), mkexpr(t128)) );
         goto decode_success;
      }
      break;
   28604 
   case 0x1C:
      /* VPABSB xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 1C /r */
      /* VPABSB/W/D all route through the unary E-to-G framework with a
         per-lane-width math_PABS_* callback (pap1/2/4 = lane bytes). */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary(
                    uses_vvvv, vbi, pfx, delta,
                    "vpabsb", math_PABS_XMM_pap1 );
         goto decode_success;
      }
      /* VPABSB ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 1C /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary(
                    uses_vvvv, vbi, pfx, delta,
                    "vpabsb", math_PABS_YMM_pap1 );
         goto decode_success;
      }
      break;

   case 0x1D:
      /* VPABSW xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 1D /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary(
                    uses_vvvv, vbi, pfx, delta,
                    "vpabsw", math_PABS_XMM_pap2 );
         goto decode_success;
      }
      /* VPABSW ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 1D /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary(
                    uses_vvvv, vbi, pfx, delta,
                    "vpabsw", math_PABS_YMM_pap2 );
         goto decode_success;
      }
      break;

   case 0x1E:
      /* VPABSD xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 1E /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary(
                    uses_vvvv, vbi, pfx, delta,
                    "vpabsd", math_PABS_XMM_pap4 );
         goto decode_success;
      }
      /* VPABSD ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 1E /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary(
                    uses_vvvv, vbi, pfx, delta,
                    "vpabsd", math_PABS_YMM_pap4 );
         goto decode_success;
      }
      break;
   28655 
   case 0x20:
      /* VPMOVSXBW xmm2/m64, xmm1 */
      /* VPMOVSXBW = VEX.128.66.0F38.WIG 20 /r */
      /* Cases 0x20..0x25 are the sign-extending packed-move family
         (VPMOVSX{BW,BD,BQ,WD,WQ,DQ}); each defers to a dis_PMOVxX* /
         dis_PMOVSX* helper, with xIsZ=False selecting sign- rather
         than zero-extension where the helper is shared with PMOVZX. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVxXBW_128( vbi, pfx, delta,
                                   True/*isAvx*/, False/*!xIsZ*/ );
         goto decode_success;
      }
      /* VPMOVSXBW xmm2/m128, ymm1 */
      /* VPMOVSXBW = VEX.256.66.0F38.WIG 20 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVxXBW_256( vbi, pfx, delta, False/*!xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x21:
      /* VPMOVSXBD xmm2/m32, xmm1 */
      /* VPMOVSXBD = VEX.128.66.0F38.WIG 21 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVxXBD_128( vbi, pfx, delta,
                                   True/*isAvx*/, False/*!xIsZ*/ );
         goto decode_success;
      }
      /* VPMOVSXBD xmm2/m64, ymm1 */
      /* VPMOVSXBD = VEX.256.66.0F38.WIG 21 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVxXBD_256( vbi, pfx, delta, False/*!xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x22:
      /* VPMOVSXBQ xmm2/m16, xmm1 */
      /* VPMOVSXBQ = VEX.128.66.0F38.WIG 22 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVSXBQ_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VPMOVSXBQ xmm2/m32, ymm1 */
      /* VPMOVSXBQ = VEX.256.66.0F38.WIG 22 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVSXBQ_256( vbi, pfx, delta );
         goto decode_success;
      }
      break;

   case 0x23:
      /* VPMOVSXWD xmm2/m64, xmm1 = VEX.128.66.0F38.WIG 23 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVxXWD_128( vbi, pfx, delta,
                                   True/*isAvx*/, False/*!xIsZ*/ );
         goto decode_success;
      }
      /* VPMOVSXWD xmm2/m128, ymm1 = VEX.256.66.0F38.WIG 23 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVxXWD_256( vbi, pfx, delta, False/*!xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x24:
      /* VPMOVSXWQ xmm2/m32, xmm1 = VEX.128.66.0F38.WIG 24 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVSXWQ_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VPMOVSXWQ xmm2/m64, ymm1 = VEX.256.66.0F38.WIG 24 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVSXWQ_256( vbi, pfx, delta );
         goto decode_success;
      }
      break;

   case 0x25:
      /* VPMOVSXDQ xmm2/m64, xmm1 = VEX.128.66.0F38.WIG 25 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
                                   True/*isAvx*/, False/*!xIsZ*/ );
         goto decode_success;
      }
      /* VPMOVSXDQ xmm2/m128, ymm1 = VEX.256.66.0F38.WIG 25 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVxXDQ_256( vbi, pfx, delta, False/*!xIsZ*/ );
         goto decode_success;
      }
      break;
   28743 
   case 0x28:
      /* VPMULDQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 28 /r */
      /* Signed 32x32->64 multiply; delegated to math_PMULDQ_*. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vpmuldq", math_PMULDQ_128 );
         goto decode_success;
      }
      /* VPMULDQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 28 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vpmuldq", math_PMULDQ_256 );
         goto decode_success;
      }
      break;

   case 0x29:
      /* VPCMPEQQ r/m, rV, r ::: r = rV `eq-by-64s` r/m */
      /* VPCMPEQQ = VEX.NDS.128.66.0F38.WIG 29 /r */
      /* Simple binop form: maps directly to Iop_CmpEQ64x{2,4}. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpeqq", Iop_CmpEQ64x2 );
         goto decode_success;
      }
      /* VPCMPEQQ r/m, rV, r ::: r = rV `eq-by-64s` r/m */
      /* VPCMPEQQ = VEX.NDS.256.66.0F38.WIG 29 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpeqq", Iop_CmpEQ64x4 );
         goto decode_success;
      }
      break;
   28777 
   case 0x2A:
      /* VMOVNTDQA m128, xmm1 = VEX.128.66.0F38.WIG 2A /r */
      /* Non-temporal load; memory-source only. The alignment trap is
         generated explicitly since MOVNTDQA requires an aligned
         operand (gen_SEGV_if_not_16_aligned). */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && !epartIsReg(getUChar(delta))) {
         UChar  modrm = getUChar(delta);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp tD    = newTemp(Ity_V128);
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         gen_SEGV_if_not_16_aligned(addr);
         assign(tD, loadLE(Ity_V128, mkexpr(addr)));
         DIP("vmovntdqa %s,%s\n", dis_buf, nameXMMReg(rD));
         putYMMRegLoAndZU(rD, mkexpr(tD));
         goto decode_success;
      }
      /* VMOVNTDQA m256, ymm1 = VEX.256.66.0F38.WIG 2A /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && !epartIsReg(getUChar(delta))) {
         UChar  modrm = getUChar(delta);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp tD    = newTemp(Ity_V256);
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         gen_SEGV_if_not_32_aligned(addr);
         assign(tD, loadLE(Ity_V256, mkexpr(addr)));
         DIP("vmovntdqa %s,%s\n", dis_buf, nameYMMReg(rD));
         putYMMReg(rD, mkexpr(tD));
         goto decode_success;
      }
      break;

   case 0x2B:
      /* VPACKUSDW r/m, rV, r ::: r = QNarrowBin32Sto16Ux8(rV, r/m) */
      /* VPACKUSDW = VEX.NDS.128.66.0F38.WIG 2B /r */
      /* swapArgs=True gives the (rV, r/m) operand order shown above. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpackusdw",
                    Iop_QNarrowBin32Sto16Ux8, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPACKUSDW r/m, rV, r ::: r = QNarrowBin32Sto16Ux8(rV, r/m) */
      /* VPACKUSDW = VEX.NDS.256.66.0F38.WIG 2B /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpackusdw",
                    math_VPACKUSDW_YMM );
         goto decode_success;
      }
      break;
   28828 
   case 0x2C:
      /* VMASKMOVPS m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 2C /r */
      /* Masked-load/store family (0x2C..0x2F): all decode to
         dis_VMASKMOV, parameterised by vector size (isYMM), element
         type (I32 for PS / I64 for PD) and direction (isLoad).
         Memory operand only, hence the !epartIsReg guard. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0==getRexW(pfx)/*W0*/
          && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
                               /*!isYMM*/False, Ity_I32, /*isLoad*/True );
         goto decode_success;
      }
      /* VMASKMOVPS m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 2C /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 0==getRexW(pfx)/*W0*/
          && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
                               /*isYMM*/True, Ity_I32, /*isLoad*/True );
         goto decode_success;
      }
      break;

   case 0x2D:
      /* VMASKMOVPD m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 2D /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0==getRexW(pfx)/*W0*/
          && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
                               /*!isYMM*/False, Ity_I64, /*isLoad*/True );
         goto decode_success;
      }
      /* VMASKMOVPD m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 2D /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 0==getRexW(pfx)/*W0*/
          && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
                               /*isYMM*/True, Ity_I64, /*isLoad*/True );
         goto decode_success;
      }
      break;

   case 0x2E:
      /* VMASKMOVPS xmm1, xmm2, m128 = VEX.NDS.128.66.0F38.W0 2E /r */
      /* Store direction (isLoad=False): register to masked memory. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0==getRexW(pfx)/*W0*/
          && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
                               /*!isYMM*/False, Ity_I32, /*!isLoad*/False );
         goto decode_success;
      }
      /* VMASKMOVPS ymm1, ymm2, m256 = VEX.NDS.256.66.0F38.W0 2E /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 0==getRexW(pfx)/*W0*/
          && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
                               /*isYMM*/True, Ity_I32, /*!isLoad*/False );
         goto decode_success;
      }
      break;
   28885 
   28886    case 0x2F:
   28887       /* VMASKMOVPD xmm1, xmm2, m128 = VEX.NDS.128.66.0F38.W0 2F /r */
   28888       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
   28889           && 0==getRexW(pfx)/*W0*/
   28890           && !epartIsReg(getUChar(delta))) {
   28891          delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
   28892                                /*!isYMM*/False, Ity_I64, /*!isLoad*/False );
   28893          goto decode_success;
   28894       }
   28895       /* VMASKMOVPD ymm1, ymm2, m256 = VEX.NDS.256.66.0F38.W0 2F /r */
   28896       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
   28897           && 0==getRexW(pfx)/*W0*/
   28898           && !epartIsReg(getUChar(delta))) {
   28899          delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
   28900                                /*isYMM*/True, Ity_I64, /*!isLoad*/False );
   28901          goto decode_success;
   28902       }
   28903       break;
   28904 
   28905    case 0x30:
            /* Cases 0x30..0x35: the VPMOVZX* family (zero-extending
               packed element widening: BW, BD, BQ, WD, WQ, DQ).  Each
               arm dispatches to a dis_PMOVxX*/dis_PMOVZX* helper shared
               with the SSE4 decoder; isAvx=True selects the AVX
               register-zeroing behaviour, and where the helper also
               handles the sign-extending variant, xIsZ=True selects
               zero extension. */
   28906       /* VPMOVZXBW xmm2/m64, xmm1 */
   28907       /* VPMOVZXBW = VEX.128.66.0F38.WIG 30 /r */
   28908       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   28909          delta = dis_PMOVxXBW_128( vbi, pfx, delta,
   28910                                    True/*isAvx*/, True/*xIsZ*/ );
   28911          goto decode_success;
   28912       }
   28913       /* VPMOVZXBW xmm2/m128, ymm1 */
   28914       /* VPMOVZXBW = VEX.256.66.0F38.WIG 30 /r */
   28915       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   28916          delta = dis_PMOVxXBW_256( vbi, pfx, delta, True/*xIsZ*/ );
   28917          goto decode_success;
   28918       }
   28919       break;
   28920 
   28921    case 0x31:
   28922       /* VPMOVZXBD xmm2/m32, xmm1 */
   28923       /* VPMOVZXBD = VEX.128.66.0F38.WIG 31 /r */
   28924       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   28925          delta = dis_PMOVxXBD_128( vbi, pfx, delta,
   28926                                    True/*isAvx*/, True/*xIsZ*/ );
   28927          goto decode_success;
   28928       }
   28929       /* VPMOVZXBD xmm2/m64, ymm1 */
   28930       /* VPMOVZXBD = VEX.256.66.0F38.WIG 31 /r */
   28931       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   28932          delta = dis_PMOVxXBD_256( vbi, pfx, delta, True/*xIsZ*/ );
   28933          goto decode_success;
   28934       }
   28935       break;
   28936 
   28937    case 0x32:
   28938       /* VPMOVZXBQ xmm2/m16, xmm1 */
   28939       /* VPMOVZXBQ = VEX.128.66.0F38.WIG 32 /r */
   28940       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   28941          delta = dis_PMOVZXBQ_128( vbi, pfx, delta, True/*isAvx*/ );
   28942          goto decode_success;
   28943       }
   28944       /* VPMOVZXBQ xmm2/m32, ymm1 */
   28945       /* VPMOVZXBQ = VEX.256.66.0F38.WIG 32 /r */
   28946       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   28947          delta = dis_PMOVZXBQ_256( vbi, pfx, delta );
   28948          goto decode_success;
   28949       }
   28950       break;
   28951 
   28952    case 0x33:
   28953       /* VPMOVZXWD xmm2/m64, xmm1 */
   28954       /* VPMOVZXWD = VEX.128.66.0F38.WIG 33 /r */
   28955       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   28956          delta = dis_PMOVxXWD_128( vbi, pfx, delta,
   28957                                    True/*isAvx*/, True/*xIsZ*/ );
   28958          goto decode_success;
   28959       }
   28960       /* VPMOVZXWD xmm2/m128, ymm1 */
   28961       /* VPMOVZXWD = VEX.256.66.0F38.WIG 33 /r */
   28962       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   28963          delta = dis_PMOVxXWD_256( vbi, pfx, delta, True/*xIsZ*/ );
   28964          goto decode_success;
   28965       }
   28966       break;
   28967 
   28968    case 0x34:
   28969       /* VPMOVZXWQ xmm2/m32, xmm1 = VEX.128.66.0F38.WIG 34 /r */
   28970       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   28971          delta = dis_PMOVZXWQ_128( vbi, pfx, delta, True/*isAvx*/ );
   28972          goto decode_success;
   28973       }
   28974       /* VPMOVZXWQ xmm2/m64, ymm1 = VEX.256.66.0F38.WIG 34 /r */
   28975       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   28976          delta = dis_PMOVZXWQ_256( vbi, pfx, delta );
   28977          goto decode_success;
   28978       }
   28979       break;
   28980 
   28981    case 0x35:
   28982       /* VPMOVZXDQ xmm2/m64, xmm1 = VEX.128.66.0F38.WIG 35 /r */
   28983       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   28984          delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
   28985                                    True/*isAvx*/, True/*xIsZ*/ );
   28986          goto decode_success;
   28987       }
   28988       /* VPMOVZXDQ xmm2/m128, ymm1 = VEX.256.66.0F38.WIG 35 /r */
   28989       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   28990          delta = dis_PMOVxXDQ_256( vbi, pfx, delta, True/*xIsZ*/ );
   28991          goto decode_success;
   28992       }
   28993       break;
   28994 
   28995    case 0x36:
            /* VPERMD: full-width 32-bit-lane permute.  Only the
               256-bit W0 encoding exists here; the lane shuffle itself
               is built by the math_VPERMD callback passed to the
               "complex" NDS helper. */
   28996       /* VPERMD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 36 /r */
   28997       if (have66noF2noF3(pfx)
   28998           && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
   28999          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   29000                     uses_vvvv, vbi, pfx, delta, "vpermd", math_VPERMD );
   29001          goto decode_success;
   29002       }
   29003       break;
   29004 
   29005    case 0x37:
            /* VPCMPGTQ: per-64-bit-lane signed greater-than compare,
               mapped directly onto the CmpGT64Sx2/x4 IR ops via the
               simple NDS helpers. */
   29006       /* VPCMPGTQ r/m, rV, r ::: r = rV `>s-by-64s` r/m */
   29007       /* VPCMPGTQ = VEX.NDS.128.66.0F38.WIG 37 /r */
   29008       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   29009          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   29010                     uses_vvvv, vbi, pfx, delta, "vpcmpgtq", Iop_CmpGT64Sx2 );
   29011          goto decode_success;
   29012       }
   29013       /* VPCMPGTQ r/m, rV, r ::: r = rV `>s-by-64s` r/m */
   29014       /* VPCMPGTQ = VEX.NDS.256.66.0F38.WIG 37 /r */
   29015       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   29016          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   29017                     uses_vvvv, vbi, pfx, delta, "vpcmpgtq", Iop_CmpGT64Sx4 );
   29018          goto decode_success;
   29019       }
   29020       break;
   29021 
   29022    case 0x38:
            /* Cases 0x38..0x40: vertical (lane-wise) two-operand AVX2
               integer ops -- VPMIN{SB,SD,UW,UD}, VPMAX{SB,SD,UW,UD} and
               VPMULLD.  Every arm follows the same pattern: the VEX.L=0
               form maps onto the 128-bit IR op (x16/x8/x4 lanes) via
               dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple, and the VEX.L=1
               form onto the width-doubled IR op (x32/x16/x8 lanes) via
               the _256_ helper. */
   29023       /* VPMINSB r/m, rV, r ::: r = min-signed-8s(rV, r/m) */
   29024       /* VPMINSB = VEX.NDS.128.66.0F38.WIG 38 /r */
   29025       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   29026          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   29027                     uses_vvvv, vbi, pfx, delta, "vpminsb", Iop_Min8Sx16 );
   29028          goto decode_success;
   29029       }
   29030       /* VPMINSB r/m, rV, r ::: r = min-signed-8s(rV, r/m) */
   29031       /* VPMINSB = VEX.NDS.256.66.0F38.WIG 38 /r */
   29032       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   29033          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   29034                     uses_vvvv, vbi, pfx, delta, "vpminsb", Iop_Min8Sx32 );
   29035          goto decode_success;
   29036       }
   29037       break;
   29038 
   29039    case 0x39:
   29040       /* VPMINSD r/m, rV, r ::: r = min-signed-32s(rV, r/m) */
   29041       /* VPMINSD = VEX.NDS.128.66.0F38.WIG 39 /r */
   29042       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   29043          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   29044                     uses_vvvv, vbi, pfx, delta, "vpminsd", Iop_Min32Sx4 );
   29045          goto decode_success;
   29046       }
   29047       /* VPMINSD r/m, rV, r ::: r = min-signed-32s(rV, r/m) */
   29048       /* VPMINSD = VEX.NDS.256.66.0F38.WIG 39 /r */
   29049       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   29050          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   29051                     uses_vvvv, vbi, pfx, delta, "vpminsd", Iop_Min32Sx8 );
   29052          goto decode_success;
   29053       }
   29054       break;
   29055 
   29056    case 0x3A:
   29057       /* VPMINUW r/m, rV, r ::: r = min-unsigned-16s(rV, r/m) */
   29058       /* VPMINUW = VEX.NDS.128.66.0F38.WIG 3A /r */
   29059       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   29060          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   29061                     uses_vvvv, vbi, pfx, delta, "vpminuw", Iop_Min16Ux8 );
   29062          goto decode_success;
   29063       }
   29064       /* VPMINUW r/m, rV, r ::: r = min-unsigned-16s(rV, r/m) */
   29065       /* VPMINUW = VEX.NDS.256.66.0F38.WIG 3A /r */
   29066       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   29067          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   29068                     uses_vvvv, vbi, pfx, delta, "vpminuw", Iop_Min16Ux16 );
   29069          goto decode_success;
   29070       }
   29071       break;
   29072 
   29073    case 0x3B:
   29074       /* VPMINUD r/m, rV, r ::: r = min-unsigned-32s(rV, r/m) */
   29075       /* VPMINUD = VEX.NDS.128.66.0F38.WIG 3B /r */
   29076       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   29077          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   29078                     uses_vvvv, vbi, pfx, delta, "vpminud", Iop_Min32Ux4 );
   29079          goto decode_success;
   29080       }
   29081       /* VPMINUD r/m, rV, r ::: r = min-unsigned-32s(rV, r/m) */
   29082       /* VPMINUD = VEX.NDS.256.66.0F38.WIG 3B /r */
   29083       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   29084          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   29085                     uses_vvvv, vbi, pfx, delta, "vpminud", Iop_Min32Ux8 );
   29086          goto decode_success;
   29087       }
   29088       break;
   29089 
   29090    case 0x3C:
   29091       /* VPMAXSB r/m, rV, r ::: r = max-signed-8s(rV, r/m) */
   29092       /* VPMAXSB = VEX.NDS.128.66.0F38.WIG 3C /r */
   29093       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   29094          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   29095                     uses_vvvv, vbi, pfx, delta, "vpmaxsb", Iop_Max8Sx16 );
   29096          goto decode_success;
   29097       }
   29098       /* VPMAXSB r/m, rV, r ::: r = max-signed-8s(rV, r/m) */
   29099       /* VPMAXSB = VEX.NDS.256.66.0F38.WIG 3C /r */
   29100       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   29101          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   29102                     uses_vvvv, vbi, pfx, delta, "vpmaxsb", Iop_Max8Sx32 );
   29103          goto decode_success;
   29104       }
   29105       break;
   29106 
   29107    case 0x3D:
   29108       /* VPMAXSD r/m, rV, r ::: r = max-signed-32s(rV, r/m) */
   29109       /* VPMAXSD = VEX.NDS.128.66.0F38.WIG 3D /r */
   29110       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   29111          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   29112                     uses_vvvv, vbi, pfx, delta, "vpmaxsd", Iop_Max32Sx4 );
   29113          goto decode_success;
   29114       }
   29115       /* VPMAXSD r/m, rV, r ::: r = max-signed-32s(rV, r/m) */
   29116       /* VPMAXSD = VEX.NDS.256.66.0F38.WIG 3D /r */
   29117       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   29118          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   29119                     uses_vvvv, vbi, pfx, delta, "vpmaxsd", Iop_Max32Sx8 );
   29120          goto decode_success;
   29121       }
   29122       break;
   29123 
   29124    case 0x3E:
   29125       /* VPMAXUW r/m, rV, r ::: r = max-unsigned-16s(rV, r/m) */
   29126       /* VPMAXUW = VEX.NDS.128.66.0F38.WIG 3E /r */
   29127       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   29128          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   29129                     uses_vvvv, vbi, pfx, delta, "vpmaxuw", Iop_Max16Ux8 );
   29130          goto decode_success;
   29131       }
   29132       /* VPMAXUW r/m, rV, r ::: r = max-unsigned-16s(rV, r/m) */
   29133       /* VPMAXUW = VEX.NDS.256.66.0F38.WIG 3E /r */
   29134       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   29135          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   29136                     uses_vvvv, vbi, pfx, delta, "vpmaxuw", Iop_Max16Ux16 );
   29137          goto decode_success;
   29138       }
   29139       break;
   29140 
   29141    case 0x3F:
   29142       /* VPMAXUD r/m, rV, r ::: r = max-unsigned-32s(rV, r/m) */
   29143       /* VPMAXUD = VEX.NDS.128.66.0F38.WIG 3F /r */
   29144       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   29145          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   29146                     uses_vvvv, vbi, pfx, delta, "vpmaxud", Iop_Max32Ux4 );
   29147          goto decode_success;
   29148       }
   29149       /* VPMAXUD r/m, rV, r ::: r = max-unsigned-32s(rV, r/m) */
   29150       /* VPMAXUD = VEX.NDS.256.66.0F38.WIG 3F /r */
   29151       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   29152          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   29153                     uses_vvvv, vbi, pfx, delta, "vpmaxud", Iop_Max32Ux8 );
   29154          goto decode_success;
   29155       }
   29156       break;
   29157 
   29158    case 0x40:
   29159       /* VPMULLD r/m, rV, r ::: r = mul-32s(rV, r/m) */
   29160       /* VPMULLD = VEX.NDS.128.66.0F38.WIG 40 /r */
   29161       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   29162          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   29163                     uses_vvvv, vbi, pfx, delta, "vpmulld", Iop_Mul32x4 );
   29164          goto decode_success;
   29165       }
   29166       /* VPMULLD r/m, rV, r ::: r = mul-32s(rV, r/m) */
   29167       /* VPMULLD = VEX.NDS.256.66.0F38.WIG 40 /r */
   29168       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   29169          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   29170                     uses_vvvv, vbi, pfx, delta, "vpmulld", Iop_Mul32x8 );
   29171          goto decode_success;
   29172       }
   29173       break;
   29174 
   29175    case 0x41:
            /* VPHMINPOSUW: horizontal minimum + position of unsigned
               16-bit lanes; 128-bit form only.  Shares its
               implementation with the SSE4.1 decoder (isAvx=True). */
   29176       /* VPHMINPOSUW xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 41 /r */
   29177       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   29178          delta = dis_PHMINPOSUW_128( vbi, pfx, delta, True/*isAvx*/ );
   29179          goto decode_success;
   29180       }
   29181       break;
   29182 
   29183    case 0x45:
            /* Cases 0x45..0x47: AVX2 per-lane variable shifts.  Unlike
               the arms above, each call handles both VEX.L widths at
               once: the L bit is forwarded as the final argument of
               dis_AVX_var_shiftV_byE, and REX.W selects the element
               size (W0 = 32-bit lanes, W1 = 64-bit lanes).  uses_vvvv
               is set here directly rather than inside the helper. */
   29184       /* VPSRLVD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 45 /r */
   29185       /* VPSRLVD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 45 /r */
   29186       if (have66noF2noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
   29187          delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsrlvd",
   29188                                          Iop_Shr32, 1==getVexL(pfx) );
   29189          *uses_vvvv = True;
   29190          goto decode_success;
   29191       }
   29192       /* VPSRLVQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W1 45 /r */
   29193       /* VPSRLVQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W1 45 /r */
   29194       if (have66noF2noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
   29195          delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsrlvq",
   29196                                          Iop_Shr64, 1==getVexL(pfx) );
   29197          *uses_vvvv = True;
   29198          goto decode_success;
   29199       }
   29200       break;
   29201 
   29202    case 0x46:
            /* Only the W0 (32-bit lane, VPSRAVD) form is decoded here;
               no W1 arithmetic-shift form is handled for this opcode. */
   29203       /* VPSRAVD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 46 /r */
   29204       /* VPSRAVD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 46 /r */
   29205       if (have66noF2noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
   29206          delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsravd",
   29207                                          Iop_Sar32, 1==getVexL(pfx) );
   29208          *uses_vvvv = True;
   29209          goto decode_success;
   29210       }
   29211       break;
   29212 
   29213    case 0x47:
   29214       /* VPSLLVD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 47 /r */
   29215       /* VPSLLVD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 47 /r */
   29216       if (have66noF2noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
   29217          delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsllvd",
   29218                                          Iop_Shl32, 1==getVexL(pfx) );
   29219          *uses_vvvv = True;
   29220          goto decode_success;
   29221       }
   29222       /* VPSLLVQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W1 47 /r */
   29223       /* VPSLLVQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W1 47 /r */
   29224       if (have66noF2noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
   29225          delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsllvq",
   29226                                          Iop_Shl64, 1==getVexL(pfx) );
   29227          *uses_vvvv = True;
   29228          goto decode_success;
   29229       }
   29230       break;
   29231 
   29232    case 0x58:
            /* Cases 0x58/0x59: VPBROADCASTD / VPBROADCASTQ.  The source
               scalar is fetched either from lane 0 of the E register or
               from memory, then replicated: pairwise widening
               (Iop_32HLto64, Iop_64HLtoV128) builds the 128-bit result,
               and Iop_64x4toV256 builds the 256-bit one.  The 128-bit
               forms zero the upper YMM half via putYMMRegLoAndZU. */
   29233       /* VPBROADCASTD xmm2/m32, xmm1 = VEX.128.66.0F38.W0 58 /r */
   29234       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
   29235           && 0==getRexW(pfx)/*W0*/) {
   29236          UChar modrm = getUChar(delta);
   29237          UInt  rG    = gregOfRexRM(pfx, modrm);
   29238          IRTemp t32 = newTemp(Ity_I32);
   29239          if (epartIsReg(modrm)) {
   29240             UInt rE = eregOfRexRM(pfx, modrm);
   29241             delta++;
   29242             DIP("vpbroadcastd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   29243             assign(t32, getXMMRegLane32(rE, 0));
   29244          } else {
   29245             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   29246             delta += alen;
   29247             DIP("vpbroadcastd %s,%s\n", dis_buf, nameXMMReg(rG));
   29248             assign(t32, loadLE(Ity_I32, mkexpr(addr)));
   29249          }
   29250          IRTemp t64 = newTemp(Ity_I64);
   29251          assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
   29252          IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
   29253          putYMMRegLoAndZU(rG, res);
   29254          goto decode_success;
   29255       }
   29256       /* VPBROADCASTD xmm2/m32, ymm1 = VEX.256.66.0F38.W0 58 /r */
   29257       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
   29258           && 0==getRexW(pfx)/*W0*/) {
   29259          UChar modrm = getUChar(delta);
   29260          UInt  rG    = gregOfRexRM(pfx, modrm);
   29261          IRTemp t32 = newTemp(Ity_I32);
   29262          if (epartIsReg(modrm)) {
   29263             UInt rE = eregOfRexRM(pfx, modrm);
   29264             delta++;
   29265             DIP("vpbroadcastd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
   29266             assign(t32, getXMMRegLane32(rE, 0));
   29267          } else {
   29268             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   29269             delta += alen;
   29270             DIP("vpbroadcastd %s,%s\n", dis_buf, nameYMMReg(rG));
   29271             assign(t32, loadLE(Ity_I32, mkexpr(addr)));
   29272          }
   29273          IRTemp t64 = newTemp(Ity_I64);
   29274          assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
   29275          IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
   29276                                                   mkexpr(t64), mkexpr(t64));
   29277          putYMMReg(rG, res);
   29278          goto decode_success;
   29279       }
   29280       break;
   29281 
   29282    case 0x59:
   29283       /* VPBROADCASTQ xmm2/m64, xmm1 = VEX.128.66.0F38.W0 59 /r */
   29284       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
   29285           && 0==getRexW(pfx)/*W0*/) {
   29286          UChar modrm = getUChar(delta);
   29287          UInt  rG    = gregOfRexRM(pfx, modrm);
   29288          IRTemp t64 = newTemp(Ity_I64);
   29289          if (epartIsReg(modrm)) {
   29290             UInt rE = eregOfRexRM(pfx, modrm);
   29291             delta++;
   29292             DIP("vpbroadcastq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   29293             assign(t64, getXMMRegLane64(rE, 0));
   29294          } else {
   29295             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   29296             delta += alen;
   29297             DIP("vpbroadcastq %s,%s\n", dis_buf, nameXMMReg(rG));
   29298             assign(t64, loadLE(Ity_I64, mkexpr(addr)));
   29299          }
   29300          IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
   29301          putYMMRegLoAndZU(rG, res);
   29302          goto decode_success;
   29303       }
   29304       /* VPBROADCASTQ xmm2/m64, ymm1 = VEX.256.66.0F38.W0 59 /r */
   29305       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
   29306           && 0==getRexW(pfx)/*W0*/) {
   29307          UChar modrm = getUChar(delta);
   29308          UInt  rG    = gregOfRexRM(pfx, modrm);
   29309          IRTemp t64 = newTemp(Ity_I64);
   29310          if (epartIsReg(modrm)) {
   29311             UInt rE = eregOfRexRM(pfx, modrm);
   29312             delta++;
   29313             DIP("vpbroadcastq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
   29314             assign(t64, getXMMRegLane64(rE, 0));
   29315          } else {
   29316             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   29317             delta += alen;
   29318             DIP("vpbroadcastq %s,%s\n", dis_buf, nameYMMReg(rG));
   29319             assign(t64, loadLE(Ity_I64, mkexpr(addr)));
   29320          }
   29321          IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
   29322                                                   mkexpr(t64), mkexpr(t64));
   29323          putYMMReg(rG, res);
   29324          goto decode_success;
   29325       }
   29326       break;
   29327 
   29328    case 0x5A:
            /* VBROADCASTI128: load 128 bits from memory and replicate
               them into both 128-bit halves of the destination YMM via
               Iop_V128HLtoV256.  Memory-source-only: the decode requires
               !epartIsReg, so a register E operand falls through to the
               generic failure path. */
   29329       /* VBROADCASTI128 m128, ymm1 = VEX.256.66.0F38.WIG 5A /r */
   29330       if (have66noF2noF3(pfx)
   29331           && 1==getVexL(pfx)/*256*/
   29332           && !epartIsReg(getUChar(delta))) {
   29333          UChar modrm = getUChar(delta);
   29334          UInt  rG    = gregOfRexRM(pfx, modrm);
   29335          addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   29336          delta += alen;
   29337          DIP("vbroadcasti128 %s,%s\n", dis_buf, nameYMMReg(rG));
   29338          IRTemp t128 = newTemp(Ity_V128);
   29339          assign(t128, loadLE(Ity_V128, mkexpr(addr)));
   29340          putYMMReg( rG, binop(Iop_V128HLtoV256, mkexpr(t128), mkexpr(t128)) );
   29341          goto decode_success;
   29342       }
   29343       break;
   29344 
   29345    case 0x78:
            /* Cases 0x78/0x79: VPBROADCASTB / VPBROADCASTW.  Same scheme
               as 0x58/0x59 but starting from a byte or 16-bit word: the
               scalar (narrowed from E's lane 0, or loaded from memory)
               is replicated by successive pairwise widenings
               (8HLto16 -> 16HLto32 -> 32HLto64) up to 64 bits, then
               assembled into V128 or V256 as for the D/Q broadcasts. */
   29346       /* VPBROADCASTB xmm2/m8, xmm1 = VEX.128.66.0F38.W0 78 /r */
   29347       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
   29348           && 0==getRexW(pfx)/*W0*/) {
   29349          UChar modrm = getUChar(delta);
   29350          UInt  rG    = gregOfRexRM(pfx, modrm);
   29351          IRTemp t8   = newTemp(Ity_I8);
   29352          if (epartIsReg(modrm)) {
   29353             UInt rE = eregOfRexRM(pfx, modrm);
   29354             delta++;
   29355             DIP("vpbroadcastb %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   29356             assign(t8, unop(Iop_32to8, getXMMRegLane32(rE, 0)));
   29357          } else {
   29358             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   29359             delta += alen;
   29360             DIP("vpbroadcastb %s,%s\n", dis_buf, nameXMMReg(rG));
   29361             assign(t8, loadLE(Ity_I8, mkexpr(addr)));
   29362          }
   29363          IRTemp t16 = newTemp(Ity_I16);
   29364          assign(t16, binop(Iop_8HLto16, mkexpr(t8), mkexpr(t8)));
   29365          IRTemp t32 = newTemp(Ity_I32);
   29366          assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16)));
   29367          IRTemp t64 = newTemp(Ity_I64);
   29368          assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
   29369          IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
   29370          putYMMRegLoAndZU(rG, res);
   29371          goto decode_success;
   29372       }
   29373       /* VPBROADCASTB xmm2/m8, ymm1 = VEX.256.66.0F38.W0 78 /r */
   29374       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
   29375           && 0==getRexW(pfx)/*W0*/) {
   29376          UChar modrm = getUChar(delta);
   29377          UInt  rG    = gregOfRexRM(pfx, modrm);
   29378          IRTemp t8   = newTemp(Ity_I8);
   29379          if (epartIsReg(modrm)) {
   29380             UInt rE = eregOfRexRM(pfx, modrm);
   29381             delta++;
   29382             DIP("vpbroadcastb %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
   29383             assign(t8, unop(Iop_32to8, getXMMRegLane32(rE, 0)));
   29384          } else {
   29385             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   29386             delta += alen;
   29387             DIP("vpbroadcastb %s,%s\n", dis_buf, nameYMMReg(rG));
   29388             assign(t8, loadLE(Ity_I8, mkexpr(addr)));
   29389          }
   29390          IRTemp t16 = newTemp(Ity_I16);
   29391          assign(t16, binop(Iop_8HLto16, mkexpr(t8), mkexpr(t8)));
   29392          IRTemp t32 = newTemp(Ity_I32);
   29393          assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16)));
   29394          IRTemp t64 = newTemp(Ity_I64);
   29395          assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
   29396          IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
   29397                                                   mkexpr(t64), mkexpr(t64));
   29398          putYMMReg(rG, res);
   29399          goto decode_success;
   29400       }
   29401       break;
   29402 
   29403    case 0x79:
   29404       /* VPBROADCASTW xmm2/m16, xmm1 = VEX.128.66.0F38.W0 79 /r */
   29405       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
   29406           && 0==getRexW(pfx)/*W0*/) {
   29407          UChar modrm = getUChar(delta);
   29408          UInt  rG    = gregOfRexRM(pfx, modrm);
   29409          IRTemp t16  = newTemp(Ity_I16);
   29410          if (epartIsReg(modrm)) {
   29411             UInt rE = eregOfRexRM(pfx, modrm);
   29412             delta++;
   29413             DIP("vpbroadcastw %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   29414             assign(t16, unop(Iop_32to16, getXMMRegLane32(rE, 0)));
   29415          } else {
   29416             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   29417             delta += alen;
   29418             DIP("vpbroadcastw %s,%s\n", dis_buf, nameXMMReg(rG));
   29419             assign(t16, loadLE(Ity_I16, mkexpr(addr)));
   29420          }
   29421          IRTemp t32 = newTemp(Ity_I32);
   29422          assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16)));
   29423          IRTemp t64 = newTemp(Ity_I64);
   29424          assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
   29425          IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
   29426          putYMMRegLoAndZU(rG, res);
   29427          goto decode_success;
   29428       }
   29429       /* VPBROADCASTW xmm2/m16, ymm1 = VEX.256.66.0F38.W0 79 /r */
   29430       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
   29431           && 0==getRexW(pfx)/*W0*/) {
   29432          UChar modrm = getUChar(delta);
   29433          UInt  rG    = gregOfRexRM(pfx, modrm);
   29434          IRTemp t16  = newTemp(Ity_I16);
   29435          if (epartIsReg(modrm)) {
   29436             UInt rE = eregOfRexRM(pfx, modrm);
   29437             delta++;
   29438             DIP("vpbroadcastw %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
   29439             assign(t16, unop(Iop_32to16, getXMMRegLane32(rE, 0)));
   29440          } else {
   29441             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   29442             delta += alen;
   29443             DIP("vpbroadcastw %s,%s\n", dis_buf, nameYMMReg(rG));
   29444             assign(t16, loadLE(Ity_I16, mkexpr(addr)));
   29445          }
   29446          IRTemp t32 = newTemp(Ity_I32);
   29447          assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16)));
   29448          IRTemp t64 = newTemp(Ity_I64);
   29449          assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
   29450          IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
   29451                                                   mkexpr(t64), mkexpr(t64));
   29452          putYMMReg(rG, res);
   29453          goto decode_success;
   29454       }
   29455       break;
   29456 
   29457    case 0x8C:
            /* VPMASKMOVD / VPMASKMOVQ, load forms.  Same delegation
               pattern as cases 0x2C/0x2D, but here REX.W selects the
               element size (W0 = 32-bit D, W1 = 64-bit Q) and VEX.L the
               vector width.  A memory E operand is required
               (!epartIsReg) in all four encodings. */
   29458       /* VPMASKMOVD m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 8C /r */
   29459       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
   29460           && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
   29461          delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
   29462                                /*!isYMM*/False, Ity_I32, /*isLoad*/True );
   29463          goto decode_success;
   29464       }
   29465       /* VPMASKMOVD m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 8C /r */
   29466       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
   29467           && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
   29468          delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
   29469                                /*isYMM*/True, Ity_I32, /*isLoad*/True );
   29470          goto decode_success;
   29471       }
   29472       /* VPMASKMOVQ m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W1 8C /r */
   29473       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
   29474           && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
   29475          delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
   29476                                /*!isYMM*/False, Ity_I64, /*isLoad*/True );
   29477          goto decode_success;
   29478       }
   29479       /* VPMASKMOVQ m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W1 8C /r */
   29480       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
   29481           && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
   29482          delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
   29483                                /*isYMM*/True, Ity_I64, /*isLoad*/True );
   29484          goto decode_success;
   29485       }
   29486       break;
   29487 
   case 0x8E:
      /* AVX2 masked vector stores -- the store-direction counterpart
         of opcode 8C.  Same VEX.L / REX.W selection, but isLoad=False
         and the memory operand is the destination. */
      /* VPMASKMOVD xmm1, xmm2, m128 = VEX.NDS.128.66.0F38.W0 8E /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
                               /*!isYMM*/False, Ity_I32, /*!isLoad*/False );
         goto decode_success;
      }
      /* VPMASKMOVD ymm1, ymm2, m256 = VEX.NDS.256.66.0F38.W0 8E /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
                               /*isYMM*/True, Ity_I32, /*!isLoad*/False );
         goto decode_success;
      }
      /* VPMASKMOVQ xmm1, xmm2, m128 = VEX.NDS.128.66.0F38.W1 8E /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
                               /*!isYMM*/False, Ity_I64, /*!isLoad*/False );
         goto decode_success;
      }
      /* VPMASKMOVQ ymm1, ymm2, m256 = VEX.NDS.256.66.0F38.W1 8E /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
                               /*isYMM*/True, Ity_I64, /*!isLoad*/False );
         goto decode_success;
      }
      break;
   29518 
   case 0x90:
      /* AVX2 integer gathers with 32-bit (dword) indices.  Unlike the
         maskmov cases, dis_VGATHER can itself reject the encoding; it
         signals failure by returning delta unchanged, hence the
         delta0 snapshot-and-compare before declaring success. */
      /* VPGATHERDD xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W0 90 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdd",
                              /*!isYMM*/False, /*!isVM64x*/False, Ity_I32 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VPGATHERDD ymm2, vm32y, ymm1 = VEX.DDS.256.66.0F38.W0 90 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdd",
                              /*isYMM*/True, /*!isVM64x*/False, Ity_I32 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VPGATHERDQ xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W1 90 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdq",
                              /*!isYMM*/False, /*!isVM64x*/False, Ity_I64 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VPGATHERDQ xmm2, vm32x, ymm1 = VEX.DDS.256.66.0F38.W1 90 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdq",
                              /*isYMM*/True, /*!isVM64x*/False, Ity_I64 );
         if (delta != delta0)
            goto decode_success;
      }
      break;
   29557 
   case 0x91:
      /* AVX2 integer gathers with 64-bit (qword) indices
         (isVM64x=True).  Same delta0 failure-detection protocol as
         case 0x90. */
      /* VPGATHERQD xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W0 91 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqd",
                              /*!isYMM*/False, /*isVM64x*/True, Ity_I32 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VPGATHERQD xmm2, vm64y, xmm1 = VEX.DDS.256.66.0F38.W0 91 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqd",
                              /*isYMM*/True, /*isVM64x*/True, Ity_I32 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VPGATHERQQ xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W1 91 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqq",
                              /*!isYMM*/False, /*isVM64x*/True, Ity_I64 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VPGATHERQQ ymm2, vm64y, ymm1 = VEX.DDS.256.66.0F38.W1 91 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqq",
                              /*isYMM*/True, /*isVM64x*/True, Ity_I64 );
         if (delta != delta0)
            goto decode_success;
      }
      break;
   29596 
   case 0x92:
      /* AVX2 floating-point gathers with dword indices.  Identical
         decode structure to case 0x90; only the mnemonic differs --
         dis_VGATHER treats the lanes as raw Ity_I32/Ity_I64 data
         either way. */
      /* VGATHERDPS xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W0 92 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdps",
                              /*!isYMM*/False, /*!isVM64x*/False, Ity_I32 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VGATHERDPS ymm2, vm32y, ymm1 = VEX.DDS.256.66.0F38.W0 92 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdps",
                              /*isYMM*/True, /*!isVM64x*/False, Ity_I32 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VGATHERDPD xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W1 92 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdpd",
                              /*!isYMM*/False, /*!isVM64x*/False, Ity_I64 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VGATHERDPD ymm2, vm32x, ymm1 = VEX.DDS.256.66.0F38.W1 92 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdpd",
                              /*isYMM*/True, /*!isVM64x*/False, Ity_I64 );
         if (delta != delta0)
            goto decode_success;
      }
      break;
   29635 
   case 0x93:
      /* AVX2 floating-point gathers with qword indices
         (isVM64x=True); same delta0 protocol as the other gather
         cases. */
      /* VGATHERQPS xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W0 93 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqps",
                              /*!isYMM*/False, /*isVM64x*/True, Ity_I32 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VGATHERQPS xmm2, vm64y, xmm1 = VEX.DDS.256.66.0F38.W0 93 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqps",
                              /*isYMM*/True, /*isVM64x*/True, Ity_I32 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VGATHERQPD xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W1 93 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqpd",
                              /*!isYMM*/False, /*isVM64x*/True, Ity_I64 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VGATHERQPD ymm2, vm64y, ymm1 = VEX.DDS.256.66.0F38.W1 93 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqpd",
                              /*isYMM*/True, /*isVM64x*/True, Ity_I64 );
         if (delta != delta0)
            goto decode_success;
      }
      break;
   29674 
   case 0x96 ... 0x9F:
   case 0xA6 ... 0xAF:
   case 0xB6 ... 0xBF:
      /* The entire FMA3 family (132/213/231 forms of VFMADD, VFMSUB,
         VFNMADD, VFNMSUB, VFMADDSUB, VFMSUBADD, in PS/PD/SS/SD
         flavours).  All decoding of the low opcode bits, VEX.L and
         REX.W is done inside dis_FMA, so this arm only checks the
         mandatory 66 prefix and forwards opc. */
      /* VFMADDSUB132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 96 /r */
      /* VFMADDSUB132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 96 /r */
      /* VFMADDSUB132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 96 /r */
      /* VFMADDSUB132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 96 /r */
      /* VFMSUBADD132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 97 /r */
      /* VFMSUBADD132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 97 /r */
      /* VFMSUBADD132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 97 /r */
      /* VFMSUBADD132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 97 /r */
      /* VFMADD132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 98 /r */
      /* VFMADD132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 98 /r */
      /* VFMADD132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 98 /r */
      /* VFMADD132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 98 /r */
      /* VFMADD132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 99 /r */
      /* VFMADD132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 99 /r */
      /* VFMSUB132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 9A /r */
      /* VFMSUB132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 9A /r */
      /* VFMSUB132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 9A /r */
      /* VFMSUB132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 9A /r */
      /* VFMSUB132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 9B /r */
      /* VFMSUB132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 9B /r */
      /* VFNMADD132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 9C /r */
      /* VFNMADD132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 9C /r */
      /* VFNMADD132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 9C /r */
      /* VFNMADD132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 9C /r */
      /* VFNMADD132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 9D /r */
      /* VFNMADD132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 9D /r */
      /* VFNMSUB132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 9E /r */
      /* VFNMSUB132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 9E /r */
      /* VFNMSUB132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 9E /r */
      /* VFNMSUB132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 9E /r */
      /* VFNMSUB132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 9F /r */
      /* VFNMSUB132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 9F /r */
      /* VFMADDSUB213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 A6 /r */
      /* VFMADDSUB213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 A6 /r */
      /* VFMADDSUB213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 A6 /r */
      /* VFMADDSUB213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 A6 /r */
      /* VFMSUBADD213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 A7 /r */
      /* VFMSUBADD213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 A7 /r */
      /* VFMSUBADD213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 A7 /r */
      /* VFMSUBADD213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 A7 /r */
      /* VFMADD213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 A8 /r */
      /* VFMADD213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 A8 /r */
      /* VFMADD213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 A8 /r */
      /* VFMADD213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 A8 /r */
      /* VFMADD213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 A9 /r */
      /* VFMADD213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 A9 /r */
      /* VFMSUB213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 AA /r */
      /* VFMSUB213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 AA /r */
      /* VFMSUB213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 AA /r */
      /* VFMSUB213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 AA /r */
      /* VFMSUB213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 AB /r */
      /* VFMSUB213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 AB /r */
      /* VFNMADD213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 AC /r */
      /* VFNMADD213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 AC /r */
      /* VFNMADD213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 AC /r */
      /* VFNMADD213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 AC /r */
      /* VFNMADD213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 AD /r */
      /* VFNMADD213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 AD /r */
      /* VFNMSUB213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 AE /r */
      /* VFNMSUB213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 AE /r */
      /* VFNMSUB213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 AE /r */
      /* VFNMSUB213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 AE /r */
      /* VFNMSUB213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 AF /r */
      /* VFNMSUB213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 AF /r */
      /* VFMADDSUB231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 B6 /r */
      /* VFMADDSUB231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 B6 /r */
      /* VFMADDSUB231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 B6 /r */
      /* VFMADDSUB231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 B6 /r */
      /* VFMSUBADD231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 B7 /r */
      /* VFMSUBADD231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 B7 /r */
      /* VFMSUBADD231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 B7 /r */
      /* VFMSUBADD231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 B7 /r */
      /* VFMADD231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 B8 /r */
      /* VFMADD231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 B8 /r */
      /* VFMADD231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 B8 /r */
      /* VFMADD231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 B8 /r */
      /* VFMADD231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 B9 /r */
      /* VFMADD231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 B9 /r */
      /* VFMSUB231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 BA /r */
      /* VFMSUB231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 BA /r */
      /* VFMSUB231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 BA /r */
      /* VFMSUB231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 BA /r */
      /* VFMSUB231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 BB /r */
      /* VFMSUB231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 BB /r */
      /* VFNMADD231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 BC /r */
      /* VFNMADD231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 BC /r */
      /* VFNMADD231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 BC /r */
      /* VFNMADD231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 BC /r */
      /* VFNMADD231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 BD /r */
      /* VFNMADD231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 BD /r */
      /* VFNMSUB231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 BE /r */
      /* VFNMSUB231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 BE /r */
      /* VFNMSUB231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 BE /r */
      /* VFNMSUB231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 BE /r */
      /* VFNMSUB231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 BF /r */
      /* VFNMSUB231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 BF /r */
      if (have66noF2noF3(pfx)) {
         delta = dis_FMA( vbi, pfx, delta, opc );
         *uses_vvvv = True;
         /* FMA expands to a lot of IR; hint downstream that this
            block is verbose. */
         dres->hint = Dis_HintVerbose;
         goto decode_success;
      }
      break;
   29781 
   case 0xDB:
   case 0xDC:
   case 0xDD:
   case 0xDE:
   case 0xDF:
      /* VEX-encoded AES-NI.  All five are 128-bit only (VEX.L must
         be 0); dis_AESx dispatches on opc. */
      /* VAESIMC xmm2/m128, xmm1 = VEX.128.66.0F38.WIG DB /r */
      /* VAESENC xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DC /r */
      /* VAESENCLAST xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DD /r */
      /* VAESDEC xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DE /r */
      /* VAESDECLAST xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DF /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         /* This is the VEX decode path, so pass isAvx=True.  (The
            original annotation read "!isAvx", contradicting the value
            passed.) */
         delta = dis_AESx( vbi, pfx, delta, True/*isAvx*/, opc );
         /* VAESIMC (DB) is two-operand and has no vvvv source; the
            other four are three-operand. */
         if (opc != 0xDB) *uses_vvvv = True;
         goto decode_success;
      }
      break;
   29798 
   case 0xF2:
      /* BMI1 ANDN: G := ~V & E.  Operand size (32/64) comes from
         REX.W; the flags thunk is set to the dedicated ANDN op with
         the result in DEP1 and DEP2 zeroed. */
      /* ANDN r/m32, r32b, r32a = VEX.NDS.LZ.0F38.W0 F2 /r */
      /* ANDN r/m64, r64b, r64a = VEX.NDS.LZ.0F38.W1 F2 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
         Int     size = getRexW(pfx) ? 8 : 4;
         IRType  ty   = szToITy(size);
         IRTemp  dst  = newTemp(ty);
         IRTemp  src1 = newTemp(ty);   /* inverted operand, from vvvv */
         IRTemp  src2 = newTemp(ty);   /* r/m operand */
         UChar   rm   = getUChar(delta);

         assign( src1, getIRegV(size,pfx) );
         if (epartIsReg(rm)) {
            assign( src2, getIRegE(size,pfx,rm) );
            DIP("andn %s,%s,%s\n", nameIRegE(size,pfx,rm),
                nameIRegV(size,pfx), nameIRegG(size,pfx,rm));
            delta++;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( src2, loadLE(ty, mkexpr(addr)) );
            DIP("andn %s,%s,%s\n", dis_buf, nameIRegV(size,pfx),
                nameIRegG(size,pfx,rm));
            delta += alen;
         }

         /* dst = ~src1 & src2 */
         assign( dst, binop( mkSizedOp(ty,Iop_And8),
                             unop( mkSizedOp(ty,Iop_Not8), mkexpr(src1) ),
                             mkexpr(src2) ) );
         putIRegG( size, pfx, rm, mkexpr(dst) );
         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
                                               ? AMD64G_CC_OP_ANDN64
                                               : AMD64G_CC_OP_ANDN32)) );
         stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   29837 
   case 0xF3:
      /* BMI1 group 17: opcode F3 is shared between BLSI, BLSMSK and
         BLSR, distinguished by the reg field of the modrm byte
         (gregLO3ofRM).  These are VEX.NDD encodings: the r/m byte
         supplies the source and the DESTINATION is the vvvv register
         (putIRegV).  Each sets its own dedicated flags-thunk op with
         dst in DEP1 and src in DEP2. */
      /* BLSI r/m32, r32 = VEX.NDD.LZ.0F38.W0 F3 /3 */
      /* BLSI r/m64, r64 = VEX.NDD.LZ.0F38.W1 F3 /3 */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/
          && !haveREX(pfx) && gregLO3ofRM(getUChar(delta)) == 3) {
         Int     size = getRexW(pfx) ? 8 : 4;
         IRType  ty   = szToITy(size);
         IRTemp  src  = newTemp(ty);
         IRTemp  dst  = newTemp(ty);
         UChar   rm   = getUChar(delta);

         if (epartIsReg(rm)) {
            assign( src, getIRegE(size,pfx,rm) );
            DIP("blsi %s,%s\n", nameIRegE(size,pfx,rm),
                nameIRegV(size,pfx));
            delta++;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( src, loadLE(ty, mkexpr(addr)) );
            DIP("blsi %s,%s\n", dis_buf, nameIRegV(size,pfx));
            delta += alen;
         }

         /* BLSI: isolate lowest set bit: dst = (-src) & src */
         assign( dst, binop(mkSizedOp(ty,Iop_And8),
                            binop(mkSizedOp(ty,Iop_Sub8), mkU(ty, 0),
                                  mkexpr(src)), mkexpr(src)) );
         putIRegV( size, pfx, mkexpr(dst) );
         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
                                               ? AMD64G_CC_OP_BLSI64
                                               : AMD64G_CC_OP_BLSI32)) );
         stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
         stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(src))) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* BLSMSK r/m32, r32 = VEX.NDD.LZ.0F38.W0 F3 /2 */
      /* BLSMSK r/m64, r64 = VEX.NDD.LZ.0F38.W1 F3 /2 */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/
          && !haveREX(pfx) && gregLO3ofRM(getUChar(delta)) == 2) {
         Int     size = getRexW(pfx) ? 8 : 4;
         IRType  ty   = szToITy(size);
         IRTemp  src  = newTemp(ty);
         IRTemp  dst  = newTemp(ty);
         UChar   rm   = getUChar(delta);

         if (epartIsReg(rm)) {
            assign( src, getIRegE(size,pfx,rm) );
            DIP("blsmsk %s,%s\n", nameIRegE(size,pfx,rm),
                nameIRegV(size,pfx));
            delta++;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( src, loadLE(ty, mkexpr(addr)) );
            DIP("blsmsk %s,%s\n", dis_buf, nameIRegV(size,pfx));
            delta += alen;
         }

         /* BLSMSK: mask up to lowest set bit: dst = (src-1) ^ src */
         assign( dst, binop(mkSizedOp(ty,Iop_Xor8),
                            binop(mkSizedOp(ty,Iop_Sub8), mkexpr(src),
                                  mkU(ty, 1)), mkexpr(src)) );
         putIRegV( size, pfx, mkexpr(dst) );
         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
                                               ? AMD64G_CC_OP_BLSMSK64
                                               : AMD64G_CC_OP_BLSMSK32)) );
         stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
         stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(src))) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* BLSR r/m32, r32 = VEX.NDD.LZ.0F38.W0 F3 /1 */
      /* BLSR r/m64, r64 = VEX.NDD.LZ.0F38.W1 F3 /1 */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/
          && !haveREX(pfx) && gregLO3ofRM(getUChar(delta)) == 1) {
         Int     size = getRexW(pfx) ? 8 : 4;
         IRType  ty   = szToITy(size);
         IRTemp  src  = newTemp(ty);
         IRTemp  dst  = newTemp(ty);
         UChar   rm   = getUChar(delta);

         if (epartIsReg(rm)) {
            assign( src, getIRegE(size,pfx,rm) );
            DIP("blsr %s,%s\n", nameIRegE(size,pfx,rm),
                nameIRegV(size,pfx));
            delta++;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( src, loadLE(ty, mkexpr(addr)) );
            DIP("blsr %s,%s\n", dis_buf, nameIRegV(size,pfx));
            delta += alen;
         }

         /* BLSR: clear lowest set bit: dst = (src-1) & src */
         assign( dst, binop(mkSizedOp(ty,Iop_And8),
                            binop(mkSizedOp(ty,Iop_Sub8), mkexpr(src),
                                  mkU(ty, 1)), mkexpr(src)) );
         putIRegV( size, pfx, mkexpr(dst) );
         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
                                               ? AMD64G_CC_OP_BLSR64
                                               : AMD64G_CC_OP_BLSR32)) );
         stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
         stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(src))) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   29942 
   case 0xF5:
      /* Opcode F5 is shared by BZHI (no SIMD prefix, BMI2), PDEP (F2
         prefix) and PEXT (F3 prefix), distinguished by the mandatory
         prefix checks below. */
      /* BZHI r32b, r/m32, r32a = VEX.NDS.LZ.0F38.W0 F5 /r */
      /* BZHI r64b, r/m64, r64a = VEX.NDS.LZ.0F38.W1 F5 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
         Int     size  = getRexW(pfx) ? 8 : 4;
         IRType  ty    = szToITy(size);
         IRTemp  dst   = newTemp(ty);
         IRTemp  src1  = newTemp(ty);   /* value to be zero-high-extended (r/m) */
         IRTemp  src2  = newTemp(ty);   /* bit index operand (vvvv) */
         IRTemp  start = newTemp(Ity_I8);
         IRTemp  cond  = newTemp(Ity_I1);
         UChar   rm    = getUChar(delta);

         assign( src2, getIRegV(size,pfx) );
         if (epartIsReg(rm)) {
            assign( src1, getIRegE(size,pfx,rm) );
            DIP("bzhi %s,%s,%s\n", nameIRegV(size,pfx),
                nameIRegE(size,pfx,rm), nameIRegG(size,pfx,rm));
            delta++;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( src1, loadLE(ty, mkexpr(addr)) );
            DIP("bzhi %s,%s,%s\n", nameIRegV(size,pfx), dis_buf,
                nameIRegG(size,pfx,rm));
            delta += alen;
         }

         /* Only the low 8 bits of the index operand are used. */
         assign( start, narrowTo( Ity_I8, mkexpr(src2) ) );
         assign( cond, binop(Iop_CmpLT32U,
                             unop(Iop_8Uto32, mkexpr(start)),
                             mkU32(8*size)) );
         /* if (start < opsize) {
               if (start == 0)
                  dst = 0;
               else
                  dst = (src1 << (opsize-start)) u>> (opsize-start);
            } else {
               dst = src1;
            } */
         assign( dst,
                 IRExpr_ITE(
                    mkexpr(cond),
                    IRExpr_ITE(
                       binop(Iop_CmpEQ8, mkexpr(start), mkU8(0)),
                       mkU(ty, 0),
                       binop(
                          mkSizedOp(ty,Iop_Shr8),
                          binop(
                             mkSizedOp(ty,Iop_Shl8),
                             mkexpr(src1),
                             binop(Iop_Sub8, mkU8(8*size), mkexpr(start))
                          ),
                          binop(Iop_Sub8, mkU8(8*size), mkexpr(start))
                       )
                    ),
                    mkexpr(src1)
                 )
               );
         putIRegG( size, pfx, rm, mkexpr(dst) );
         /* NOTE(review): BZHI reuses the BLSR thunk op codes, with the
            start<opsize condition (not a source value) in DEP2 --
            presumably the flags helper computes BZHI's CF/ZF/SF from
            that pair; confirm against the guest helper. */
         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
                                               ? AMD64G_CC_OP_BLSR64
                                               : AMD64G_CC_OP_BLSR32)) );
         stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
         stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(cond))) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* PDEP r/m32, r32b, r32a = VEX.NDS.LZ.F2.0F38.W0 F5 /r */
      /* PDEP r/m64, r64b, r64a = VEX.NDS.LZ.F2.0F38.W1 F5 /r */
      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
         Int     size = getRexW(pfx) ? 8 : 4;
         IRType  ty   = szToITy(size);
         IRTemp  src  = newTemp(ty);
         IRTemp  mask = newTemp(ty);
         UChar   rm   = getUChar(delta);

         assign( src, getIRegV(size,pfx) );
         if (epartIsReg(rm)) {
            assign( mask, getIRegE(size,pfx,rm) );
            DIP("pdep %s,%s,%s\n", nameIRegE(size,pfx,rm),
                nameIRegV(size,pfx), nameIRegG(size,pfx,rm));
            delta++;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( mask, loadLE(ty, mkexpr(addr)) );
            DIP("pdep %s,%s,%s\n", dis_buf, nameIRegV(size,pfx),
                nameIRegG(size,pfx,rm));
            delta += alen;
         }

         /* PDEP is too irregular to express in IR; compute it with a
            clean helper call on 64-bit-widened operands and narrow
            the result back. */
         IRExpr** args = mkIRExprVec_2( widenUto64(mkexpr(src)),
                                        widenUto64(mkexpr(mask)) );
         putIRegG( size, pfx, rm,
                   narrowTo(ty, mkIRExprCCall(Ity_I64, 0/*regparms*/,
                                              "amd64g_calculate_pdep",
                                              &amd64g_calculate_pdep, args)) );
         *uses_vvvv = True;
         /* Flags aren't modified.  */
         goto decode_success;
      }
   30043       /* PEXT r/m32, r32b, r32a = VEX.NDS.LZ.F3.0F38.W0 F5 /r */
   30044       /* PEXT r/m64, r64b, r64a = VEX.NDS.LZ.F3.0F38.W1 F5 /r */
   30045       if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
   30046          Int     size = getRexW(pfx) ? 8 : 4;
   30047          IRType  ty   = szToITy(size);
   30048          IRTemp  src  = newTemp(ty);
   30049          IRTemp  mask = newTemp(ty);
   30050          UChar   rm   = getUChar(delta);
   30051 
   30052          assign( src, getIRegV(size,pfx) );
   30053          if (epartIsReg(rm)) {
   30054             assign( mask, getIRegE(size,pfx,rm) );
   30055             DIP("pext %s,%s,%s\n", nameIRegE(size,pfx,rm),
   30056                 nameIRegV(size,pfx), nameIRegG(size,pfx,rm));
   30057             delta++;
   30058          } else {
   30059             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   30060             assign( mask, loadLE(ty, mkexpr(addr)) );
   30061             DIP("pext %s,%s,%s\n", dis_buf, nameIRegV(size,pfx),
   30062                 nameIRegG(size,pfx,rm));
   30063             delta += alen;
   30064          }
   30065 
   30066          /* First mask off bits not set in mask, they are ignored
   30067             and it should be fine if they contain undefined values.  */
   30068          IRExpr* masked = binop(mkSizedOp(ty,Iop_And8),
   30069                                 mkexpr(src), mkexpr(mask));
   30070          IRExpr** args = mkIRExprVec_2( widenUto64(masked),
   30071                                         widenUto64(mkexpr(mask)) );
   30072          putIRegG( size, pfx, rm,
   30073                    narrowTo(ty, mkIRExprCCall(Ity_I64, 0/*regparms*/,
   30074                                               "amd64g_calculate_pext",
   30075                                               &amd64g_calculate_pext, args)) );
   30076          *uses_vvvv = True;
   30077          /* Flags aren't modified.  */
   30078          goto decode_success;
   30079       }
   30080       break;
   30081 
   30082    case 0xF6:
   30083       /* MULX r/m32, r32b, r32a = VEX.NDD.LZ.F2.0F38.W0 F6 /r */
   30084       /* MULX r/m64, r64b, r64a = VEX.NDD.LZ.F2.0F38.W1 F6 /r */
   30085       if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
   30086          Int     size = getRexW(pfx) ? 8 : 4;
   30087          IRType  ty   = szToITy(size);
   30088          IRTemp  src1 = newTemp(ty);
   30089          IRTemp  src2 = newTemp(ty);
   30090          IRTemp  res  = newTemp(size == 8 ? Ity_I128 : Ity_I64);
   30091          UChar   rm   = getUChar(delta);
   30092 
   30093          assign( src1, getIRegRDX(size) );
   30094          if (epartIsReg(rm)) {
   30095             assign( src2, getIRegE(size,pfx,rm) );
   30096             DIP("mulx %s,%s,%s\n", nameIRegE(size,pfx,rm),
   30097                 nameIRegV(size,pfx), nameIRegG(size,pfx,rm));
   30098             delta++;
   30099          } else {
   30100             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   30101             assign( src2, loadLE(ty, mkexpr(addr)) );
   30102             DIP("mulx %s,%s,%s\n", dis_buf, nameIRegV(size,pfx),
   30103                 nameIRegG(size,pfx,rm));
   30104             delta += alen;
   30105          }
   30106 
   30107          assign( res, binop(size == 8 ? Iop_MullU64 : Iop_MullU32,
   30108                             mkexpr(src1), mkexpr(src2)) );
   30109          putIRegV( size, pfx,
   30110                    unop(size == 8 ? Iop_128to64 : Iop_64to32, mkexpr(res)) );
   30111          putIRegG( size, pfx, rm,
   30112                    unop(size == 8 ? Iop_128HIto64 : Iop_64HIto32,
   30113                         mkexpr(res)) );
   30114          *uses_vvvv = True;
   30115          /* Flags aren't modified.  */
   30116          goto decode_success;
   30117       }
   30118       break;
   30119 
   30120    case 0xF7:
   30121       /* SARX r32b, r/m32, r32a = VEX.NDS.LZ.F3.0F38.W0 F7 /r */
   30122       /* SARX r64b, r/m64, r64a = VEX.NDS.LZ.F3.0F38.W1 F7 /r */
   30123       if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
   30124          delta = dis_SHIFTX( uses_vvvv, vbi, pfx, delta, "sarx", Iop_Sar8 );
   30125          goto decode_success;
   30126       }
   30127       /* SHLX r32b, r/m32, r32a = VEX.NDS.LZ.66.0F38.W0 F7 /r */
   30128       /* SHLX r64b, r/m64, r64a = VEX.NDS.LZ.66.0F38.W1 F7 /r */
   30129       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
   30130          delta = dis_SHIFTX( uses_vvvv, vbi, pfx, delta, "shlx", Iop_Shl8 );
   30131          goto decode_success;
   30132       }
   30133       /* SHRX r32b, r/m32, r32a = VEX.NDS.LZ.F2.0F38.W0 F7 /r */
   30134       /* SHRX r64b, r/m64, r64a = VEX.NDS.LZ.F2.0F38.W1 F7 /r */
   30135       if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
   30136          delta = dis_SHIFTX( uses_vvvv, vbi, pfx, delta, "shrx", Iop_Shr8 );
   30137          goto decode_success;
   30138       }
   30139       /* BEXTR r32b, r/m32, r32a = VEX.NDS.LZ.0F38.W0 F7 /r */
   30140       /* BEXTR r64b, r/m64, r64a = VEX.NDS.LZ.0F38.W1 F7 /r */
   30141       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
   30142          Int     size  = getRexW(pfx) ? 8 : 4;
   30143          IRType  ty    = szToITy(size);
   30144          IRTemp  dst   = newTemp(ty);
   30145          IRTemp  src1  = newTemp(ty);
   30146          IRTemp  src2  = newTemp(ty);
   30147          IRTemp  stle  = newTemp(Ity_I16);
   30148          IRTemp  start = newTemp(Ity_I8);
   30149          IRTemp  len   = newTemp(Ity_I8);
   30150          UChar   rm    = getUChar(delta);
   30151 
   30152          assign( src2, getIRegV(size,pfx) );
   30153          if (epartIsReg(rm)) {
   30154             assign( src1, getIRegE(size,pfx,rm) );
   30155             DIP("bextr %s,%s,%s\n", nameIRegV(size,pfx),
   30156                 nameIRegE(size,pfx,rm), nameIRegG(size,pfx,rm));
   30157             delta++;
   30158          } else {
   30159             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   30160             assign( src1, loadLE(ty, mkexpr(addr)) );
   30161             DIP("bextr %s,%s,%s\n", nameIRegV(size,pfx), dis_buf,
   30162                 nameIRegG(size,pfx,rm));
   30163             delta += alen;
   30164          }
   30165 
   30166          assign( stle, narrowTo( Ity_I16, mkexpr(src2) ) );
   30167          assign( start, unop( Iop_16to8, mkexpr(stle) ) );
   30168          assign( len, unop( Iop_16HIto8, mkexpr(stle) ) );
   30169          /* if (start+len < opsize) {
   30170                if (len != 0)
   30171                   dst = (src1 << (opsize-start-len)) u>> (opsize-len);
   30172                else
   30173                   dst = 0;
   30174             } else {
   30175                if (start < opsize)
   30176                   dst = src1 u>> start;
   30177                else
   30178                   dst = 0;
   30179             } */
   30180          assign( dst,
   30181                  IRExpr_ITE(
   30182                     binop(Iop_CmpLT32U,
   30183                           binop(Iop_Add32,
   30184                                 unop(Iop_8Uto32, mkexpr(start)),
   30185                                 unop(Iop_8Uto32, mkexpr(len))),
   30186                           mkU32(8*size)),
   30187                     IRExpr_ITE(
   30188                        binop(Iop_CmpEQ8, mkexpr(len), mkU8(0)),
   30189                        mkU(ty, 0),
   30190                        binop(mkSizedOp(ty,Iop_Shr8),
   30191                              binop(mkSizedOp(ty,Iop_Shl8), mkexpr(src1),
   30192                                    binop(Iop_Sub8,
   30193                                          binop(Iop_Sub8, mkU8(8*size),
   30194                                                mkexpr(start)),
   30195                                          mkexpr(len))),
   30196                              binop(Iop_Sub8, mkU8(8*size),
   30197                                    mkexpr(len)))
   30198                     ),
   30199                     IRExpr_ITE(
   30200                        binop(Iop_CmpLT32U,
   30201                              unop(Iop_8Uto32, mkexpr(start)),
   30202                              mkU32(8*size)),
   30203                        binop(mkSizedOp(ty,Iop_Shr8), mkexpr(src1),
   30204                              mkexpr(start)),
   30205                        mkU(ty, 0)
   30206                     )
   30207                  )
   30208                );
   30209          putIRegG( size, pfx, rm, mkexpr(dst) );
   30210          stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
   30211                                                ? AMD64G_CC_OP_ANDN64
   30212                                                : AMD64G_CC_OP_ANDN32)) );
   30213          stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
   30214          stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
   30215          *uses_vvvv = True;
   30216          goto decode_success;
   30217       }
   30218       break;
   30219 
   30220    default:
   30221       break;
   30222 
   30223    }
   30224 
   30225   //decode_failure:
   30226    return deltaIN;
   30227 
   30228   decode_success:
   30229    return delta;
   30230 }
   30231 
/* Operand layout used by decode_vregW below:
 * v[0] = destination register value
 * v[n] = the n-th source operand (n >= 1)
 */
/* Decode the operands of a VEX-encoded insn with up to four XMM
   operands (used by the FMA4 decoder).  Fills v[0..count-1] with
   V128 temps holding the operand values, stores the destination
   register number in *dst, and returns the updated instruction
   offset.  'swap' (nonzero) exchanges the last two source operands
   (callers pass the VEX.W bit, which selects operand order).
   NOTE(review): delta bookkeeping here is unusual — see the
   'delta += alen - 1' / 'return delta + 1' pair below; confirm
   against callers before changing anything. */
static Long decode_vregW(Int count, Long delta, UChar modrm, Prefix pfx,
                         const VexAbiInfo* vbi, IRTemp *v, UInt *dst, Int swap)
{
   /* Allocate all four operand temps up front; unused ones stay
      unassigned. */
   v[0] = newTemp(Ity_V128);
   v[1] = newTemp(Ity_V128);
   v[2] = newTemp(Ity_V128);
   v[3] = newTemp(Ity_V128);
   IRTemp addr = IRTemp_INVALID;
   Int    alen = 0;
   HChar  dis_buf[50];

   /* Operand 0 is always the G (reg field) register, which is also
      the destination. */
   *dst = gregOfRexRM(pfx, modrm);
   assign( v[0], getXMMReg(*dst) );

   if ( epartIsReg( modrm ) ) {
      /* E operand is a register: it becomes the last or second-to-last
         source depending on 'swap'. */
      UInt ereg = eregOfRexRM(pfx, modrm);
      assign(swap ? v[count-1] : v[count-2], getXMMReg(ereg) );
      DIS(dis_buf, "%s", nameXMMReg(ereg));
   } else {
      /* E operand is memory.  The extra_byte flag accounts for a
         trailing immediate-style byte (the is4 register selector) in
         the address-mode length calculation; it is derived by peeking
         at the opcode byte three bytes back.
         NOTE(review): this heuristic depends on the exact encoding of
         the callers' opcodes — TODO confirm which opcodes have the
         low nibble 9 that suppresses the extra byte. */
      Bool extra_byte = (getUChar(delta - 3) & 0xF) != 9;
                 addr = disAMode(&alen, vbi, pfx, delta, dis_buf, extra_byte);
      assign(swap ? v[count-1] : v[count-2], loadLE(Ity_V128, mkexpr(addr)));
      /* -1 here pairs with the unconditional +1 in the return below. */
      delta += alen - 1;
   }

   /* Remaining sources come from VEX.vvvv and, for 4-operand forms,
      from the high nibble of the trailing is4 byte. */
   UInt vvvv = getVexNvvvv(pfx);
   switch(count) {
      case 2:
         DIP( "%s,%s", nameXMMReg(*dst), dis_buf );
         break;
      case 3:
         assign( swap ? v[1] : v[2], getXMMReg(vvvv) );
         DIP( "%s,%s,%s", nameXMMReg(*dst), nameXMMReg(vvvv), dis_buf );
         break;
      case 4:
         {
            assign( v[1], getXMMReg(vvvv) );
            /* is4 byte: top 4 bits select the extra source register. */
            UInt src2 = getUChar(delta + 1) >> 4;
            assign( swap ? v[2] : v[3], getXMMReg(src2) );
            DIP( "%s,%s,%s,%s", nameXMMReg(*dst), nameXMMReg(vvvv),
                                nameXMMReg(src2), dis_buf );
         }
         break;
   }
   /* Skip the modrm byte (register case) / complete the alen
      adjustment (memory case). */
   return delta + 1;
}
   30282 
/* Decode and generate IR for an AMD FMA4 instruction
   (VFMADD/VFMSUB/VFNMADD/VFNMSUB and the ADDSUB/SUBADD variants),
   packed or scalar, single or double precision.  The operation
   variant is recovered from bit patterns in the opcode byte 'opc'.
   Returns the updated instruction offset. */
static Long dis_FMA4 (Prefix pfx, Long delta, UChar opc,
                      Bool* uses_vvvv, const VexAbiInfo* vbi )
{
   UInt dst;
   *uses_vvvv = True;

   UChar  modrm   = getUChar(delta);

   /* Opcode bit decoding:
      - low bit 0      => single precision (F32), else double (F64)
      - high nibble 7  => negated product (VFNM*)
      - high nibble 5  => alternating add/sub (ADDSUB/SUBADD families)
      - zero_64F/zero_96F mark scalar forms whose upper destination
        lanes must be zeroed after the operation. */
   Bool zero_64F = False;
   Bool zero_96F = False;
   UInt is_F32   = ((opc & 0x01) == 0x00) ? 1 : 0;
   Bool neg      = (opc & 0xF0) == 0x70;
   Bool alt      = (opc & 0xF0) == 0x50;
   Bool sub      = alt ? (opc & 0x0E) != 0x0E : (opc & 0x0C) == 0x0C;

   IRTemp operand[4];
   /* Scalar variants (low nibble A/B/E/F, except the 0x5x ADDSUB
      group) only write the low 32 or 64 bits of the destination. */
   switch(opc & 0xF) {
      case 0x0A: zero_96F = (opc >> 4) != 0x05; break;
      case 0x0B: zero_64F = (opc >> 4) != 0x05; break;
      case 0x0E: zero_96F = (opc >> 4) != 0x05; break;
      case 0x0F: zero_64F = (opc >> 4) != 0x05; break;
      default: break;
   }
   /* Assemble the mnemonic piecewise for the disassembly trace. */
   DIP("vfm%s",                  neg ?   "n" : "");
   if(alt) DIP("%s",             sub ? "add" : "sub");
   DIP("%s",                     sub ? "sub" : "add");
   DIP("%c ", (zero_64F || zero_96F) ?   's' : 'p');
   DIP("%c ",                is_F32  ?   's' : 'd');
   delta = decode_vregW(4, delta, modrm, pfx, vbi, operand, &dst, getRexW(pfx));
   DIP("\n");
   IRExpr *src[3];

   /* Lane writer, selected by element width: [0]=64-bit, [1]=32-bit. */
   void (*putXMM[2])(UInt,Int,IRExpr*) = {&putXMMRegLane64F, &putXMMRegLane32F};

   IROp size_op[] = {Iop_V128to64, Iop_V128HIto64, Iop_64to32, Iop_64HIto32};
   IROp neg_op[]  = {Iop_NegF64, Iop_NegF32};
   int i, j;
   /* Iterate over destination lanes: 4 for F32, 2 for F64.  For F32,
      i/2 picks the 64-bit half of the vector and i%2 the 32-bit half
      of that. */
   for(i = 0; i < is_F32 * 2 + 2; i++) {
      for(j = 0; j < 3; j++) {
         if(is_F32) {
            src[j] = unop(Iop_ReinterpI32asF32,
                        unop(size_op[i%2+2],
                           unop(size_op[i/2],
                                 mkexpr(operand[j + 1])
                              )
                           ));
         } else {
            src[j] = unop(Iop_ReinterpI64asF64,
                        unop(size_op[i%2],
                           mkexpr(operand[j + 1])
                        ));
         }
      }
      /* dst[i] = (+/-)(src0 * src1) (+/-) src2, via fused MAdd.
         NOTE(review): get_FAKE_roundingmode suggests the guest's real
         MXCSR rounding mode is not consulted here — confirm intended. */
      putXMM[is_F32](dst, i, IRExpr_Qop(is_F32 ? Iop_MAddF32 : Iop_MAddF64,
                                             get_FAKE_roundingmode(),
                                             neg ? unop(neg_op[is_F32], src[0])
                                                 : src[0],
                                             src[1],
                                             sub ? unop(neg_op[is_F32], src[2])
                                                 : src[2]
                                          ));
      /* ADDSUB/SUBADD alternate the sign of the addend per lane. */
      if(alt) {
         sub = !sub;
      }
   }

   /* Zero out top bits of ymm/xmm register. */
   putYMMRegLane128( dst, 1, mkV128(0) );

   /* Scalar forms additionally zero the untouched upper xmm lanes. */
   if(zero_64F || zero_96F) {
      putXMMRegLane64( dst, 1, IRExpr_Const(IRConst_U64(0)));
   }

   if(zero_96F) {
      putXMMRegLane32( dst, 1, IRExpr_Const(IRConst_U32(0)));
   }

   return delta+1;
}
   30362 
   30363 /*------------------------------------------------------------*/
   30364 /*---                                                      ---*/
   30365 /*--- Top-level post-escape decoders: dis_ESC_0F3A__VEX    ---*/
   30366 /*---                                                      ---*/
   30367 /*------------------------------------------------------------*/
   30368 
   30369 static IRTemp math_VPERMILPS_128 ( IRTemp sV, UInt imm8 )
   30370 {
   30371    vassert(imm8 < 256);
   30372    IRTemp s3, s2, s1, s0;
   30373    s3 = s2 = s1 = s0 = IRTemp_INVALID;
   30374    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   30375 #  define SEL(_nn) (((_nn)==0) ? s0 : ((_nn)==1) ? s1 \
   30376                                     : ((_nn)==2) ? s2 : s3)
   30377    IRTemp res = newTemp(Ity_V128);
   30378    assign(res, mkV128from32s( SEL((imm8 >> 6) & 3),
   30379                               SEL((imm8 >> 4) & 3),
   30380                               SEL((imm8 >> 2) & 3),
   30381                               SEL((imm8 >> 0) & 3) ));
   30382 #  undef SEL
   30383    return res;
   30384 }
   30385 
   30386 __attribute__((noinline))
   30387 static
   30388 Long dis_ESC_0F3A__VEX (
   30389         /*MB_OUT*/DisResult* dres,
   30390         /*OUT*/   Bool*      uses_vvvv,
   30391         Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
   30392         Bool         resteerCisOk,
   30393         void*        callback_opaque,
   30394         const VexArchInfo* archinfo,
   30395         const VexAbiInfo*  vbi,
   30396         Prefix pfx, Int sz, Long deltaIN
   30397      )
   30398 {
   30399    IRTemp addr  = IRTemp_INVALID;
   30400    Int    alen  = 0;
   30401    HChar  dis_buf[50];
   30402    Long   delta = deltaIN;
   30403    UChar  opc   = getUChar(delta);
   30404    delta++;
   30405    *uses_vvvv = False;
   30406 
   30407    switch (opc) {
   30408 
   30409    case 0x00:
   30410    case 0x01:
   30411       /* VPERMQ imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.W1 00 /r ib */
   30412       /* VPERMPD imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.W1 01 /r ib */
   30413       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
   30414           && 1==getRexW(pfx)/*W1*/) {
   30415          UChar  modrm = getUChar(delta);
   30416          UInt   imm8  = 0;
   30417          UInt   rG    = gregOfRexRM(pfx, modrm);
   30418          IRTemp sV    = newTemp(Ity_V256);
   30419          const HChar *name  = opc == 0 ? "vpermq" : "vpermpd";
   30420          if (epartIsReg(modrm)) {
   30421             UInt rE = eregOfRexRM(pfx, modrm);
   30422             delta += 1;
   30423             imm8 = getUChar(delta);
   30424             DIP("%s $%u,%s,%s\n",
   30425                 name, imm8, nameYMMReg(rE), nameYMMReg(rG));
   30426             assign(sV, getYMMReg(rE));
   30427          } else {
   30428             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30429             delta += alen;
   30430             imm8 = getUChar(delta);
   30431             DIP("%s $%u,%s,%s\n",
   30432                 name, imm8, dis_buf, nameYMMReg(rG));
   30433             assign(sV, loadLE(Ity_V256, mkexpr(addr)));
   30434          }
   30435          delta++;
   30436          IRTemp s[4];
   30437          s[3] = s[2] = s[1] = s[0] = IRTemp_INVALID;
   30438          breakupV256to64s(sV, &s[3], &s[2], &s[1], &s[0]);
   30439          IRTemp dV = newTemp(Ity_V256);
   30440          assign(dV, IRExpr_Qop(Iop_64x4toV256,
   30441                                mkexpr(s[(imm8 >> 6) & 3]),
   30442                                mkexpr(s[(imm8 >> 4) & 3]),
   30443                                mkexpr(s[(imm8 >> 2) & 3]),
   30444                                mkexpr(s[(imm8 >> 0) & 3])));
   30445          putYMMReg(rG, mkexpr(dV));
   30446          goto decode_success;
   30447       }
   30448       break;
   30449 
   30450    case 0x02:
   30451       /* VPBLENDD imm8, xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W0 02 /r ib */
   30452       if (have66noF2noF3(pfx)
   30453           && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
   30454          UChar  modrm = getUChar(delta);
   30455          UInt   imm8  = 0;
   30456          UInt   rG    = gregOfRexRM(pfx, modrm);
   30457          UInt   rV    = getVexNvvvv(pfx);
   30458          IRTemp sV    = newTemp(Ity_V128);
   30459          IRTemp dV    = newTemp(Ity_V128);
   30460          UInt   i;
   30461          IRTemp s[4], d[4];
   30462          assign(sV, getXMMReg(rV));
   30463          if (epartIsReg(modrm)) {
   30464             UInt rE = eregOfRexRM(pfx, modrm);
   30465             delta += 1;
   30466             imm8 = getUChar(delta);
   30467             DIP("vpblendd $%u,%s,%s,%s\n",
   30468                 imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   30469             assign(dV, getXMMReg(rE));
   30470          } else {
   30471             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30472             delta += alen;
   30473             imm8 = getUChar(delta);
   30474             DIP("vpblendd $%u,%s,%s,%s\n",
   30475                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   30476             assign(dV, loadLE(Ity_V128, mkexpr(addr)));
   30477          }
   30478          delta++;
   30479          for (i = 0; i < 4; i++) {
   30480             s[i] = IRTemp_INVALID;
   30481             d[i] = IRTemp_INVALID;
   30482          }
   30483          breakupV128to32s( sV, &s[3], &s[2], &s[1], &s[0] );
   30484          breakupV128to32s( dV, &d[3], &d[2], &d[1], &d[0] );
   30485          for (i = 0; i < 4; i++)
   30486             putYMMRegLane32(rG, i, mkexpr((imm8 & (1<<i)) ? d[i] : s[i]));
   30487          putYMMRegLane128(rG, 1, mkV128(0));
   30488          *uses_vvvv = True;
   30489          goto decode_success;
   30490       }
   30491       /* VPBLENDD imm8, ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F3A.W0 02 /r ib */
   30492       if (have66noF2noF3(pfx)
   30493           && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
   30494          UChar  modrm = getUChar(delta);
   30495          UInt   imm8  = 0;
   30496          UInt   rG    = gregOfRexRM(pfx, modrm);
   30497          UInt   rV    = getVexNvvvv(pfx);
   30498          IRTemp sV    = newTemp(Ity_V256);
   30499          IRTemp dV    = newTemp(Ity_V256);
   30500          UInt   i;
   30501          IRTemp s[8], d[8];
   30502          assign(sV, getYMMReg(rV));
   30503          if (epartIsReg(modrm)) {
   30504             UInt rE = eregOfRexRM(pfx, modrm);
   30505             delta += 1;
   30506             imm8 = getUChar(delta);
   30507             DIP("vpblendd $%u,%s,%s,%s\n",
   30508                 imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
   30509             assign(dV, getYMMReg(rE));
   30510          } else {
   30511             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30512             delta += alen;
   30513             imm8 = getUChar(delta);
   30514             DIP("vpblendd $%u,%s,%s,%s\n",
   30515                 imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
   30516             assign(dV, loadLE(Ity_V256, mkexpr(addr)));
   30517          }
   30518          delta++;
   30519          for (i = 0; i < 8; i++) {
   30520             s[i] = IRTemp_INVALID;
   30521             d[i] = IRTemp_INVALID;
   30522          }
   30523          breakupV256to32s( sV, &s[7], &s[6], &s[5], &s[4],
   30524                                &s[3], &s[2], &s[1], &s[0] );
   30525          breakupV256to32s( dV, &d[7], &d[6], &d[5], &d[4],
   30526                                &d[3], &d[2], &d[1], &d[0] );
   30527          for (i = 0; i < 8; i++)
   30528             putYMMRegLane32(rG, i, mkexpr((imm8 & (1<<i)) ? d[i] : s[i]));
   30529          *uses_vvvv = True;
   30530          goto decode_success;
   30531       }
   30532       break;
   30533 
   30534    case 0x04:
   30535       /* VPERMILPS imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.WIG 04 /r ib */
   30536       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   30537          UChar  modrm = getUChar(delta);
   30538          UInt   imm8  = 0;
   30539          UInt   rG    = gregOfRexRM(pfx, modrm);
   30540          IRTemp sV    = newTemp(Ity_V256);
   30541          if (epartIsReg(modrm)) {
   30542             UInt rE = eregOfRexRM(pfx, modrm);
   30543             delta += 1;
   30544             imm8 = getUChar(delta);
   30545             DIP("vpermilps $%u,%s,%s\n",
   30546                 imm8, nameYMMReg(rE), nameYMMReg(rG));
   30547             assign(sV, getYMMReg(rE));
   30548          } else {
   30549             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30550             delta += alen;
   30551             imm8 = getUChar(delta);
   30552             DIP("vpermilps $%u,%s,%s\n",
   30553                 imm8, dis_buf, nameYMMReg(rG));
   30554             assign(sV, loadLE(Ity_V256, mkexpr(addr)));
   30555          }
   30556          delta++;
   30557          IRTemp  sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   30558          breakupV256toV128s( sV, &sVhi, &sVlo );
   30559          IRTemp  dVhi = math_VPERMILPS_128( sVhi, imm8 );
   30560          IRTemp  dVlo = math_VPERMILPS_128( sVlo, imm8 );
   30561          IRExpr* res  = binop(Iop_V128HLtoV256, mkexpr(dVhi), mkexpr(dVlo));
   30562          putYMMReg(rG, res);
   30563          goto decode_success;
   30564       }
   30565       /* VPERMILPS imm8, xmm2/m128, xmm1 = VEX.128.66.0F3A.WIG 04 /r ib */
   30566       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   30567          UChar  modrm = getUChar(delta);
   30568          UInt   imm8  = 0;
   30569          UInt   rG    = gregOfRexRM(pfx, modrm);
   30570          IRTemp sV    = newTemp(Ity_V128);
   30571          if (epartIsReg(modrm)) {
   30572             UInt rE = eregOfRexRM(pfx, modrm);
   30573             delta += 1;
   30574             imm8 = getUChar(delta);
   30575             DIP("vpermilps $%u,%s,%s\n",
   30576                 imm8, nameXMMReg(rE), nameXMMReg(rG));
   30577             assign(sV, getXMMReg(rE));
   30578          } else {
   30579             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30580             delta += alen;
   30581             imm8 = getUChar(delta);
   30582             DIP("vpermilps $%u,%s,%s\n",
   30583                 imm8, dis_buf, nameXMMReg(rG));
   30584             assign(sV, loadLE(Ity_V128, mkexpr(addr)));
   30585          }
   30586          delta++;
   30587          putYMMRegLoAndZU(rG, mkexpr ( math_VPERMILPS_128 ( sV, imm8 ) ) );
   30588          goto decode_success;
   30589       }
   30590       break;
   30591 
   30592    case 0x05:
   30593       /* VPERMILPD imm8, xmm2/m128, xmm1 = VEX.128.66.0F3A.WIG 05 /r ib */
   30594       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   30595          UChar  modrm = getUChar(delta);
   30596          UInt   imm8  = 0;
   30597          UInt   rG    = gregOfRexRM(pfx, modrm);
   30598          IRTemp sV    = newTemp(Ity_V128);
   30599          if (epartIsReg(modrm)) {
   30600             UInt rE = eregOfRexRM(pfx, modrm);
   30601             delta += 1;
   30602             imm8 = getUChar(delta);
   30603             DIP("vpermilpd $%u,%s,%s\n",
   30604                 imm8, nameXMMReg(rE), nameXMMReg(rG));
   30605             assign(sV, getXMMReg(rE));
   30606          } else {
   30607             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30608             delta += alen;
   30609             imm8 = getUChar(delta);
   30610             DIP("vpermilpd $%u,%s,%s\n",
   30611                 imm8, dis_buf, nameXMMReg(rG));
   30612             assign(sV, loadLE(Ity_V128, mkexpr(addr)));
   30613          }
   30614          delta++;
   30615          IRTemp s1 = newTemp(Ity_I64);
   30616          IRTemp s0 = newTemp(Ity_I64);
   30617          assign(s1, unop(Iop_V128HIto64, mkexpr(sV)));
   30618          assign(s0, unop(Iop_V128to64,   mkexpr(sV)));
   30619          IRTemp dV = newTemp(Ity_V128);
   30620          assign(dV, binop(Iop_64HLtoV128,
   30621                                mkexpr((imm8 & (1<<1)) ? s1 : s0),
   30622                                mkexpr((imm8 & (1<<0)) ? s1 : s0)));
   30623          putYMMRegLoAndZU(rG, mkexpr(dV));
   30624          goto decode_success;
   30625       }
   30626       /* VPERMILPD imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.WIG 05 /r ib */
   30627       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   30628          UChar  modrm = getUChar(delta);
   30629          UInt   imm8  = 0;
   30630          UInt   rG    = gregOfRexRM(pfx, modrm);
   30631          IRTemp sV    = newTemp(Ity_V256);
   30632          if (epartIsReg(modrm)) {
   30633             UInt rE = eregOfRexRM(pfx, modrm);
   30634             delta += 1;
   30635             imm8 = getUChar(delta);
   30636             DIP("vpermilpd $%u,%s,%s\n",
   30637                 imm8, nameYMMReg(rE), nameYMMReg(rG));
   30638             assign(sV, getYMMReg(rE));
   30639          } else {
   30640             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30641             delta += alen;
   30642             imm8 = getUChar(delta);
   30643             DIP("vpermilpd $%u,%s,%s\n",
   30644                 imm8, dis_buf, nameYMMReg(rG));
   30645             assign(sV, loadLE(Ity_V256, mkexpr(addr)));
   30646          }
   30647          delta++;
   30648          IRTemp s3, s2, s1, s0;
   30649          s3 = s2 = s1 = s0 = IRTemp_INVALID;
   30650          breakupV256to64s(sV, &s3, &s2, &s1, &s0);
   30651          IRTemp dV = newTemp(Ity_V256);
   30652          assign(dV, IRExpr_Qop(Iop_64x4toV256,
   30653                                mkexpr((imm8 & (1<<3)) ? s3 : s2),
   30654                                mkexpr((imm8 & (1<<2)) ? s3 : s2),
   30655                                mkexpr((imm8 & (1<<1)) ? s1 : s0),
   30656                                mkexpr((imm8 & (1<<0)) ? s1 : s0)));
   30657          putYMMReg(rG, mkexpr(dV));
   30658          goto decode_success;
   30659       }
   30660       break;
   30661 
   case 0x06:
      /* VPERM2F128 imm8, ymm3/m256, ymm2, ymm1 = VEX.NDS.66.0F3A.W0 06 /r ib */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   imm8  = 0;
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         /* The four candidate 128-bit source lanes: s00/s01 are the
            lo/hi halves of the vvvv register, s10/s11 the lo/hi halves
            of the E operand (register or memory). */
         IRTemp s00   = newTemp(Ity_V128);
         IRTemp s01   = newTemp(Ity_V128);
         IRTemp s10   = newTemp(Ity_V128);
         IRTemp s11   = newTemp(Ity_V128);
         assign(s00, getYMMRegLane128(rV, 0));
         assign(s01, getYMMRegLane128(rV, 1));
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            imm8 = getUChar(delta);
            DIP("vperm2f128 $%u,%s,%s,%s\n",
                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
            assign(s10, getYMMRegLane128(rE, 0));
            assign(s11, getYMMRegLane128(rE, 1));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += alen;
            imm8 = getUChar(delta);
            DIP("vperm2f128 $%u,%s,%s,%s\n",
                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
            /* Memory form: the two halves are loaded at addr+0 and addr+16. */
            assign(s10, loadLE(Ity_V128, binop(Iop_Add64,
                                               mkexpr(addr), mkU64(0))));
            assign(s11, loadLE(Ity_V128, binop(Iop_Add64,
                                               mkexpr(addr), mkU64(16))));
         }
         delta++; /* consume the imm8 byte */
         /* Map a 2-bit selector value onto one of the four source lanes. */
#        define SEL(_nn) (((_nn)==0) ? s00 : ((_nn)==1) ? s01 \
                                           : ((_nn)==2) ? s10 : s11)
         putYMMRegLane128(rG, 0, mkexpr(SEL((imm8 >> 0) & 3)));
         putYMMRegLane128(rG, 1, mkexpr(SEL((imm8 >> 4) & 3)));
#        undef SEL
         /* imm8 bits 3 and 7 force-zero the corresponding result lane,
            overriding the selection just made. */
         if (imm8 & (1<<3)) putYMMRegLane128(rG, 0, mkV128(0));
         if (imm8 & (1<<7)) putYMMRegLane128(rG, 1, mkV128(0));
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   30707 
   30708    case 0x08:
   30709       /* VROUNDPS imm8, xmm2/m128, xmm1 */
   30710       /* VROUNDPS = VEX.NDS.128.66.0F3A.WIG 08 ib */
   30711       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   30712          UChar  modrm = getUChar(delta);
   30713          UInt   rG    = gregOfRexRM(pfx, modrm);
   30714          IRTemp src   = newTemp(Ity_V128);
   30715          IRTemp s0    = IRTemp_INVALID;
   30716          IRTemp s1    = IRTemp_INVALID;
   30717          IRTemp s2    = IRTemp_INVALID;
   30718          IRTemp s3    = IRTemp_INVALID;
   30719          IRTemp rm    = newTemp(Ity_I32);
   30720          Int    imm   = 0;
   30721 
   30722          modrm = getUChar(delta);
   30723 
   30724          if (epartIsReg(modrm)) {
   30725             UInt rE = eregOfRexRM(pfx, modrm);
   30726             assign( src, getXMMReg( rE ) );
   30727             imm = getUChar(delta+1);
   30728             if (imm & ~15) break;
   30729             delta += 1+1;
   30730             DIP( "vroundps $%d,%s,%s\n", imm, nameXMMReg(rE), nameXMMReg(rG) );
   30731          } else {
   30732             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30733             assign( src, loadLE(Ity_V128, mkexpr(addr) ) );
   30734             imm = getUChar(delta+alen);
   30735             if (imm & ~15) break;
   30736             delta += alen+1;
   30737             DIP( "vroundps $%d,%s,%s\n", imm, dis_buf, nameXMMReg(rG) );
   30738          }
   30739 
   30740          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   30741             that encoding is the same as the encoding for IRRoundingMode,
   30742             we can use that value directly in the IR as a rounding
   30743             mode. */
   30744          assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
   30745 
   30746          breakupV128to32s( src, &s3, &s2, &s1, &s0 );
   30747          putYMMRegLane128( rG, 1, mkV128(0) );
   30748 #        define CVT(s) binop(Iop_RoundF32toInt, mkexpr(rm), \
   30749                              unop(Iop_ReinterpI32asF32, mkexpr(s)))
   30750          putYMMRegLane32F( rG, 3, CVT(s3) );
   30751          putYMMRegLane32F( rG, 2, CVT(s2) );
   30752          putYMMRegLane32F( rG, 1, CVT(s1) );
   30753          putYMMRegLane32F( rG, 0, CVT(s0) );
   30754 #        undef CVT
   30755          goto decode_success;
   30756       }
   30757       /* VROUNDPS imm8, ymm2/m256, ymm1 */
   30758       /* VROUNDPS = VEX.NDS.256.66.0F3A.WIG 08 ib */
   30759       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   30760          UChar  modrm = getUChar(delta);
   30761          UInt   rG    = gregOfRexRM(pfx, modrm);
   30762          IRTemp src   = newTemp(Ity_V256);
   30763          IRTemp s0    = IRTemp_INVALID;
   30764          IRTemp s1    = IRTemp_INVALID;
   30765          IRTemp s2    = IRTemp_INVALID;
   30766          IRTemp s3    = IRTemp_INVALID;
   30767          IRTemp s4    = IRTemp_INVALID;
   30768          IRTemp s5    = IRTemp_INVALID;
   30769          IRTemp s6    = IRTemp_INVALID;
   30770          IRTemp s7    = IRTemp_INVALID;
   30771          IRTemp rm    = newTemp(Ity_I32);
   30772          Int    imm   = 0;
   30773 
   30774          modrm = getUChar(delta);
   30775 
   30776          if (epartIsReg(modrm)) {
   30777             UInt rE = eregOfRexRM(pfx, modrm);
   30778             assign( src, getYMMReg( rE ) );
   30779             imm = getUChar(delta+1);
   30780             if (imm & ~15) break;
   30781             delta += 1+1;
   30782             DIP( "vroundps $%d,%s,%s\n", imm, nameYMMReg(rE), nameYMMReg(rG) );
   30783          } else {
   30784             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30785             assign( src, loadLE(Ity_V256, mkexpr(addr) ) );
   30786             imm = getUChar(delta+alen);
   30787             if (imm & ~15) break;
   30788             delta += alen+1;
   30789             DIP( "vroundps $%d,%s,%s\n", imm, dis_buf, nameYMMReg(rG) );
   30790          }
   30791 
   30792          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   30793             that encoding is the same as the encoding for IRRoundingMode,
   30794             we can use that value directly in the IR as a rounding
   30795             mode. */
   30796          assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
   30797 
   30798          breakupV256to32s( src, &s7, &s6, &s5, &s4, &s3, &s2, &s1, &s0 );
   30799 #        define CVT(s) binop(Iop_RoundF32toInt, mkexpr(rm), \
   30800                              unop(Iop_ReinterpI32asF32, mkexpr(s)))
   30801          putYMMRegLane32F( rG, 7, CVT(s7) );
   30802          putYMMRegLane32F( rG, 6, CVT(s6) );
   30803          putYMMRegLane32F( rG, 5, CVT(s5) );
   30804          putYMMRegLane32F( rG, 4, CVT(s4) );
   30805          putYMMRegLane32F( rG, 3, CVT(s3) );
   30806          putYMMRegLane32F( rG, 2, CVT(s2) );
   30807          putYMMRegLane32F( rG, 1, CVT(s1) );
   30808          putYMMRegLane32F( rG, 0, CVT(s0) );
   30809 #        undef CVT
   30810          goto decode_success;
   30811       }
   30812 
   30813    case 0x09:
   30814       /* VROUNDPD imm8, xmm2/m128, xmm1 */
   30815       /* VROUNDPD = VEX.NDS.128.66.0F3A.WIG 09 ib */
   30816       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   30817          UChar  modrm = getUChar(delta);
   30818          UInt   rG    = gregOfRexRM(pfx, modrm);
   30819          IRTemp src   = newTemp(Ity_V128);
   30820          IRTemp s0    = IRTemp_INVALID;
   30821          IRTemp s1    = IRTemp_INVALID;
   30822          IRTemp rm    = newTemp(Ity_I32);
   30823          Int    imm   = 0;
   30824 
   30825          modrm = getUChar(delta);
   30826 
   30827          if (epartIsReg(modrm)) {
   30828             UInt rE = eregOfRexRM(pfx, modrm);
   30829             assign( src, getXMMReg( rE ) );
   30830             imm = getUChar(delta+1);
   30831             if (imm & ~15) break;
   30832             delta += 1+1;
   30833             DIP( "vroundpd $%d,%s,%s\n", imm, nameXMMReg(rE), nameXMMReg(rG) );
   30834          } else {
   30835             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30836             assign( src, loadLE(Ity_V128, mkexpr(addr) ) );
   30837             imm = getUChar(delta+alen);
   30838             if (imm & ~15) break;
   30839             delta += alen+1;
   30840             DIP( "vroundpd $%d,%s,%s\n", imm, dis_buf, nameXMMReg(rG) );
   30841          }
   30842 
   30843          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   30844             that encoding is the same as the encoding for IRRoundingMode,
   30845             we can use that value directly in the IR as a rounding
   30846             mode. */
   30847          assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
   30848 
   30849          breakupV128to64s( src, &s1, &s0 );
   30850          putYMMRegLane128( rG, 1, mkV128(0) );
   30851 #        define CVT(s) binop(Iop_RoundF64toInt, mkexpr(rm), \
   30852                              unop(Iop_ReinterpI64asF64, mkexpr(s)))
   30853          putYMMRegLane64F( rG, 1, CVT(s1) );
   30854          putYMMRegLane64F( rG, 0, CVT(s0) );
   30855 #        undef CVT
   30856          goto decode_success;
   30857       }
   30858       /* VROUNDPD imm8, ymm2/m256, ymm1 */
   30859       /* VROUNDPD = VEX.NDS.256.66.0F3A.WIG 09 ib */
   30860       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   30861          UChar  modrm = getUChar(delta);
   30862          UInt   rG    = gregOfRexRM(pfx, modrm);
   30863          IRTemp src   = newTemp(Ity_V256);
   30864          IRTemp s0    = IRTemp_INVALID;
   30865          IRTemp s1    = IRTemp_INVALID;
   30866          IRTemp s2    = IRTemp_INVALID;
   30867          IRTemp s3    = IRTemp_INVALID;
   30868          IRTemp rm    = newTemp(Ity_I32);
   30869          Int    imm   = 0;
   30870 
   30871          modrm = getUChar(delta);
   30872 
   30873          if (epartIsReg(modrm)) {
   30874             UInt rE = eregOfRexRM(pfx, modrm);
   30875             assign( src, getYMMReg( rE ) );
   30876             imm = getUChar(delta+1);
   30877             if (imm & ~15) break;
   30878             delta += 1+1;
   30879             DIP( "vroundpd $%d,%s,%s\n", imm, nameYMMReg(rE), nameYMMReg(rG) );
   30880          } else {
   30881             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30882             assign( src, loadLE(Ity_V256, mkexpr(addr) ) );
   30883             imm = getUChar(delta+alen);
   30884             if (imm & ~15) break;
   30885             delta += alen+1;
   30886             DIP( "vroundps $%d,%s,%s\n", imm, dis_buf, nameYMMReg(rG) );
   30887          }
   30888 
   30889          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   30890             that encoding is the same as the encoding for IRRoundingMode,
   30891             we can use that value directly in the IR as a rounding
   30892             mode. */
   30893          assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
   30894 
   30895          breakupV256to64s( src, &s3, &s2, &s1, &s0 );
   30896 #        define CVT(s) binop(Iop_RoundF64toInt, mkexpr(rm), \
   30897                              unop(Iop_ReinterpI64asF64, mkexpr(s)))
   30898          putYMMRegLane64F( rG, 3, CVT(s3) );
   30899          putYMMRegLane64F( rG, 2, CVT(s2) );
   30900          putYMMRegLane64F( rG, 1, CVT(s1) );
   30901          putYMMRegLane64F( rG, 0, CVT(s0) );
   30902 #        undef CVT
   30903          goto decode_success;
   30904       }
   30905 
   case 0x0A:
   case 0x0B:
      /* VROUNDSS imm8, xmm3/m32, xmm2, xmm1 */
      /* VROUNDSS = VEX.NDS.128.66.0F3A.WIG 0A ib */
      /* VROUNDSD imm8, xmm3/m64, xmm2, xmm1 */
      /* VROUNDSD = VEX.NDS.128.66.0F3A.WIG 0B ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         /* opc 0x0B is the double-precision (SD) form. */
         Bool   isD   = opc == 0x0B;
         IRTemp src   = newTemp(isD ? Ity_F64 : Ity_F32);
         IRTemp res   = newTemp(isD ? Ity_F64 : Ity_F32);
         Int    imm   = 0;

         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            assign( src,
                    isD ? getXMMRegLane64F(rE, 0) : getXMMRegLane32F(rE, 0) );
            imm = getUChar(delta+1);
            /* Reserved imm8 bits set => not decodable. */
            if (imm & ~15) break;
            delta += 1+1;
            DIP( "vrounds%c $%d,%s,%s,%s\n",
                 isD ? 'd' : 's',
                 imm, nameXMMReg( rE ), nameXMMReg( rV ), nameXMMReg( rG ) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
            imm = getUChar(delta+alen);
            if (imm & ~15) break;
            delta += alen+1;
            DIP( "vrounds%c $%d,%s,%s,%s\n",
                 isD ? 'd' : 's',
                 imm, dis_buf, nameXMMReg( rV ), nameXMMReg( rG ) );
         }

         /* (imm & 3) contains an Intel-encoded rounding mode.  Because
            that encoding is the same as the encoding for IRRoundingMode,
            we can use that value directly in the IR as a rounding
            mode. */
         assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
                           (imm & 4) ? get_sse_roundingmode()
                                     : mkU32(imm & 3),
                           mkexpr(src)) );

         /* Low lane gets the rounded result; the remaining low-128 lanes
            are copied from rV, and the upper YMM lane is zeroed. */
         if (isD)
            putXMMRegLane64F( rG, 0, mkexpr(res) );
         else {
            putXMMRegLane32F( rG, 0, mkexpr(res) );
            putXMMRegLane32F( rG, 1, getXMMRegLane32F( rV, 1 ) );
         }
         putXMMRegLane64F( rG, 1, getXMMRegLane64F( rV, 1 ) );
         putYMMRegLane128( rG, 1, mkV128(0) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   30963 
   case 0x0C:
      /* VBLENDPS imm8, ymm3/m256, ymm2, ymm1 */
      /* VBLENDPS = VEX.NDS.256.66.0F3A.WIG 0C /r ib */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar  modrm = getUChar(delta);
         UInt   imm8;
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp sV    = newTemp(Ity_V256);
         IRTemp sE    = newTemp(Ity_V256);
         assign ( sV, getYMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            imm8 = getUChar(delta);
            DIP("vblendps $%u,%s,%s,%s\n",
                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
            assign(sE, getYMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += alen;
            imm8 = getUChar(delta);
            DIP("vblendps $%u,%s,%s,%s\n",
                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
            assign(sE, loadLE(Ity_V256, mkexpr(addr)));
         }
         delta++; /* consume the imm8 byte */
         /* Per-lane select between sE and sV is done by the helper. */
         putYMMReg( rG,
                    mkexpr( math_BLENDPS_256( sE, sV, imm8) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VBLENDPS imm8, xmm3/m128, xmm2, xmm1 */
      /* VBLENDPS = VEX.NDS.128.66.0F3A.WIG 0C /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   imm8;
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp sV    = newTemp(Ity_V128);
         IRTemp sE    = newTemp(Ity_V128);
         assign ( sV, getXMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            imm8 = getUChar(delta);
            DIP("vblendps $%u,%s,%s,%s\n",
                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
            assign(sE, getXMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += alen;
            imm8 = getUChar(delta);
            DIP("vblendps $%u,%s,%s,%s\n",
                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
            assign(sE, loadLE(Ity_V128, mkexpr(addr)));
         }
         delta++; /* consume the imm8 byte */
         /* 128-bit form writes the low lane and zeroes the upper lane. */
         putYMMRegLoAndZU( rG,
                           mkexpr( math_BLENDPS_128( sE, sV, imm8) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   31028 
   case 0x0D:
      /* VBLENDPD imm8, ymm3/m256, ymm2, ymm1 */
      /* VBLENDPD = VEX.NDS.256.66.0F3A.WIG 0D /r ib */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar  modrm = getUChar(delta);
         UInt   imm8;
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp sV    = newTemp(Ity_V256);
         IRTemp sE    = newTemp(Ity_V256);
         assign ( sV, getYMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            imm8 = getUChar(delta);
            DIP("vblendpd $%u,%s,%s,%s\n",
                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
            assign(sE, getYMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += alen;
            imm8 = getUChar(delta);
            DIP("vblendpd $%u,%s,%s,%s\n",
                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
            assign(sE, loadLE(Ity_V256, mkexpr(addr)));
         }
         delta++; /* consume the imm8 byte */
         /* Per-lane select between sE and sV is done by the helper. */
         putYMMReg( rG,
                    mkexpr( math_BLENDPD_256( sE, sV, imm8) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VBLENDPD imm8, xmm3/m128, xmm2, xmm1 */
      /* VBLENDPD = VEX.NDS.128.66.0F3A.WIG 0D /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   imm8;
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp sV    = newTemp(Ity_V128);
         IRTemp sE    = newTemp(Ity_V128);
         assign ( sV, getXMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            imm8 = getUChar(delta);
            DIP("vblendpd $%u,%s,%s,%s\n",
                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
            assign(sE, getXMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += alen;
            imm8 = getUChar(delta);
            DIP("vblendpd $%u,%s,%s,%s\n",
                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
            assign(sE, loadLE(Ity_V128, mkexpr(addr)));
         }
         delta++; /* consume the imm8 byte */
         /* 128-bit form writes the low lane and zeroes the upper lane. */
         putYMMRegLoAndZU( rG,
                           mkexpr( math_BLENDPD_128( sE, sV, imm8) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   31093 
   case 0x0E:
      /* VPBLENDW imm8, xmm3/m128, xmm2, xmm1 */
      /* VPBLENDW = VEX.NDS.128.66.0F3A.WIG 0E /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   imm8;
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp sV    = newTemp(Ity_V128);
         IRTemp sE    = newTemp(Ity_V128);
         assign ( sV, getXMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            imm8 = getUChar(delta);
            DIP("vpblendw $%u,%s,%s,%s\n",
                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
            assign(sE, getXMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += alen;
            imm8 = getUChar(delta);
            DIP("vpblendw $%u,%s,%s,%s\n",
                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
            assign(sE, loadLE(Ity_V128, mkexpr(addr)));
         }
         delta++; /* consume the imm8 byte */
         /* 128-bit form writes the low lane and zeroes the upper lane. */
         putYMMRegLoAndZU( rG,
                           mkexpr( math_PBLENDW_128( sE, sV, imm8) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPBLENDW imm8, ymm3/m256, ymm2, ymm1 */
      /* VPBLENDW = VEX.NDS.256.66.0F3A.WIG 0E /r ib */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar  modrm = getUChar(delta);
         UInt   imm8;
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp sV    = newTemp(Ity_V256);
         IRTemp sE    = newTemp(Ity_V256);
         IRTemp sVhi, sVlo, sEhi, sElo;
         sVhi = sVlo = sEhi = sElo = IRTemp_INVALID;
         assign ( sV, getYMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            imm8 = getUChar(delta);
            DIP("vpblendw $%u,%s,%s,%s\n",
                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
            assign(sE, getYMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += alen;
            imm8 = getUChar(delta);
            DIP("vpblendw $%u,%s,%s,%s\n",
                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
            assign(sE, loadLE(Ity_V256, mkexpr(addr)));
         }
         delta++; /* consume the imm8 byte */
         /* 256-bit form: split into 128-bit halves and apply the same
            imm8 blend to each half. */
         breakupV256toV128s( sV, &sVhi, &sVlo );
         breakupV256toV128s( sE, &sEhi, &sElo );
         putYMMReg( rG, binop( Iop_V128HLtoV256,
                               mkexpr( math_PBLENDW_128( sEhi, sVhi, imm8) ),
                               mkexpr( math_PBLENDW_128( sElo, sVlo, imm8) ) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   31163 
   case 0x0F:
      /* VPALIGNR imm8, xmm3/m128, xmm2, xmm1 */
      /* VPALIGNR = VEX.NDS.128.66.0F3A.WIG 0F /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp sV    = newTemp(Ity_V128);
         IRTemp dV    = newTemp(Ity_V128);
         UInt   imm8;

         assign( dV, getXMMReg(rV) );

         if ( epartIsReg( modrm ) ) {
            UInt   rE = eregOfRexRM(pfx, modrm);
            assign( sV, getXMMReg(rE) );
            imm8 = getUChar(delta+1);
            delta += 1+1;
            DIP("vpalignr $%u,%s,%s,%s\n", imm8, nameXMMReg(rE),
                                           nameXMMReg(rV), nameXMMReg(rG));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            imm8 = getUChar(delta+alen);
            delta += alen+1;
            DIP("vpalignr $%u,%s,%s,%s\n", imm8, dis_buf,
                                           nameXMMReg(rV), nameXMMReg(rG));
         }

         /* 128-bit form writes the low lane and zeroes the upper lane. */
         IRTemp res = math_PALIGNR_XMM( sV, dV, imm8 );
         putYMMRegLoAndZU( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPALIGNR imm8, ymm3/m256, ymm2, ymm1 */
      /* VPALIGNR = VEX.NDS.256.66.0F3A.WIG 0F /r ib */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp sV    = newTemp(Ity_V256);
         IRTemp dV    = newTemp(Ity_V256);
         IRTemp sHi, sLo, dHi, dLo;
         sHi = sLo = dHi = dLo = IRTemp_INVALID;
         UInt   imm8;

         assign( dV, getYMMReg(rV) );

         if ( epartIsReg( modrm ) ) {
            UInt   rE = eregOfRexRM(pfx, modrm);
            assign( sV, getYMMReg(rE) );
            imm8 = getUChar(delta+1);
            delta += 1+1;
            DIP("vpalignr $%u,%s,%s,%s\n", imm8, nameYMMReg(rE),
                                           nameYMMReg(rV), nameYMMReg(rG));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
            imm8 = getUChar(delta+alen);
            delta += alen+1;
            DIP("vpalignr $%u,%s,%s,%s\n", imm8, dis_buf,
                                           nameYMMReg(rV), nameYMMReg(rG));
         }

         /* 256-bit form: each 128-bit half is aligned independently with
            the same imm8. */
         breakupV256toV128s( dV, &dHi, &dLo );
         breakupV256toV128s( sV, &sHi, &sLo );
         putYMMReg( rG, binop( Iop_V128HLtoV256,
                               mkexpr( math_PALIGNR_XMM( sHi, dHi, imm8 ) ),
                               mkexpr( math_PALIGNR_XMM( sLo, dLo, imm8 ) ) )
                    );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   31238 
   31239    case 0x14:
   31240       /* VPEXTRB imm8, xmm2, reg/m8 = VEX.128.66.0F3A.W0 14 /r ib */
   31241       if (have66noF2noF3(pfx)
   31242           && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
   31243          delta = dis_PEXTRB_128_GtoE( vbi, pfx, delta, False/*!isAvx*/ );
   31244          goto decode_success;
   31245       }
   31246       break;
   31247 
   31248    case 0x15:
   31249       /* VPEXTRW imm8, reg/m16, xmm2 */
   31250       /* VPEXTRW = VEX.128.66.0F3A.W0 15 /r ib */
   31251       if (have66noF2noF3(pfx)
   31252           && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
   31253          delta = dis_PEXTRW( vbi, pfx, delta, True/*isAvx*/ );
   31254          goto decode_success;
   31255       }
   31256       break;
   31257 
   31258    case 0x16:
   31259       /* VPEXTRD imm8, r32/m32, xmm2 */
   31260       /* VPEXTRD = VEX.128.66.0F3A.W0 16 /r ib */
   31261       if (have66noF2noF3(pfx)
   31262           && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
   31263          delta = dis_PEXTRD( vbi, pfx, delta, True/*isAvx*/ );
   31264          goto decode_success;
   31265       }
   31266       /* VPEXTRQ = VEX.128.66.0F3A.W1 16 /r ib */
   31267       if (have66noF2noF3(pfx)
   31268           && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
   31269          delta = dis_PEXTRQ( vbi, pfx, delta, True/*isAvx*/ );
   31270          goto decode_success;
   31271       }
   31272       break;
   31273 
   31274    case 0x17:
   31275       /* VEXTRACTPS imm8, xmm1, r32/m32 = VEX.128.66.0F3A.WIG 17 /r ib */
   31276       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   31277          delta = dis_EXTRACTPS( vbi, pfx, delta, True/*isAvx*/ );
   31278          goto decode_success;
   31279       }
   31280       break;
   31281 
   case 0x18:
      /* VINSERTF128 r/m, rV, rD
         ::: rD = insertinto(a lane in rV, 128 bits from r/m) */
      /* VINSERTF128 = VEX.NDS.256.66.0F3A.W0 18 /r ib */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   ib    = 0;
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp t128  = newTemp(Ity_V128);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            assign(t128, getXMMReg(rE));
            ib = getUChar(delta);
            DIP("vinsertf128 $%u,%s,%s,%s\n",
                ib, nameXMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign(t128, loadLE(Ity_V128, mkexpr(addr)));
            delta += alen;
            ib = getUChar(delta);
            DIP("vinsertf128 $%u,%s,%s,%s\n",
                ib, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
         }
         delta++; /* consume the imm8 byte */
         /* Copy both lanes of rV, then overwrite the lane selected by
            imm8 bit 0 with the new 128-bit value. */
         putYMMRegLane128(rG, 0,   getYMMRegLane128(rV, 0));
         putYMMRegLane128(rG, 1,   getYMMRegLane128(rV, 1));
         putYMMRegLane128(rG, ib & 1, mkexpr(t128));
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   31316 
   31317    case 0x19:
   31318      /* VEXTRACTF128 $lane_no, rS, r/m
   31319         ::: r/m:V128 = a lane of rS:V256 (RM format) */
   31320      /* VEXTRACTF128 = VEX.256.66.0F3A.W0 19 /r ib */
   31321       if (have66noF2noF3(pfx)
   31322           && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
   31323          UChar  modrm = getUChar(delta);
   31324          UInt   ib    = 0;
   31325          UInt   rS    = gregOfRexRM(pfx, modrm);
   31326          IRTemp t128  = newTemp(Ity_V128);
   31327          if (epartIsReg(modrm)) {
   31328             UInt rD = eregOfRexRM(pfx, modrm);
   31329             delta += 1;
   31330             ib = getUChar(delta);
   31331             assign(t128, getYMMRegLane128(rS, ib & 1));
   31332             putYMMRegLoAndZU(rD, mkexpr(t128));
   31333             DIP("vextractf128 $%u,%s,%s\n",
   31334                 ib, nameXMMReg(rS), nameYMMReg(rD));
   31335          } else {
   31336             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   31337             delta += alen;
   31338             ib = getUChar(delta);
   31339             assign(t128, getYMMRegLane128(rS, ib & 1));
   31340             storeLE(mkexpr(addr), mkexpr(t128));
   31341             DIP("vextractf128 $%u,%s,%s\n",
   31342                 ib, nameYMMReg(rS), dis_buf);
   31343          }
   31344          delta++;
   31345          /* doesn't use vvvv */
   31346          goto decode_success;
   31347       }
   31348       break;
   31349 
   case 0x20:
      /* VPINSRB r32/m8, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W0 20 /r ib */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm  = getUChar(delta);
         UInt   rG     = gregOfRexRM(pfx, modrm);
         UInt   rV     = getVexNvvvv(pfx);
         Int    imm8;
         IRTemp src_u8 = newTemp(Ity_I8);

         if ( epartIsReg( modrm ) ) {
            UInt rE = eregOfRexRM(pfx,modrm);
            /* Only the low 4 bits of imm8 select a byte lane. */
            imm8 = (Int)(getUChar(delta+1) & 15);
            assign( src_u8, unop(Iop_32to8, getIReg32( rE )) );
            delta += 1+1;
            DIP( "vpinsrb $%d,%s,%s,%s\n",
                 imm8, nameIReg32(rE), nameXMMReg(rV), nameXMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            imm8 = (Int)(getUChar(delta+alen) & 15);
            assign( src_u8, loadLE( Ity_I8, mkexpr(addr) ) );
            delta += alen+1;
            DIP( "vpinsrb $%d,%s,%s,%s\n",
                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
         }

         /* Insert the byte into a copy of rV; write the result into the
            low lane of rG and zero the upper lane. */
         IRTemp src_vec = newTemp(Ity_V128);
         assign(src_vec, getXMMReg( rV ));
         IRTemp res_vec = math_PINSRB_128( src_vec, src_u8, imm8 );
         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   31384 
   31385    case 0x21:
   31386       /* VINSERTPS imm8, xmm3/m32, xmm2, xmm1
   31387          = VEX.NDS.128.66.0F3A.WIG 21 /r ib */
   31388       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   31389          UChar  modrm = getUChar(delta);
   31390          UInt   rG    = gregOfRexRM(pfx, modrm);
   31391          UInt   rV    = getVexNvvvv(pfx);
   31392          UInt   imm8;
   31393          IRTemp d2ins = newTemp(Ity_I32); /* comes from the E part */
   31394          const IRTemp inval = IRTemp_INVALID;
   31395 
   31396          if ( epartIsReg( modrm ) ) {
   31397             UInt   rE = eregOfRexRM(pfx, modrm);
   31398             IRTemp vE = newTemp(Ity_V128);
   31399             assign( vE, getXMMReg(rE) );
   31400             IRTemp dsE[4] = { inval, inval, inval, inval };
   31401             breakupV128to32s( vE, &dsE[3], &dsE[2], &dsE[1], &dsE[0] );
   31402             imm8 = getUChar(delta+1);
   31403             d2ins = dsE[(imm8 >> 6) & 3]; /* "imm8_count_s" */
   31404             delta += 1+1;
   31405             DIP( "insertps $%u, %s,%s\n",
   31406                  imm8, nameXMMReg(rE), nameXMMReg(rG) );
   31407          } else {
   31408             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   31409             assign( d2ins, loadLE( Ity_I32, mkexpr(addr) ) );
   31410             imm8 = getUChar(delta+alen);
   31411             delta += alen+1;
   31412             DIP( "insertps $%u, %s,%s\n",
   31413                  imm8, dis_buf, nameXMMReg(rG) );
   31414          }
   31415 
   31416          IRTemp vV = newTemp(Ity_V128);
   31417          assign( vV, getXMMReg(rV) );
   31418 
   31419          putYMMRegLoAndZU( rG, mkexpr(math_INSERTPS( vV, d2ins, imm8 )) );
   31420          *uses_vvvv = True;
   31421          goto decode_success;
   31422       }
   31423       break;
   31424 
   31425    case 0x22:
   31426       /* VPINSRD r32/m32, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W0 22 /r ib */
   31427       if (have66noF2noF3(pfx)
   31428           && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
   31429          UChar  modrm = getUChar(delta);
   31430          UInt   rG    = gregOfRexRM(pfx, modrm);
   31431          UInt   rV    = getVexNvvvv(pfx);
   31432          Int    imm8_10;
   31433          IRTemp src_u32 = newTemp(Ity_I32);
   31434 
   31435          if ( epartIsReg( modrm ) ) {
   31436             UInt rE = eregOfRexRM(pfx,modrm);
   31437             imm8_10 = (Int)(getUChar(delta+1) & 3);
   31438             assign( src_u32, getIReg32( rE ) );
   31439             delta += 1+1;
   31440             DIP( "vpinsrd $%d,%s,%s,%s\n",
   31441                  imm8_10, nameIReg32(rE), nameXMMReg(rV), nameXMMReg(rG) );
   31442          } else {
   31443             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   31444             imm8_10 = (Int)(getUChar(delta+alen) & 3);
   31445             assign( src_u32, loadLE( Ity_I32, mkexpr(addr) ) );
   31446             delta += alen+1;
   31447             DIP( "vpinsrd $%d,%s,%s,%s\n",
   31448                  imm8_10, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
   31449          }
   31450 
   31451          IRTemp src_vec = newTemp(Ity_V128);
   31452          assign(src_vec, getXMMReg( rV ));
   31453          IRTemp res_vec = math_PINSRD_128( src_vec, src_u32, imm8_10 );
   31454          putYMMRegLoAndZU( rG, mkexpr(res_vec) );
   31455          *uses_vvvv = True;
   31456          goto decode_success;
   31457       }
   31458       /* VPINSRQ r64/m64, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W1 22 /r ib */
   31459       if (have66noF2noF3(pfx)
   31460           && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
   31461          UChar  modrm = getUChar(delta);
   31462          UInt   rG    = gregOfRexRM(pfx, modrm);
   31463          UInt   rV    = getVexNvvvv(pfx);
   31464          Int    imm8_0;
   31465          IRTemp src_u64 = newTemp(Ity_I64);
   31466 
   31467          if ( epartIsReg( modrm ) ) {
   31468             UInt rE = eregOfRexRM(pfx,modrm);
   31469             imm8_0 = (Int)(getUChar(delta+1) & 1);
   31470             assign( src_u64, getIReg64( rE ) );
   31471             delta += 1+1;
   31472             DIP( "vpinsrq $%d,%s,%s,%s\n",
   31473                  imm8_0, nameIReg64(rE), nameXMMReg(rV), nameXMMReg(rG) );
   31474          } else {
   31475             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   31476             imm8_0 = (Int)(getUChar(delta+alen) & 1);
   31477             assign( src_u64, loadLE( Ity_I64, mkexpr(addr) ) );
   31478             delta += alen+1;
   31479             DIP( "vpinsrd $%d,%s,%s,%s\n",
   31480                  imm8_0, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
   31481          }
   31482 
   31483          IRTemp src_vec = newTemp(Ity_V128);
   31484          assign(src_vec, getXMMReg( rV ));
   31485          IRTemp res_vec = math_PINSRQ_128( src_vec, src_u64, imm8_0 );
   31486          putYMMRegLoAndZU( rG, mkexpr(res_vec) );
   31487          *uses_vvvv = True;
   31488          goto decode_success;
   31489       }
   31490       break;
   31491 
   case 0x38:
      /* VINSERTI128 r/m, rV, rD
         ::: rD = insertinto(a lane in rV, 128 bits from r/m) */
      /* VINSERTI128 = VEX.NDS.256.66.0F3A.W0 38 /r ib */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   ib    = 0;                       /* imm8 lane selector */
         UInt   rG    = gregOfRexRM(pfx, modrm); /* YMM destination */
         UInt   rV    = getVexNvvvv(pfx);        /* YMM first source */
         IRTemp t128  = newTemp(Ity_V128);       /* 128 bits to insert */
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            assign(t128, getXMMReg(rE));
            ib = getUChar(delta);
            DIP("vinserti128 $%u,%s,%s,%s\n",
                ib, nameXMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign(t128, loadLE(Ity_V128, mkexpr(addr)));
            delta += alen;
            ib = getUChar(delta);
            DIP("vinserti128 $%u,%s,%s,%s\n",
                ib, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
         }
         delta++; /* skip the trailing imm8 byte */
         /* Copy both lanes of rV into rG, then overwrite the lane
            selected by imm8 bit 0 with the new 128-bit value. */
         putYMMRegLane128(rG, 0,   getYMMRegLane128(rV, 0));
         putYMMRegLane128(rG, 1,   getYMMRegLane128(rV, 1));
         putYMMRegLane128(rG, ib & 1, mkexpr(t128));
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   31526 
   31527    case 0x39:
   31528       /* VEXTRACTI128 $lane_no, rS, r/m
   31529          ::: r/m:V128 = a lane of rS:V256 (RM format) */
   31530       /* VEXTRACTI128 = VEX.256.66.0F3A.W0 39 /r ib */
   31531       if (have66noF2noF3(pfx)
   31532           && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
   31533          UChar  modrm = getUChar(delta);
   31534          UInt   ib    = 0;
   31535          UInt   rS    = gregOfRexRM(pfx, modrm);
   31536          IRTemp t128  = newTemp(Ity_V128);
   31537          if (epartIsReg(modrm)) {
   31538             UInt rD = eregOfRexRM(pfx, modrm);
   31539             delta += 1;
   31540             ib = getUChar(delta);
   31541             assign(t128, getYMMRegLane128(rS, ib & 1));
   31542             putYMMRegLoAndZU(rD, mkexpr(t128));
   31543             DIP("vextracti128 $%u,%s,%s\n",
   31544                 ib, nameXMMReg(rS), nameYMMReg(rD));
   31545          } else {
   31546             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   31547             delta += alen;
   31548             ib = getUChar(delta);
   31549             assign(t128, getYMMRegLane128(rS, ib & 1));
   31550             storeLE(mkexpr(addr), mkexpr(t128));
   31551             DIP("vextracti128 $%u,%s,%s\n",
   31552                 ib, nameYMMReg(rS), dis_buf);
   31553          }
   31554          delta++;
   31555          /* doesn't use vvvv */
   31556          goto decode_success;
   31557       }
   31558       break;
   31559 
   case 0x40:
      /* VDPPS imm8, xmm3/m128,xmm2,xmm1 = VEX.NDS.128.66.0F3A.WIG 40 /r ib */
      /* Dot product of packed singles; imm8 supplies the input and
         broadcast masks (handled inside math_DPPS_128). */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm   = getUChar(delta);
         UInt   rG      = gregOfRexRM(pfx, modrm); /* destination */
         UInt   rV      = getVexNvvvv(pfx);        /* first source (vvvv) */
         IRTemp dst_vec = newTemp(Ity_V128);       /* second source (E) */
         Int    imm8;
         if (epartIsReg( modrm )) {
            UInt rE = eregOfRexRM(pfx,modrm);
            imm8 = (Int)getUChar(delta+1);
            assign( dst_vec, getXMMReg( rE ) );
            delta += 1+1; /* modrm byte + imm8 byte */
            DIP( "vdpps $%d,%s,%s,%s\n",
                 imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            imm8 = (Int)getUChar(delta+alen);
            assign( dst_vec, loadLE( Ity_V128, mkexpr(addr) ) );
            delta += alen+1; /* amode + imm8 byte */
            DIP( "vdpps $%d,%s,%s,%s\n",
                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
         }

         IRTemp src_vec = newTemp(Ity_V128);
         assign(src_vec, getXMMReg( rV ));
         IRTemp res_vec = math_DPPS_128( src_vec, dst_vec, imm8 );
         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VDPPS imm8, ymm3/m128,ymm2,ymm1 = VEX.NDS.256.66.0F3A.WIG 40 /r ib */
      /* 256-bit form: performed as two independent 128-bit VDPPS
         operations, one per lane, with the same imm8. */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar  modrm   = getUChar(delta);
         UInt   rG      = gregOfRexRM(pfx, modrm); /* destination */
         UInt   rV      = getVexNvvvv(pfx);        /* first source (vvvv) */
         IRTemp dst_vec = newTemp(Ity_V256);       /* second source (E) */
         Int    imm8;
         if (epartIsReg( modrm )) {
            UInt rE = eregOfRexRM(pfx,modrm);
            imm8 = (Int)getUChar(delta+1);
            assign( dst_vec, getYMMReg( rE ) );
            delta += 1+1; /* modrm byte + imm8 byte */
            DIP( "vdpps $%d,%s,%s,%s\n",
                 imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            imm8 = (Int)getUChar(delta+alen);
            assign( dst_vec, loadLE( Ity_V256, mkexpr(addr) ) );
            delta += alen+1; /* amode + imm8 byte */
            DIP( "vdpps $%d,%s,%s,%s\n",
                 imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
         }

         IRTemp src_vec = newTemp(Ity_V256);
         assign(src_vec, getYMMReg( rV ));
         IRTemp s0, s1, d0, d1;
         s0 = s1 = d0 = d1 = IRTemp_INVALID;
         /* Split each 256-bit operand into hi/lo 128-bit halves. */
         breakupV256toV128s( dst_vec, &d1, &d0 );
         breakupV256toV128s( src_vec, &s1, &s0 );
         putYMMReg( rG, binop( Iop_V128HLtoV256,
                               mkexpr( math_DPPS_128(s1, d1, imm8) ),
                               mkexpr( math_DPPS_128(s0, d0, imm8) ) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0x41:
      /* VDPPD imm8, xmm3/m128,xmm2,xmm1 = VEX.NDS.128.66.0F3A.WIG 41 /r ib */
      /* Dot product of packed doubles; 128-bit form only. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm   = getUChar(delta);
         UInt   rG      = gregOfRexRM(pfx, modrm); /* destination */
         UInt   rV      = getVexNvvvv(pfx);        /* first source (vvvv) */
         IRTemp dst_vec = newTemp(Ity_V128);       /* second source (E) */
         Int    imm8;
         if (epartIsReg( modrm )) {
            UInt rE = eregOfRexRM(pfx,modrm);
            imm8 = (Int)getUChar(delta+1);
            assign( dst_vec, getXMMReg( rE ) );
            delta += 1+1; /* modrm byte + imm8 byte */
            DIP( "vdppd $%d,%s,%s,%s\n",
                 imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            imm8 = (Int)getUChar(delta+alen);
            assign( dst_vec, loadLE( Ity_V128, mkexpr(addr) ) );
            delta += alen+1; /* amode + imm8 byte */
            DIP( "vdppd $%d,%s,%s,%s\n",
                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
         }

         IRTemp src_vec = newTemp(Ity_V128);
         assign(src_vec, getXMMReg( rV ));
         IRTemp res_vec = math_DPPD_128( src_vec, dst_vec, imm8 );
         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   31660 
   case 0x42:
      /* VMPSADBW imm8, xmm3/m128,xmm2,xmm1 */
      /* VMPSADBW = VEX.NDS.128.66.0F3A.WIG 42 /r ib */
      /* Multiple sums of absolute byte differences; the block offsets
         are encoded in imm8 (handled by math_MPSADBW_128). */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm   = getUChar(delta);
         Int    imm8;
         IRTemp src_vec = newTemp(Ity_V128);       /* second source (E) */
         IRTemp dst_vec = newTemp(Ity_V128);       /* first source (vvvv) */
         UInt   rG      = gregOfRexRM(pfx, modrm); /* destination */
         UInt   rV      = getVexNvvvv(pfx);

         assign( dst_vec, getXMMReg(rV) );

         if ( epartIsReg( modrm ) ) {
            UInt rE = eregOfRexRM(pfx, modrm);

            imm8 = (Int)getUChar(delta+1);
            assign( src_vec, getXMMReg(rE) );
            delta += 1+1; /* modrm byte + imm8 byte */
            DIP( "vmpsadbw $%d, %s,%s,%s\n", imm8,
                 nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
                             1/* imm8 is 1 byte after the amode */ );
            assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
            imm8 = (Int)getUChar(delta+alen);
            delta += alen+1; /* amode + imm8 byte */
            DIP( "vmpsadbw $%d, %s,%s,%s\n", imm8,
                 dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
         }

         putYMMRegLoAndZU( rG, mkexpr( math_MPSADBW_128(dst_vec,
                                                        src_vec, imm8) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VMPSADBW imm8, ymm3/m256,ymm2,ymm1 */
      /* VMPSADBW = VEX.NDS.256.66.0F3A.WIG 42 /r ib */
      /* 256-bit form: two independent 128-bit MPSADBW operations, one
         per lane.  The low lane uses imm8 bits [2:0] and the high lane
         uses imm8 bits [5:3], hence the "imm8 >> 3" below. */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar  modrm   = getUChar(delta);
         Int    imm8;
         IRTemp src_vec = newTemp(Ity_V256);       /* second source (E) */
         IRTemp dst_vec = newTemp(Ity_V256);       /* first source (vvvv) */
         UInt   rG      = gregOfRexRM(pfx, modrm); /* destination */
         UInt   rV      = getVexNvvvv(pfx);
         IRTemp sHi, sLo, dHi, dLo;
         sHi = sLo = dHi = dLo = IRTemp_INVALID;

         assign( dst_vec, getYMMReg(rV) );

         if ( epartIsReg( modrm ) ) {
            UInt rE = eregOfRexRM(pfx, modrm);

            imm8 = (Int)getUChar(delta+1);
            assign( src_vec, getYMMReg(rE) );
            delta += 1+1; /* modrm byte + imm8 byte */
            DIP( "vmpsadbw $%d, %s,%s,%s\n", imm8,
                 nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
                             1/* imm8 is 1 byte after the amode */ );
            assign( src_vec, loadLE( Ity_V256, mkexpr(addr) ) );
            imm8 = (Int)getUChar(delta+alen);
            delta += alen+1; /* amode + imm8 byte */
            DIP( "vmpsadbw $%d, %s,%s,%s\n", imm8,
                 dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
         }

         breakupV256toV128s( dst_vec, &dHi, &dLo );
         breakupV256toV128s( src_vec, &sHi, &sLo );
         putYMMReg( rG, binop( Iop_V128HLtoV256,
                               mkexpr( math_MPSADBW_128(dHi, sHi, imm8 >> 3) ),
                               mkexpr( math_MPSADBW_128(dLo, sLo, imm8) ) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   31738 
   case 0x44:
      /* VPCLMULQDQ imm8, xmm3/m128,xmm2,xmm1 */
      /* VPCLMULQDQ = VEX.NDS.128.66.0F3A.WIG 44 /r ib */
      /* 66 0F 3A 44 /r ib = PCLMULQDQ xmm1, xmm2/m128, imm8
       * Carry-less multiplication of selected XMM quadwords into XMM
       * registers (a.k.a multiplication of polynomials over GF(2))
       */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         Int imm8;                              /* selects the quadwords */
         IRTemp sV    = newTemp(Ity_V128);      /* second source (E) */
         IRTemp dV    = newTemp(Ity_V128);      /* first source (vvvv) */
         UInt   rG    = gregOfRexRM(pfx, modrm); /* destination */
         UInt   rV    = getVexNvvvv(pfx);

         assign( dV, getXMMReg(rV) );

         if ( epartIsReg( modrm ) ) {
            UInt rE = eregOfRexRM(pfx, modrm);
            imm8 = (Int)getUChar(delta+1);
            assign( sV, getXMMReg(rE) );
            delta += 1+1; /* modrm byte + imm8 byte */
            DIP( "vpclmulqdq $%d, %s,%s,%s\n", imm8,
                 nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
                             1/* imm8 is 1 byte after the amode */ );
            assign( sV, loadLE( Ity_V128, mkexpr(addr) ) );
            imm8 = (Int)getUChar(delta+alen);
            delta += alen+1; /* amode + imm8 byte */
            DIP( "vpclmulqdq $%d, %s,%s,%s\n",
                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
         }

         putYMMRegLoAndZU( rG, mkexpr( math_PCLMULQDQ(dV, sV, imm8) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   31778 
   case 0x46:
      /* VPERM2I128 imm8, ymm3/m256, ymm2, ymm1 = VEX.NDS.66.0F3A.W0 46 /r ib */
      /* Select each 128-bit lane of the destination from the four source
         lanes (two from rV, two from r/m) as directed by imm8; imm8 bits
         3 and 7 zero the corresponding destination lane instead. */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   imm8  = 0;
         UInt   rG    = gregOfRexRM(pfx, modrm); /* destination */
         UInt   rV    = getVexNvvvv(pfx);        /* first source (vvvv) */
         IRTemp s00   = newTemp(Ity_V128);       /* rV lane 0 */
         IRTemp s01   = newTemp(Ity_V128);       /* rV lane 1 */
         IRTemp s10   = newTemp(Ity_V128);       /* E lane 0 */
         IRTemp s11   = newTemp(Ity_V128);       /* E lane 1 */
         assign(s00, getYMMRegLane128(rV, 0));
         assign(s01, getYMMRegLane128(rV, 1));
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            imm8 = getUChar(delta);
            DIP("vperm2i128 $%u,%s,%s,%s\n",
                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
            assign(s10, getYMMRegLane128(rE, 0));
            assign(s11, getYMMRegLane128(rE, 1));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += alen;
            imm8 = getUChar(delta);
            DIP("vperm2i128 $%u,%s,%s,%s\n",
                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
            /* Load the two 128-bit halves of the memory operand. */
            assign(s10, loadLE(Ity_V128, binop(Iop_Add64,
                                               mkexpr(addr), mkU64(0))));
            assign(s11, loadLE(Ity_V128, binop(Iop_Add64,
                                               mkexpr(addr), mkU64(16))));
         }
         delta++; /* skip the trailing imm8 byte */
         /* SEL maps a 2-bit selector to one of the four source lanes. */
#        define SEL(_nn) (((_nn)==0) ? s00 : ((_nn)==1) ? s01 \
                                           : ((_nn)==2) ? s10 : s11)
         putYMMRegLane128(rG, 0, mkexpr(SEL((imm8 >> 0) & 3)));
         putYMMRegLane128(rG, 1, mkexpr(SEL((imm8 >> 4) & 3)));
#        undef SEL
         /* imm8 bits 3 and 7 force the low/high lane to zero. */
         if (imm8 & (1<<3)) putYMMRegLane128(rG, 0, mkV128(0));
         if (imm8 & (1<<7)) putYMMRegLane128(rG, 1, mkV128(0));
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   31824 
   case 0x4A:
      /* VBLENDVPS xmmG, xmmE/memE, xmmV, xmmIS4
         ::: xmmG:V128 = PBLEND(xmmE, xmmV, xmmIS4) (RMVR) */
      /* VBLENDVPS = VEX.NDS.128.66.0F3A.WIG 4A /r /is4 */
      /* All three blend variants below delegate to the common
         dis_VBLENDV_{128,256} helpers, differing only in element size
         (4/8/1 bytes) and the matching arithmetic-shift op used to
         expand the mask's sign bits. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VBLENDV_128 ( vbi, pfx, delta,
                                   "vblendvps", 4, Iop_SarN32x4 );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VBLENDVPS ymmG, ymmE/memE, ymmV, ymmIS4
         ::: ymmG:V256 = PBLEND(ymmE, ymmV, ymmIS4) (RMVR) */
      /* VBLENDVPS = VEX.NDS.256.66.0F3A.WIG 4A /r /is4 */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VBLENDV_256 ( vbi, pfx, delta,
                                   "vblendvps", 4, Iop_SarN32x4 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0x4B:
      /* VBLENDVPD xmmG, xmmE/memE, xmmV, xmmIS4
         ::: xmmG:V128 = PBLEND(xmmE, xmmV, xmmIS4) (RMVR) */
      /* VBLENDVPD = VEX.NDS.128.66.0F3A.WIG 4B /r /is4 */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VBLENDV_128 ( vbi, pfx, delta,
                                   "vblendvpd", 8, Iop_SarN64x2 );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VBLENDVPD ymmG, ymmE/memE, ymmV, ymmIS4
         ::: ymmG:V256 = PBLEND(ymmE, ymmV, ymmIS4) (RMVR) */
      /* VBLENDVPD = VEX.NDS.256.66.0F3A.WIG 4B /r /is4 */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VBLENDV_256 ( vbi, pfx, delta,
                                   "vblendvpd", 8, Iop_SarN64x2 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0x4C:
      /* VPBLENDVB xmmG, xmmE/memE, xmmV, xmmIS4
         ::: xmmG:V128 = PBLEND(xmmE, xmmV, xmmIS4) (RMVR) */
      /* VPBLENDVB = VEX.NDS.128.66.0F3A.WIG 4C /r /is4 */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VBLENDV_128 ( vbi, pfx, delta,
                                   "vpblendvb", 1, Iop_SarN8x16 );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPBLENDVB ymmG, ymmE/memE, ymmV, ymmIS4
         ::: ymmG:V256 = PBLEND(ymmE, ymmV, ymmIS4) (RMVR) */
      /* VPBLENDVB = VEX.NDS.256.66.0F3A.WIG 4C /r /is4 */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VBLENDV_256 ( vbi, pfx, delta,
                                   "vpblendvb", 1, Iop_SarN8x16 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   31887 
   case 0x60:
   case 0x61:
   case 0x62:
   case 0x63:
      /* VEX.128.66.0F3A.WIG 63 /r ib = VPCMPISTRI imm8, xmm2/m128, xmm1
         VEX.128.66.0F3A.WIG 62 /r ib = VPCMPISTRM imm8, xmm2/m128, xmm1
         VEX.128.66.0F3A.WIG 61 /r ib = VPCMPESTRI imm8, xmm2/m128, xmm1
         VEX.128.66.0F3A.WIG 60 /r ib = VPCMPESTRM imm8, xmm2/m128, xmm1
         (selected special cases that actually occur in glibc,
          not by any means a complete implementation.)
      */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Long delta0 = delta;
         delta = dis_PCMPxSTRx( vbi, pfx, delta, True/*isAvx*/, opc );
         /* The helper advances delta only on a successful decode. */
         if (delta > delta0) goto decode_success;
         /* else fall though; dis_PCMPxSTRx failed to decode it */
      }
      break;

   case 0x5C ... 0x5F:
   case 0x68 ... 0x6F:
   case 0x78 ... 0x7F:
      /* FIXME: list the instructions decoded here */
      /* FMA4 (4-operand fused multiply-add) group; all handled by
         dis_FMA4. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Long delta0 = delta;
         delta = dis_FMA4( pfx, delta, opc, uses_vvvv, vbi );
         /* The helper advances delta only on a successful decode. */
         if (delta > delta0) {
            dres->hint = Dis_HintVerbose;
            goto decode_success;
         }
         /* else fall though; dis_FMA4 failed to decode it */
      }
      break;

   case 0xDF:
      /* VAESKEYGENASSIST imm8, xmm2/m128, xmm1 = VEX.128.66.0F3A.WIG DF /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         /* This is the VEX-encoded path, so isAvx is True; the old
            trailing comment said "!isAvx", contradicting the argument. */
         delta = dis_AESKEYGENASSIST( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      break;
   31929 
   case 0xF0:
      /* RORX imm8, r/m32, r32a = VEX.LZ.F2.0F3A.W0 F0 /r /i */
      /* RORX imm8, r/m64, r64a = VEX.LZ.F2.0F3A.W1 F0 /r /i */
      /* BMI2 rotate-right by immediate; unlike ROR it never modifies
         the flags.  Note the W bit here comes from the VEX prefix
         (recorded in pfx), while haveREX rejects a literal REX byte. */
      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
         Int     size = getRexW(pfx) ? 8 : 4; /* operand size in bytes */
         IRType  ty   = szToITy(size);
         IRTemp  src  = newTemp(ty);
         UChar   rm   = getUChar(delta);
         UChar   imm8;

         if (epartIsReg(rm)) {
            imm8 = getUChar(delta+1);
            assign( src, getIRegE(size,pfx,rm) );
            DIP("rorx %d,%s,%s\n", imm8, nameIRegE(size,pfx,rm),
                                   nameIRegG(size,pfx,rm));
            delta += 2; /* modrm byte + imm8 byte */
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            imm8 = getUChar(delta+alen);
            assign( src, loadLE(ty, mkexpr(addr)) );
            DIP("rorx %d,%s,%s\n", imm8, dis_buf, nameIRegG(size,pfx,rm));
            delta += alen + 1; /* amode + imm8 byte */
         }
         imm8 &= 8*size-1; /* rotate count is taken mod operand width */

         /* dst = (src >>u imm8) | (src << (size-imm8)) */
         putIRegG( size, pfx, rm,
                   imm8 == 0 ? mkexpr(src) /* avoid shift-by-width UB */
                   : binop( mkSizedOp(ty,Iop_Or8),
                            binop( mkSizedOp(ty,Iop_Shr8), mkexpr(src),
                                   mkU8(imm8) ),
                            binop( mkSizedOp(ty,Iop_Shl8), mkexpr(src),
                                   mkU8(8*size-imm8) ) ) );
         /* Flags aren't modified.  */
         goto decode_success;
      }
      break;
   31967 
   31968    default:
   31969       break;
   31970 
   31971    }
   31972 
   31973   //decode_failure:
   31974    return deltaIN;
   31975 
   31976   decode_success:
   31977    return delta;
   31978 }
   31979 
   31980 
   31981 /*------------------------------------------------------------*/
   31982 /*---                                                      ---*/
   31983 /*--- Disassemble a single instruction                     ---*/
   31984 /*---                                                      ---*/
   31985 /*------------------------------------------------------------*/
   31986 
   31987 /* Disassemble a single instruction into IR.  The instruction is
   31988    located in host memory at &guest_code[delta]. */
   31989 
/* Disassemble one instruction into IR, appending to the global 'irsb'.

   Parameters:
     expect_CAS   (OUT) set True iff the emitted IR is expected to
                  contain an IRCAS (set when a LOCK prefix is
                  accepted); the caller cross-checks this against the
                  IR actually emitted.
     resteerOkFn, resteerCisOk, callback_opaque
                  resteering (branch-chasing) policy callback and its
                  opaque argument; passed through to the per-escape
                  decoders.
     delta64      offset of the insn from guest_RIP_bbstart; the insn
                  bytes live at &guest_code[delta64].
     archinfo     CPU capability info (consulted here for AVX support).
     vbi          ABI info (consulted for %fs/%gs-constancy
                  assumptions).
     sigill_diag  if True, print diagnostics on decode failure.

   Returns a DisResult giving the insn's length and how the block
   continues.  On decode failure, emits IR that raises SIGILL
   (Ijk_NoDecode) instead. */
static
DisResult disInstr_AMD64_WRK (
             /*OUT*/Bool* expect_CAS,
             Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
             Bool         resteerCisOk,
             void*        callback_opaque,
             Long         delta64,
             const VexArchInfo* archinfo,
             const VexAbiInfo*  vbi,
             Bool         sigill_diag
          )
{
   IRTemp    t1, t2;
   UChar     pre;
   Int       n, n_prefixes;
   DisResult dres;

   /* The running delta */
   Long delta = delta64;

   /* Holds eip at the start of the insn, so that we can print
      consistent error messages for unimplemented insns. */
   Long delta_start = delta;

   /* sz denotes the nominal data-op size of the insn; we change it to
      2 if an 0x66 prefix is seen and 8 if REX.W is 1.  In case of
      conflict REX.W takes precedence. */
   Int sz = 4;

   /* pfx holds the summary of prefixes. */
   Prefix pfx = PFX_EMPTY;

   /* Holds the computed opcode-escape indication. */
   Escape esc = ESC_NONE;

   /* Set result defaults. */
   dres.whatNext    = Dis_Continue;
   dres.len         = 0;
   dres.continueAt  = 0;
   dres.jk_StopHere = Ijk_INVALID;
   dres.hint        = Dis_HintNone;
   *expect_CAS = False;

   /* The caller must have reset the next-RIP tracking state before
      calling us (see disInstr_AMD64). */
   vassert(guest_RIP_next_assumed == 0);
   vassert(guest_RIP_next_mustcheck == False);

   t1 = t2 = IRTemp_INVALID;

   DIP("\t0x%llx:  ", guest_RIP_bbstart+delta);

   /* Spot "Special" instructions (see comment at top of file). */
   {
      const UChar* code = guest_code + delta;
      /* Spot the 16-byte preamble:
         48C1C703   rolq $3,  %rdi
         48C1C70D   rolq $13, %rdi
         48C1C73D   rolq $61, %rdi
         48C1C733   rolq $51, %rdi
      */
      /* Note the four rotate amounts sum to 128, so the preamble as a
         whole is a no-op on %rdi. */
      if (code[ 0] == 0x48 && code[ 1] == 0xC1 && code[ 2] == 0xC7
                                               && code[ 3] == 0x03 &&
          code[ 4] == 0x48 && code[ 5] == 0xC1 && code[ 6] == 0xC7
                                               && code[ 7] == 0x0D &&
          code[ 8] == 0x48 && code[ 9] == 0xC1 && code[10] == 0xC7
                                               && code[11] == 0x3D &&
          code[12] == 0x48 && code[13] == 0xC1 && code[14] == 0xC7
                                               && code[15] == 0x33) {
         /* Got a "Special" instruction preamble.  Which one is it? */
         if (code[16] == 0x48 && code[17] == 0x87
                              && code[18] == 0xDB /* xchgq %rbx,%rbx */) {
            /* %RDX = client_request ( %RAX ) */
            DIP("%%rdx = client_request ( %%rax )\n");
            delta += 19;
            jmp_lit(&dres, Ijk_ClientReq, guest_RIP_bbstart+delta);
            vassert(dres.whatNext == Dis_StopHere);
            goto decode_success;
         }
         else
         if (code[16] == 0x48 && code[17] == 0x87
                              && code[18] == 0xC9 /* xchgq %rcx,%rcx */) {
            /* %RAX = guest_NRADDR */
            DIP("%%rax = guest_NRADDR\n");
            delta += 19;
            putIRegRAX(8, IRExpr_Get( OFFB_NRADDR, Ity_I64 ));
            goto decode_success;
         }
         else
         if (code[16] == 0x48 && code[17] == 0x87
                              && code[18] == 0xD2 /* xchgq %rdx,%rdx */) {
            /* call-noredir *%RAX */
            DIP("call-noredir *%%rax\n");
            delta += 19;
            /* Synthesise the call: push the return address, then jump
               to the target with the no-redirection jumpkind. */
            t1 = newTemp(Ity_I64);
            assign(t1, getIRegRAX(8));
            t2 = newTemp(Ity_I64);
            assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
            putIReg64(R_RSP, mkexpr(t2));
            storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta));
            jmp_treg(&dres, Ijk_NoRedir, t1);
            vassert(dres.whatNext == Dis_StopHere);
            goto decode_success;
         }
         else
         if (code[16] == 0x48 && code[17] == 0x87
                              && code[18] == 0xff /* xchgq %rdi,%rdi */) {
           /* IR injection */
            DIP("IR injection\n");
            vex_inject_ir(irsb, Iend_LE);

            // Invalidate the current insn. The reason is that the IRop we're
            // injecting here can change. In which case the translation has to
            // be redone. For ease of handling, we simply invalidate all the
            // time.
            stmt(IRStmt_Put(OFFB_CMSTART, mkU64(guest_RIP_curr_instr)));
            stmt(IRStmt_Put(OFFB_CMLEN,   mkU64(19)));

            delta += 19;

            stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_bbstart + delta) ) );
            dres.whatNext    = Dis_StopHere;
            dres.jk_StopHere = Ijk_InvalICache;
            goto decode_success;
         }
         /* We don't know what it is. */
         goto decode_failure;
         /*NOTREACHED*/
      }
   }

   /* Eat prefixes, summarising the result in pfx and sz, and rejecting
      as many invalid combinations as possible. */
   n_prefixes = 0;
   while (True) {
      /* More than 7 legacy prefixes is treated as undecodable. */
      if (n_prefixes > 7) goto decode_failure;
      pre = getUChar(delta);
      switch (pre) {
         case 0x66: pfx |= PFX_66; break;
         case 0x67: pfx |= PFX_ASO; break;
         case 0xF2: pfx |= PFX_F2; break;
         case 0xF3: pfx |= PFX_F3; break;
         case 0xF0: pfx |= PFX_LOCK; *expect_CAS = True; break;
         case 0x2E: pfx |= PFX_CS; break;
         case 0x3E: pfx |= PFX_DS; break;
         case 0x26: pfx |= PFX_ES; break;
         case 0x64: pfx |= PFX_FS; break;
         case 0x65: pfx |= PFX_GS; break;
         case 0x36: pfx |= PFX_SS; break;
         /* REX byte (0x40..0x4F; GCC case-range extension).  Record
            which of the W/R/X/B bits are set. */
         case 0x40 ... 0x4F:
            pfx |= PFX_REX;
            if (pre & (1<<3)) pfx |= PFX_REXW;
            if (pre & (1<<2)) pfx |= PFX_REXR;
            if (pre & (1<<1)) pfx |= PFX_REXX;
            if (pre & (1<<0)) pfx |= PFX_REXB;
            break;
         default:
            goto not_a_legacy_prefix;
      }
      n_prefixes++;
      delta++;
   }

   not_a_legacy_prefix:
   /* We've used up all the non-VEX prefixes.  Parse and validate a
      VEX prefix if that's appropriate. */
   if (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX) {
      /* Used temporarily for holding VEX prefixes. */
      UChar vex0 = getUChar(delta);
      if (vex0 == 0xC4) {
         /* 3-byte VEX */
         UChar vex1 = getUChar(delta+1);
         UChar vex2 = getUChar(delta+2);
         delta += 3;
         pfx |= PFX_VEX;
         /* Snarf contents of byte 1.  Note R/X/B are stored inverted
            in the VEX encoding, hence the "? 0 :" sense. */
         /* R */ pfx |= (vex1 & (1<<7)) ? 0 : PFX_REXR;
         /* X */ pfx |= (vex1 & (1<<6)) ? 0 : PFX_REXX;
         /* B */ pfx |= (vex1 & (1<<5)) ? 0 : PFX_REXB;
         /* m-mmmm */
         switch (vex1 & 0x1F) {
            case 1: esc = ESC_0F;   break;
            case 2: esc = ESC_0F38; break;
            case 3: esc = ESC_0F3A; break;
            /* Any other m-mmmm field will #UD */
            default: goto decode_failure;
         }
         /* Snarf contents of byte 2.  vvvv is likewise stored
            inverted. */
         /* W */    pfx |= (vex2 & (1<<7)) ? PFX_REXW : 0;
         /* ~v3 */  pfx |= (vex2 & (1<<6)) ? 0 : PFX_VEXnV3;
         /* ~v2 */  pfx |= (vex2 & (1<<5)) ? 0 : PFX_VEXnV2;
         /* ~v1 */  pfx |= (vex2 & (1<<4)) ? 0 : PFX_VEXnV1;
         /* ~v0 */  pfx |= (vex2 & (1<<3)) ? 0 : PFX_VEXnV0;
         /* L */    pfx |= (vex2 & (1<<2)) ? PFX_VEXL : 0;
         /* pp: the implied SIMD prefix */
         switch (vex2 & 3) {
            case 0: break;
            case 1: pfx |= PFX_66; break;
            case 2: pfx |= PFX_F3; break;
            case 3: pfx |= PFX_F2; break;
            default: vassert(0);
         }
      }
      else if (vex0 == 0xC5) {
         /* 2-byte VEX */
         UChar vex1 = getUChar(delta+1);
         delta += 2;
         pfx |= PFX_VEX;
         /* Snarf contents of byte 1 (R and vvvv stored inverted, as
            above). */
         /* R */    pfx |= (vex1 & (1<<7)) ? 0 : PFX_REXR;
         /* ~v3 */  pfx |= (vex1 & (1<<6)) ? 0 : PFX_VEXnV3;
         /* ~v2 */  pfx |= (vex1 & (1<<5)) ? 0 : PFX_VEXnV2;
         /* ~v1 */  pfx |= (vex1 & (1<<4)) ? 0 : PFX_VEXnV1;
         /* ~v0 */  pfx |= (vex1 & (1<<3)) ? 0 : PFX_VEXnV0;
         /* L */    pfx |= (vex1 & (1<<2)) ? PFX_VEXL : 0;
         /* pp */
         switch (vex1 & 3) {
            case 0: break;
            case 1: pfx |= PFX_66; break;
            case 2: pfx |= PFX_F3; break;
            case 3: pfx |= PFX_F2; break;
            default: vassert(0);
         }
         /* implied: */
         esc = ESC_0F;
      }
      /* Can't have both VEX and REX */
      if ((pfx & PFX_VEX) && (pfx & PFX_REX))
         goto decode_failure; /* can't have both */
   }

   /* Dump invalid combinations */
   n = 0;
   if (pfx & PFX_F2) n++;
   if (pfx & PFX_F3) n++;
   if (n > 1)
      goto decode_failure; /* can't have both */

   n = 0;
   if (pfx & PFX_CS) n++;
   if (pfx & PFX_DS) n++;
   if (pfx & PFX_ES) n++;
   if (pfx & PFX_FS) n++;
   if (pfx & PFX_GS) n++;
   if (pfx & PFX_SS) n++;
   if (n > 1)
      goto decode_failure; /* multiple seg overrides == illegal */

   /* We have a %fs prefix.  Reject it if there's no evidence in 'vbi'
      that we should accept it. */
   if ((pfx & PFX_FS) && !vbi->guest_amd64_assume_fs_is_const)
      goto decode_failure;

   /* Ditto for %gs prefixes. */
   if ((pfx & PFX_GS) && !vbi->guest_amd64_assume_gs_is_const)
      goto decode_failure;

   /* Set up sz. */
   sz = 4;
   if (pfx & PFX_66) sz = 2;
   if ((pfx & PFX_REX) && (pfx & PFX_REXW)) sz = 8;

   /* Now we should be looking at the primary opcode byte or the
      leading escapes.  Check that any LOCK prefix is actually
      allowed. */
   if (haveLOCK(pfx)) {
      if (can_be_used_with_LOCK_prefix( &guest_code[delta] )) {
         DIP("lock ");
      } else {
         /* LOCK on a non-lockable insn: retract the CAS expectation
            set when the F0 prefix was seen, then fail. */
         *expect_CAS = False;
         goto decode_failure;
      }
   }

   /* Eat up opcode escape bytes, until we're really looking at the
      primary opcode byte.  But only if there's no VEX present. */
   if (!(pfx & PFX_VEX)) {
      vassert(esc == ESC_NONE);
      pre = getUChar(delta);
      if (pre == 0x0F) {
         delta++;
         pre = getUChar(delta);
         switch (pre) {
            case 0x38: esc = ESC_0F38; delta++; break;
            case 0x3A: esc = ESC_0F3A; delta++; break;
            default:   esc = ESC_0F; break;
         }
      }
   }

   /* So now we're really really looking at the primary opcode
      byte. */
   Long delta_at_primary_opcode = delta;

   if (!(pfx & PFX_VEX)) {
      /* Handle non-VEX prefixed instructions.  "Legacy" (non-VEX) SSE
         instructions preserve the upper 128 bits of YMM registers;
         iow we can simply ignore the presence of the upper halves of
         these registers. */
      switch (esc) {
         case ESC_NONE:
            delta = dis_ESC_NONE( &dres, expect_CAS,
                                  resteerOkFn, resteerCisOk, callback_opaque,
                                  archinfo, vbi, pfx, sz, delta );
            break;
         case ESC_0F:
            delta = dis_ESC_0F  ( &dres, expect_CAS,
                                  resteerOkFn, resteerCisOk, callback_opaque,
                                  archinfo, vbi, pfx, sz, delta );
            break;
         case ESC_0F38:
            delta = dis_ESC_0F38( &dres,
                                  resteerOkFn, resteerCisOk, callback_opaque,
                                  archinfo, vbi, pfx, sz, delta );
            break;
         case ESC_0F3A:
            delta = dis_ESC_0F3A( &dres,
                                  resteerOkFn, resteerCisOk, callback_opaque,
                                  archinfo, vbi, pfx, sz, delta );
            break;
         default:
            vassert(0);
      }
   } else {
      /* VEX prefixed instruction */
      /* Sloppy Intel wording: "An instruction encoded with a VEX.128
         prefix that loads a YMM register operand ..." zeroes out bits
         128 and above of the register. */
      Bool uses_vvvv = False;
      switch (esc) {
         case ESC_0F:
            delta = dis_ESC_0F__VEX ( &dres, &uses_vvvv,
                                      resteerOkFn, resteerCisOk,
                                      callback_opaque,
                                      archinfo, vbi, pfx, sz, delta );
            break;
         case ESC_0F38:
            delta = dis_ESC_0F38__VEX ( &dres, &uses_vvvv,
                                        resteerOkFn, resteerCisOk,
                                        callback_opaque,
                                        archinfo, vbi, pfx, sz, delta );
            break;
         case ESC_0F3A:
            delta = dis_ESC_0F3A__VEX ( &dres, &uses_vvvv,
                                        resteerOkFn, resteerCisOk,
                                        callback_opaque,
                                        archinfo, vbi, pfx, sz, delta );
            break;
         case ESC_NONE:
            /* The presence of a VEX prefix, by Intel definition,
               always implies at least an 0F escape. */
            goto decode_failure;
         default:
            vassert(0);
      }
      /* If the insn doesn't use VEX.vvvv then it must be all ones.
         Check this. */
      if (!uses_vvvv) {
         if (getVexNvvvv(pfx) != 0)
            goto decode_failure;
      }
   }

   vassert(delta - delta_at_primary_opcode >= 0);
   vassert(delta - delta_at_primary_opcode < 16/*let's say*/);

   /* Use delta == delta_at_primary_opcode to denote decode failure.
      This implies that any successful decode must use at least one
      byte up. */
   if (delta == delta_at_primary_opcode)
      goto decode_failure;
   else
      goto decode_success; /* \o/ */


  decode_failure:
   /* All decode failures end up here. */
   if (sigill_diag) {
      vex_printf("vex amd64->IR: unhandled instruction bytes: "
                 "0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
                 getUChar(delta_start+0),
                 getUChar(delta_start+1),
                 getUChar(delta_start+2),
                 getUChar(delta_start+3),
                 getUChar(delta_start+4),
                 getUChar(delta_start+5),
                 getUChar(delta_start+6),
                 getUChar(delta_start+7),
                 getUChar(delta_start+8),
                 getUChar(delta_start+9) );
      vex_printf("vex amd64->IR:   REX=%d REX.W=%d REX.R=%d REX.X=%d REX.B=%d\n",
                 haveREX(pfx) ? 1 : 0, getRexW(pfx), getRexR(pfx),
                 getRexX(pfx), getRexB(pfx));
      vex_printf("vex amd64->IR:   VEX=%d VEX.L=%d VEX.nVVVV=0x%x ESC=%s\n",
                 haveVEX(pfx) ? 1 : 0, getVexL(pfx),
                 getVexNvvvv(pfx),
                 esc==ESC_NONE ? "NONE" :
                   esc==ESC_0F ? "0F" :
                   esc==ESC_0F38 ? "0F38" :
                   esc==ESC_0F3A ? "0F3A" : "???");
      vex_printf("vex amd64->IR:   PFX.66=%d PFX.F2=%d PFX.F3=%d\n",
                 have66(pfx) ? 1 : 0, haveF2(pfx) ? 1 : 0,
                 haveF3(pfx) ? 1 : 0);
   }

   /* Tell the dispatcher that this insn cannot be decoded, and so has
      not been executed, and (is currently) the next to be executed.
      RIP should be up-to-date since it made so at the start of each
      insn, but nevertheless be paranoid and update it again right
      now. */
   stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_curr_instr) ) );
   jmp_lit(&dres, Ijk_NoDecode, guest_RIP_curr_instr);
   vassert(dres.whatNext == Dis_StopHere);
   dres.len = 0;
   /* We also need to say that a CAS is not expected now, regardless
      of what it might have been set to at the start of the function,
      since the IR that we've emitted just above (to synthesis a
      SIGILL) does not involve any CAS, and presumably no other IR has
      been emitted for this (non-decoded) insn. */
   *expect_CAS = False;
   return dres;


  decode_success:
   /* All decode successes end up here. */
   switch (dres.whatNext) {
      case Dis_Continue:
         stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_bbstart + delta) ) );
         break;
      case Dis_ResteerU:
      case Dis_ResteerC:
         stmt( IRStmt_Put( OFFB_RIP, mkU64(dres.continueAt) ) );
         break;
      case Dis_StopHere:
         break;
      default:
         vassert(0);
   }

   DIP("\n");
   dres.len = toUInt(delta - delta_start);
   return dres;
}
   32431 
/* These macros are only for use within the disassembly routines
   above; retract them so they cannot leak further down the file. */
#undef DIP
#undef DIS
   32434 
   32435 
   32436 /*------------------------------------------------------------*/
   32437 /*--- Top-level fn                                         ---*/
   32438 /*------------------------------------------------------------*/
   32439 
   32440 /* Disassemble a single instruction into IR.  The instruction
   32441    is located in host memory at &guest_code[delta]. */
   32442 
   32443 DisResult disInstr_AMD64 ( IRSB*        irsb_IN,
   32444                            Bool         (*resteerOkFn) ( void*, Addr ),
   32445                            Bool         resteerCisOk,
   32446                            void*        callback_opaque,
   32447                            const UChar* guest_code_IN,
   32448                            Long         delta,
   32449                            Addr         guest_IP,
   32450                            VexArch      guest_arch,
   32451                            const VexArchInfo* archinfo,
   32452                            const VexAbiInfo*  abiinfo,
   32453                            VexEndness   host_endness_IN,
   32454                            Bool         sigill_diag_IN )
   32455 {
   32456    Int       i, x1, x2;
   32457    Bool      expect_CAS, has_CAS;
   32458    DisResult dres;
   32459 
   32460    /* Set globals (see top of this file) */
   32461    vassert(guest_arch == VexArchAMD64);
   32462    guest_code           = guest_code_IN;
   32463    irsb                 = irsb_IN;
   32464    host_endness         = host_endness_IN;
   32465    guest_RIP_curr_instr = guest_IP;
   32466    guest_RIP_bbstart    = guest_IP - delta;
   32467 
   32468    /* We'll consult these after doing disInstr_AMD64_WRK. */
   32469    guest_RIP_next_assumed   = 0;
   32470    guest_RIP_next_mustcheck = False;
   32471 
   32472    x1 = irsb_IN->stmts_used;
   32473    expect_CAS = False;
   32474    dres = disInstr_AMD64_WRK ( &expect_CAS, resteerOkFn,
   32475                                resteerCisOk,
   32476                                callback_opaque,
   32477                                delta, archinfo, abiinfo, sigill_diag_IN );
   32478    x2 = irsb_IN->stmts_used;
   32479    vassert(x2 >= x1);
   32480 
   32481    /* If disInstr_AMD64_WRK tried to figure out the next rip, check it
   32482       got it right.  Failure of this assertion is serious and denotes
   32483       a bug in disInstr. */
   32484    if (guest_RIP_next_mustcheck
   32485        && guest_RIP_next_assumed != guest_RIP_curr_instr + dres.len) {
   32486       vex_printf("\n");
   32487       vex_printf("assumed next %%rip = 0x%llx\n",
   32488                  guest_RIP_next_assumed );
   32489       vex_printf(" actual next %%rip = 0x%llx\n",
   32490                  guest_RIP_curr_instr + dres.len );
   32491       vpanic("disInstr_AMD64: disInstr miscalculated next %rip");
   32492    }
   32493 
   32494    /* See comment at the top of disInstr_AMD64_WRK for meaning of
   32495       expect_CAS.  Here, we (sanity-)check for the presence/absence of
   32496       IRCAS as directed by the returned expect_CAS value. */
   32497    has_CAS = False;
   32498    for (i = x1; i < x2; i++) {
   32499       if (irsb_IN->stmts[i]->tag == Ist_CAS)
   32500          has_CAS = True;
   32501    }
   32502 
   32503    if (expect_CAS != has_CAS) {
   32504       /* inconsistency detected.  re-disassemble the instruction so as
   32505          to generate a useful error message; then assert. */
   32506       vex_traceflags |= VEX_TRACE_FE;
   32507       dres = disInstr_AMD64_WRK ( &expect_CAS, resteerOkFn,
   32508                                   resteerCisOk,
   32509                                   callback_opaque,
   32510                                   delta, archinfo, abiinfo, sigill_diag_IN );
   32511       for (i = x1; i < x2; i++) {
   32512          vex_printf("\t\t");
   32513          ppIRStmt(irsb_IN->stmts[i]);
   32514          vex_printf("\n");
   32515       }
   32516       /* Failure of this assertion is serious and denotes a bug in
   32517          disInstr. */
   32518       vpanic("disInstr_AMD64: inconsistency in LOCK prefix handling");
   32519    }
   32520 
   32521    return dres;
   32522 }
   32523 
   32524 
   32525 /*------------------------------------------------------------*/
   32526 /*--- Unused stuff                                         ---*/
   32527 /*------------------------------------------------------------*/
   32528 
   32529 // A potentially more Memcheck-friendly version of gen_LZCNT, if
   32530 // this should ever be needed.
   32531 //
   32532 //static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
   32533 //{
   32534 //   /* Scheme is simple: propagate the most significant 1-bit into all
   32535 //      lower positions in the word.  This gives a word of the form
   32536 //      0---01---1.  Now invert it, giving a word of the form
   32537 //      1---10---0, then do a population-count idiom (to count the 1s,
   32538 //      which is the number of leading zeroes, or the word size if the
   32539 //      original word was 0.
   32540 //   */
   32541 //   Int i;
   32542 //   IRTemp t[7];
   32543 //   for (i = 0; i < 7; i++) {
   32544 //      t[i] = newTemp(ty);
   32545 //   }
   32546 //   if (ty == Ity_I64) {
   32547 //      assign(t[0], binop(Iop_Or64, mkexpr(src),
   32548 //                                   binop(Iop_Shr64, mkexpr(src),  mkU8(1))));
   32549 //      assign(t[1], binop(Iop_Or64, mkexpr(t[0]),
   32550 //                                   binop(Iop_Shr64, mkexpr(t[0]), mkU8(2))));
   32551 //      assign(t[2], binop(Iop_Or64, mkexpr(t[1]),
   32552 //                                   binop(Iop_Shr64, mkexpr(t[1]), mkU8(4))));
   32553 //      assign(t[3], binop(Iop_Or64, mkexpr(t[2]),
   32554 //                                   binop(Iop_Shr64, mkexpr(t[2]), mkU8(8))));
   32555 //      assign(t[4], binop(Iop_Or64, mkexpr(t[3]),
   32556 //                                   binop(Iop_Shr64, mkexpr(t[3]), mkU8(16))));
   32557 //      assign(t[5], binop(Iop_Or64, mkexpr(t[4]),
   32558 //                                   binop(Iop_Shr64, mkexpr(t[4]), mkU8(32))));
   32559 //      assign(t[6], unop(Iop_Not64, mkexpr(t[5])));
   32560 //      return gen_POPCOUNT(ty, t[6]);
   32561 //   }
   32562 //   if (ty == Ity_I32) {
   32563 //      assign(t[0], binop(Iop_Or32, mkexpr(src),
   32564 //                                   binop(Iop_Shr32, mkexpr(src),  mkU8(1))));
   32565 //      assign(t[1], binop(Iop_Or32, mkexpr(t[0]),
   32566 //                                   binop(Iop_Shr32, mkexpr(t[0]), mkU8(2))));
   32567 //      assign(t[2], binop(Iop_Or32, mkexpr(t[1]),
   32568 //                                   binop(Iop_Shr32, mkexpr(t[1]), mkU8(4))));
   32569 //      assign(t[3], binop(Iop_Or32, mkexpr(t[2]),
   32570 //                                   binop(Iop_Shr32, mkexpr(t[2]), mkU8(8))));
   32571 //      assign(t[4], binop(Iop_Or32, mkexpr(t[3]),
   32572 //                                   binop(Iop_Shr32, mkexpr(t[3]), mkU8(16))));
   32573 //      assign(t[5], unop(Iop_Not32, mkexpr(t[4])));
   32574 //      return gen_POPCOUNT(ty, t[5]);
   32575 //   }
   32576 //   if (ty == Ity_I16) {
   32577 //      assign(t[0], binop(Iop_Or16, mkexpr(src),
   32578 //                                   binop(Iop_Shr16, mkexpr(src),  mkU8(1))));
   32579 //      assign(t[1], binop(Iop_Or16, mkexpr(t[0]),
   32580 //                                   binop(Iop_Shr16, mkexpr(t[0]), mkU8(2))));
   32581 //      assign(t[2], binop(Iop_Or16, mkexpr(t[1]),
   32582 //                                   binop(Iop_Shr16, mkexpr(t[1]), mkU8(4))));
   32583 //      assign(t[3], binop(Iop_Or16, mkexpr(t[2]),
   32584 //                                   binop(Iop_Shr16, mkexpr(t[2]), mkU8(8))));
   32585 //      assign(t[4], unop(Iop_Not16, mkexpr(t[3])));
   32586 //      return gen_POPCOUNT(ty, t[4]);
   32587 //   }
   32588 //   vassert(0);
   32589 //}
   32590 
   32591 
   32592 /*--------------------------------------------------------------------*/
   32593 /*--- end                                       guest_amd64_toIR.c ---*/
   32594 /*--------------------------------------------------------------------*/
   32595