Home | History | Annotate | Download | only in priv
      1 
      2 /*--------------------------------------------------------------------*/
      3 /*--- begin                                     guest_amd64_toIR.c ---*/
      4 /*--------------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Valgrind, a dynamic binary instrumentation
      8    framework.
      9 
     10    Copyright (C) 2004-2015 OpenWorks LLP
     11       info (at) open-works.net
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     26    02110-1301, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 
     30    Neither the names of the U.S. Department of Energy nor the
     31    University of California nor the names of its contributors may be
     32    used to endorse or promote products derived from this software
     33    without prior written permission.
     34 */
     35 
     36 /* Translates AMD64 code to IR. */
     37 
     38 /* TODO:
     39 
     40    All Puts to CC_OP/CC_DEP1/CC_DEP2/CC_NDEP should really be checked
     41    to ensure a 64-bit value is being written.
     42 
     43    x87 FP Limitations:
     44 
     45    * all arithmetic done at 64 bits
     46 
     47    * no FP exceptions, except for handling stack over/underflow
     48 
     49    * FP rounding mode observed only for float->int conversions and
     50      int->float conversions which could lose accuracy, and for
     51      float-to-float rounding.  For all other operations,
     52      round-to-nearest is used, regardless.
     53 
     54    * some of the FCOM cases could do with testing -- not convinced
     55      that the args are the right way round.
     56 
     57    * FSAVE does not re-initialise the FPU; it should do
     58 
     59    * FINIT not only initialises the FPU environment, it also zeroes
     60      all the FP registers.  It should leave the registers unchanged.
     61 
     62     SAHF should cause eflags[1] == 1, and in fact it produces 0.  As
     63     per Intel docs this bit has no meaning anyway.  Since PUSHF is the
     64     only way to observe eflags[1], a proper fix would be to make that
     65     bit be set by PUSHF.
     66 
     67     This module uses global variables and so is not MT-safe (if that
     68     should ever become relevant).
     69 */
     70 
     71 /* Notes re address size overrides (0x67).
     72 
     73    According to the AMD documentation (24594 Rev 3.09, Sept 2003,
     74    "AMD64 Architecture Programmer's Manual Volume 3: General-Purpose
     75    and System Instructions"), Section 1.2.3 ("Address-Size Override
     76    Prefix"):
     77 
     78    0x67 applies to all explicit memory references, causing the top
     79    32 bits of the effective address to become zero.
     80 
     81    0x67 has no effect on stack references (push/pop); these always
     82    use a 64-bit address.
     83 
     84    0x67 changes the interpretation of instructions which implicitly
     85    reference RCX/RSI/RDI, so that in fact ECX/ESI/EDI are used
     86    instead.  These are:
     87 
     88       cmp{s,sb,sw,sd,sq}
     89       in{s,sb,sw,sd}
     90       jcxz, jecxz, jrcxz
     91       lod{s,sb,sw,sd,sq}
     92       loop{,e,bz,be,z}
     93       mov{s,sb,sw,sd,sq}
     94       out{s,sb,sw,sd}
     95       rep{,e,ne,nz}
     96       sca{s,sb,sw,sd,sq}
     97       sto{s,sb,sw,sd,sq}
     98       xlat{,b} */
     99 
    100 /* "Special" instructions.
    101 
    102    This instruction decoder can decode three special instructions
    103    which mean nothing natively (are no-ops as far as regs/mem are
    104    concerned) but have meaning for supporting Valgrind.  A special
    105    instruction is flagged by the 16-byte preamble 48C1C703 48C1C70D
    106    48C1C73D 48C1C733 (in the standard interpretation, that means: rolq
    107    $3, %rdi; rolq $13, %rdi; rolq $61, %rdi; rolq $51, %rdi).
   Following that, one of the following 4 are allowed (standard
    109    interpretation in parentheses):
    110 
    111       4887DB (xchgq %rbx,%rbx)   %RDX = client_request ( %RAX )
    112       4887C9 (xchgq %rcx,%rcx)   %RAX = guest_NRADDR
    113       4887D2 (xchgq %rdx,%rdx)   call-noredir *%RAX
    114       4887F6 (xchgq %rdi,%rdi)   IR injection
    115 
    116    Any other bytes following the 16-byte preamble are illegal and
    117    constitute a failure in instruction decoding.  This all assumes
    118    that the preamble will never occur except in specific code
    119    fragments designed for Valgrind to catch.
    120 
    121    No prefixes may precede a "Special" instruction.
    122 */
    123 
    124 /* casLE (implementation of lock-prefixed insns) and rep-prefixed
    125    insns: the side-exit back to the start of the insn is done with
    126    Ijk_Boring.  This is quite wrong, it should be done with
    127    Ijk_NoRedir, since otherwise the side exit, which is intended to
    128    restart the instruction for whatever reason, could go somewhere
    129    entirely else.  Doing it right (with Ijk_NoRedir jumps) would make
    130    no-redir jumps performance critical, at least for rep-prefixed
    131    instructions, since all iterations thereof would involve such a
    132    jump.  It's not such a big deal with casLE since the side exit is
    133    only taken if the CAS fails, that is, the location is contended,
    134    which is relatively unlikely.
    135 
    136    Note also, the test for CAS success vs failure is done using
    137    Iop_CasCmp{EQ,NE}{8,16,32,64} rather than the ordinary
    138    Iop_Cmp{EQ,NE} equivalents.  This is so as to tell Memcheck that it
    139    shouldn't definedness-check these comparisons.  See
    140    COMMENT_ON_CasCmpEQ in memcheck/mc_translate.c for
    141    background/rationale.
    142 */
    143 
    144 /* LOCK prefixed instructions.  These are translated using IR-level
    145    CAS statements (IRCAS) and are believed to preserve atomicity, even
    146    from the point of view of some other process racing against a
    147    simulated one (presumably they communicate via a shared memory
    148    segment).
    149 
    150    Handlers which are aware of LOCK prefixes are:
    151       dis_op2_G_E      (add, or, adc, sbb, and, sub, xor)
    152       dis_cmpxchg_G_E  (cmpxchg)
    153       dis_Grp1         (add, or, adc, sbb, and, sub, xor)
    154       dis_Grp3         (not, neg)
    155       dis_Grp4         (inc, dec)
    156       dis_Grp5         (inc, dec)
    157       dis_Grp8_Imm     (bts, btc, btr)
    158       dis_bt_G_E       (bts, btc, btr)
    159       dis_xadd_G_E     (xadd)
    160 */
    161 
    162 
    163 #include "libvex_basictypes.h"
    164 #include "libvex_ir.h"
    165 #include "libvex.h"
    166 #include "libvex_guest_amd64.h"
    167 
    168 #include "main_util.h"
    169 #include "main_globals.h"
    170 #include "guest_generic_bb_to_IR.h"
    171 #include "guest_generic_x87.h"
    172 #include "guest_amd64_defs.h"
    173 
    174 
    175 /*------------------------------------------------------------*/
    176 /*--- Globals                                              ---*/
    177 /*------------------------------------------------------------*/
    178 
    179 /* These are set at the start of the translation of an insn, right
    180    down in disInstr_AMD64, so that we don't have to pass them around
    181    endlessly.  They are all constant during the translation of any
    182    given insn. */
    183 
    184 /* These are set at the start of the translation of a BB, so
    185    that we don't have to pass them around endlessly. */
    186 
    187 /* We need to know this to do sub-register accesses correctly. */
    188 static VexEndness host_endness;
    189 
    190 /* Pointer to the guest code area (points to start of BB, not to the
    191    insn being processed). */
    192 static const UChar* guest_code;
    193 
    194 /* The guest address corresponding to guest_code[0]. */
    195 static Addr64 guest_RIP_bbstart;
    196 
    197 /* The guest address for the instruction currently being
    198    translated. */
    199 static Addr64 guest_RIP_curr_instr;
    200 
    201 /* The IRSB* into which we're generating code. */
    202 static IRSB* irsb;
    203 
    204 /* For ensuring that %rip-relative addressing is done right.  A read
    205    of %rip generates the address of the next instruction.  It may be
    206    that we don't conveniently know that inside disAMode().  For sanity
    207    checking, if the next insn %rip is needed, we make a guess at what
    208    it is, record that guess here, and set the accompanying Bool to
    209    indicate that -- after this insn's decode is finished -- that guess
    210    needs to be checked.  */
    211 
    212 /* At the start of each insn decode, is set to (0, False).
    213    After the decode, if _mustcheck is now True, _assumed is
    214    checked. */
    215 
    216 static Addr64 guest_RIP_next_assumed;
    217 static Bool   guest_RIP_next_mustcheck;
    218 
    219 
    220 /*------------------------------------------------------------*/
    221 /*--- Helpers for constructing IR.                         ---*/
    222 /*------------------------------------------------------------*/
    223 
/* Generate a new temporary of the given type, allocated in the
   type environment of the IRSB currently under construction
   (the file-scope 'irsb'). */
static IRTemp newTemp ( IRType ty )
{
   vassert(isPlausibleIRType(ty));
   return newIRTemp( irsb->tyenv, ty );
}

/* Add a statement to the list held by "irsb". */
static void stmt ( IRStmt* st )
{
   addStmtToIRSB( irsb, st );
}

/* Generate a statement "dst := e" and append it to "irsb". */
static void assign ( IRTemp dst, IRExpr* e )
{
   stmt( IRStmt_WrTmp(dst, e) );
}

/* Convenience constructor: unary IR expression. */
static IRExpr* unop ( IROp op, IRExpr* a )
{
   return IRExpr_Unop(op, a);
}

/* Convenience constructor: binary IR expression. */
static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
{
   return IRExpr_Binop(op, a1, a2);
}

/* Convenience constructor: ternary IR expression. */
static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
{
   return IRExpr_Triop(op, a1, a2, a3);
}

/* Build an expression that reads temporary 'tmp'. */
static IRExpr* mkexpr ( IRTemp tmp )
{
   return IRExpr_RdTmp(tmp);
}
    262 
/* Build an 8-bit constant expression; 'i' must fit in 8 bits. */
static IRExpr* mkU8 ( ULong i )
{
   vassert(i < 256);
   return IRExpr_Const(IRConst_U8( (UChar)i ));
}

/* Build a 16-bit constant expression; 'i' must fit in 16 bits. */
static IRExpr* mkU16 ( ULong i )
{
   vassert(i < 0x10000ULL);
   return IRExpr_Const(IRConst_U16( (UShort)i ));
}

/* Build a 32-bit constant expression; 'i' must fit in 32 bits. */
static IRExpr* mkU32 ( ULong i )
{
   vassert(i < 0x100000000ULL);
   return IRExpr_Const(IRConst_U32( (UInt)i ));
}

/* Build a 64-bit constant expression; any ULong is representable. */
static IRExpr* mkU64 ( ULong i )
{
   return IRExpr_Const(IRConst_U64(i));
}

/* Build a constant of the given integer type (I8/I16/I32/I64 only);
   panics on any other type. */
static IRExpr* mkU ( IRType ty, ULong i )
{
   switch (ty) {
      case Ity_I8:  return mkU8(i);
      case Ity_I16: return mkU16(i);
      case Ity_I32: return mkU32(i);
      case Ity_I64: return mkU64(i);
      default: vpanic("mkU(amd64)");
   }
}
    296 
/* Emit a little-endian store of 'data' to address 'addr'. */
static void storeLE ( IRExpr* addr, IRExpr* data )
{
   stmt( IRStmt_Store(Iend_LE, addr, data) );
}

/* Build a little-endian load of type 'ty' from address 'addr'. */
static IRExpr* loadLE ( IRType ty, IRExpr* addr )
{
   return IRExpr_Load(Iend_LE, ty, addr);
}
    306 
    307 static IROp mkSizedOp ( IRType ty, IROp op8 )
    308 {
    309    vassert(op8 == Iop_Add8 || op8 == Iop_Sub8
    310            || op8 == Iop_Mul8
    311            || op8 == Iop_Or8 || op8 == Iop_And8 || op8 == Iop_Xor8
    312            || op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8
    313            || op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8
    314            || op8 == Iop_CasCmpNE8
    315            || op8 == Iop_Not8 );
    316    switch (ty) {
    317       case Ity_I8:  return 0 +op8;
    318       case Ity_I16: return 1 +op8;
    319       case Ity_I32: return 2 +op8;
    320       case Ity_I64: return 3 +op8;
    321       default: vpanic("mkSizedOp(amd64)");
    322    }
    323 }
    324 
    325 static
    326 IRExpr* doScalarWidening ( Int szSmall, Int szBig, Bool signd, IRExpr* src )
    327 {
    328    if (szSmall == 1 && szBig == 4) {
    329       return unop(signd ? Iop_8Sto32 : Iop_8Uto32, src);
    330    }
    331    if (szSmall == 1 && szBig == 2) {
    332       return unop(signd ? Iop_8Sto16 : Iop_8Uto16, src);
    333    }
    334    if (szSmall == 2 && szBig == 4) {
    335       return unop(signd ? Iop_16Sto32 : Iop_16Uto32, src);
    336    }
    337    if (szSmall == 1 && szBig == 8 && !signd) {
    338       return unop(Iop_8Uto64, src);
    339    }
    340    if (szSmall == 1 && szBig == 8 && signd) {
    341       return unop(Iop_8Sto64, src);
    342    }
    343    if (szSmall == 2 && szBig == 8 && !signd) {
    344       return unop(Iop_16Uto64, src);
    345    }
    346    if (szSmall == 2 && szBig == 8 && signd) {
    347       return unop(Iop_16Sto64, src);
    348    }
    349    vpanic("doScalarWidening(amd64)");
    350 }
    351 
/* Conditionally write 'value' to guest-state offset 'gstOffB': if
   'guard' is true at run time the new value is stored, otherwise the
   slot's current value is written back unchanged (expressed as an
   ITE over a Get of the same slot, so the Put is unconditional at
   the IR level). */
static
void putGuarded ( Int gstOffB, IRExpr* guard, IRExpr* value )
{
   IRType ty = typeOfIRExpr(irsb->tyenv, value);
   stmt( IRStmt_Put(gstOffB,
                    IRExpr_ITE(guard, value, IRExpr_Get(gstOffB, ty))) );
}
    359 
    360 
    361 /*------------------------------------------------------------*/
    362 /*--- Debugging output                                     ---*/
    363 /*------------------------------------------------------------*/
    364 
/* Bomb out if we can't handle something: print a fixed diagnostic
   and then panic with the caller-supplied string.  Never returns. */
__attribute__ ((noreturn))
static void unimplemented ( const HChar* str )
{
   vex_printf("amd64toIR: unimplemented feature\n");
   vpanic(str);
}
    372 
    373 #define DIP(format, args...)           \
    374    if (vex_traceflags & VEX_TRACE_FE)  \
    375       vex_printf(format, ## args)
    376 
    377 #define DIS(buf, format, args...)      \
    378    if (vex_traceflags & VEX_TRACE_FE)  \
    379       vex_sprintf(buf, format, ## args)
    380 
    381 
    382 /*------------------------------------------------------------*/
    383 /*--- Offsets of various parts of the amd64 guest state.   ---*/
    384 /*------------------------------------------------------------*/
    385 
    386 #define OFFB_RAX       offsetof(VexGuestAMD64State,guest_RAX)
    387 #define OFFB_RBX       offsetof(VexGuestAMD64State,guest_RBX)
    388 #define OFFB_RCX       offsetof(VexGuestAMD64State,guest_RCX)
    389 #define OFFB_RDX       offsetof(VexGuestAMD64State,guest_RDX)
    390 #define OFFB_RSP       offsetof(VexGuestAMD64State,guest_RSP)
    391 #define OFFB_RBP       offsetof(VexGuestAMD64State,guest_RBP)
    392 #define OFFB_RSI       offsetof(VexGuestAMD64State,guest_RSI)
    393 #define OFFB_RDI       offsetof(VexGuestAMD64State,guest_RDI)
    394 #define OFFB_R8        offsetof(VexGuestAMD64State,guest_R8)
    395 #define OFFB_R9        offsetof(VexGuestAMD64State,guest_R9)
    396 #define OFFB_R10       offsetof(VexGuestAMD64State,guest_R10)
    397 #define OFFB_R11       offsetof(VexGuestAMD64State,guest_R11)
    398 #define OFFB_R12       offsetof(VexGuestAMD64State,guest_R12)
    399 #define OFFB_R13       offsetof(VexGuestAMD64State,guest_R13)
    400 #define OFFB_R14       offsetof(VexGuestAMD64State,guest_R14)
    401 #define OFFB_R15       offsetof(VexGuestAMD64State,guest_R15)
    402 
    403 #define OFFB_RIP       offsetof(VexGuestAMD64State,guest_RIP)
    404 
    405 #define OFFB_FS_CONST  offsetof(VexGuestAMD64State,guest_FS_CONST)
    406 #define OFFB_GS_CONST  offsetof(VexGuestAMD64State,guest_GS_CONST)
    407 
    408 #define OFFB_CC_OP     offsetof(VexGuestAMD64State,guest_CC_OP)
    409 #define OFFB_CC_DEP1   offsetof(VexGuestAMD64State,guest_CC_DEP1)
    410 #define OFFB_CC_DEP2   offsetof(VexGuestAMD64State,guest_CC_DEP2)
    411 #define OFFB_CC_NDEP   offsetof(VexGuestAMD64State,guest_CC_NDEP)
    412 
    413 #define OFFB_FPREGS    offsetof(VexGuestAMD64State,guest_FPREG[0])
    414 #define OFFB_FPTAGS    offsetof(VexGuestAMD64State,guest_FPTAG[0])
    415 #define OFFB_DFLAG     offsetof(VexGuestAMD64State,guest_DFLAG)
    416 #define OFFB_ACFLAG    offsetof(VexGuestAMD64State,guest_ACFLAG)
    417 #define OFFB_IDFLAG    offsetof(VexGuestAMD64State,guest_IDFLAG)
    418 #define OFFB_FTOP      offsetof(VexGuestAMD64State,guest_FTOP)
    419 #define OFFB_FC3210    offsetof(VexGuestAMD64State,guest_FC3210)
    420 #define OFFB_FPROUND   offsetof(VexGuestAMD64State,guest_FPROUND)
    421 
    422 #define OFFB_SSEROUND  offsetof(VexGuestAMD64State,guest_SSEROUND)
    423 #define OFFB_YMM0      offsetof(VexGuestAMD64State,guest_YMM0)
    424 #define OFFB_YMM1      offsetof(VexGuestAMD64State,guest_YMM1)
    425 #define OFFB_YMM2      offsetof(VexGuestAMD64State,guest_YMM2)
    426 #define OFFB_YMM3      offsetof(VexGuestAMD64State,guest_YMM3)
    427 #define OFFB_YMM4      offsetof(VexGuestAMD64State,guest_YMM4)
    428 #define OFFB_YMM5      offsetof(VexGuestAMD64State,guest_YMM5)
    429 #define OFFB_YMM6      offsetof(VexGuestAMD64State,guest_YMM6)
    430 #define OFFB_YMM7      offsetof(VexGuestAMD64State,guest_YMM7)
    431 #define OFFB_YMM8      offsetof(VexGuestAMD64State,guest_YMM8)
    432 #define OFFB_YMM9      offsetof(VexGuestAMD64State,guest_YMM9)
    433 #define OFFB_YMM10     offsetof(VexGuestAMD64State,guest_YMM10)
    434 #define OFFB_YMM11     offsetof(VexGuestAMD64State,guest_YMM11)
    435 #define OFFB_YMM12     offsetof(VexGuestAMD64State,guest_YMM12)
    436 #define OFFB_YMM13     offsetof(VexGuestAMD64State,guest_YMM13)
    437 #define OFFB_YMM14     offsetof(VexGuestAMD64State,guest_YMM14)
    438 #define OFFB_YMM15     offsetof(VexGuestAMD64State,guest_YMM15)
    439 #define OFFB_YMM16     offsetof(VexGuestAMD64State,guest_YMM16)
    440 
    441 #define OFFB_EMNOTE    offsetof(VexGuestAMD64State,guest_EMNOTE)
    442 #define OFFB_CMSTART   offsetof(VexGuestAMD64State,guest_CMSTART)
    443 #define OFFB_CMLEN     offsetof(VexGuestAMD64State,guest_CMLEN)
    444 
    445 #define OFFB_NRADDR    offsetof(VexGuestAMD64State,guest_NRADDR)
    446 
    447 
    448 /*------------------------------------------------------------*/
    449 /*--- Helper bits and pieces for deconstructing the        ---*/
    450 /*--- amd64 insn stream.                                   ---*/
    451 /*------------------------------------------------------------*/
    452 
    453 /* This is the AMD64 register encoding -- integer regs. */
    454 #define R_RAX 0
    455 #define R_RCX 1
    456 #define R_RDX 2
    457 #define R_RBX 3
    458 #define R_RSP 4
    459 #define R_RBP 5
    460 #define R_RSI 6
    461 #define R_RDI 7
    462 #define R_R8  8
    463 #define R_R9  9
    464 #define R_R10 10
    465 #define R_R11 11
    466 #define R_R12 12
    467 #define R_R13 13
    468 #define R_R14 14
    469 #define R_R15 15
    470 
    471 /* This is the Intel register encoding -- segment regs. */
    472 #define R_ES 0
    473 #define R_CS 1
    474 #define R_SS 2
    475 #define R_DS 3
    476 #define R_FS 4
    477 #define R_GS 5
    478 
    479 
    480 /* Various simple conversions */
    481 
    482 static ULong extend_s_8to64 ( UChar x )
    483 {
    484    return (ULong)((Long)(((ULong)x) << 56) >> 56);
    485 }
    486 
    487 static ULong extend_s_16to64 ( UShort x )
    488 {
    489    return (ULong)((Long)(((ULong)x) << 48) >> 48);
    490 }
    491 
    492 static ULong extend_s_32to64 ( UInt x )
    493 {
    494    return (ULong)((Long)(((ULong)x) << 32) >> 32);
    495 }
    496 
    497 /* Figure out whether the mod and rm parts of a modRM byte refer to a
    498    register or memory.  If so, the byte will have the form 11XXXYYY,
    499    where YYY is the register number. */
    500 inline
    501 static Bool epartIsReg ( UChar mod_reg_rm )
    502 {
    503    return toBool(0xC0 == (mod_reg_rm & 0xC0));
    504 }
    505 
    506 /* Extract the 'g' field from a modRM byte.  This only produces 3
    507    bits, which is not a complete register number.  You should avoid
    508    this function if at all possible. */
    509 inline
    510 static Int gregLO3ofRM ( UChar mod_reg_rm )
    511 {
    512    return (Int)( (mod_reg_rm >> 3) & 7 );
    513 }
    514 
    515 /* Ditto the 'e' field of a modRM byte. */
    516 inline
    517 static Int eregLO3ofRM ( UChar mod_reg_rm )
    518 {
    519    return (Int)(mod_reg_rm & 0x7);
    520 }
    521 
    522 /* Get a 8/16/32-bit unsigned value out of the insn stream. */
    523 
    524 static inline UChar getUChar ( Long delta )
    525 {
    526    UChar v = guest_code[delta+0];
    527    return v;
    528 }
    529 
    530 static UInt getUDisp16 ( Long delta )
    531 {
    532    UInt v = guest_code[delta+1]; v <<= 8;
    533    v |= guest_code[delta+0];
    534    return v & 0xFFFF;
    535 }
    536 
    537 //.. static UInt getUDisp ( Int size, Long delta )
    538 //.. {
    539 //..    switch (size) {
    540 //..       case 4: return getUDisp32(delta);
    541 //..       case 2: return getUDisp16(delta);
    542 //..       case 1: return getUChar(delta);
    543 //..       default: vpanic("getUDisp(x86)");
    544 //..    }
    545 //..    return 0; /*notreached*/
    546 //.. }
    547 
    548 
    549 /* Get a byte value out of the insn stream and sign-extend to 64
    550    bits. */
    551 static Long getSDisp8 ( Long delta )
    552 {
    553    return extend_s_8to64( guest_code[delta] );
    554 }
    555 
    556 /* Get a 16-bit value out of the insn stream and sign-extend to 64
    557    bits. */
    558 static Long getSDisp16 ( Long delta )
    559 {
    560    UInt v = guest_code[delta+1]; v <<= 8;
    561    v |= guest_code[delta+0];
    562    return extend_s_16to64( (UShort)v );
    563 }
    564 
    565 /* Get a 32-bit value out of the insn stream and sign-extend to 64
    566    bits. */
    567 static Long getSDisp32 ( Long delta )
    568 {
    569    UInt v = guest_code[delta+3]; v <<= 8;
    570    v |= guest_code[delta+2]; v <<= 8;
    571    v |= guest_code[delta+1]; v <<= 8;
    572    v |= guest_code[delta+0];
    573    return extend_s_32to64( v );
    574 }
    575 
    576 /* Get a 64-bit value out of the insn stream. */
    577 static Long getDisp64 ( Long delta )
    578 {
    579    ULong v = 0;
    580    v |= guest_code[delta+7]; v <<= 8;
    581    v |= guest_code[delta+6]; v <<= 8;
    582    v |= guest_code[delta+5]; v <<= 8;
    583    v |= guest_code[delta+4]; v <<= 8;
    584    v |= guest_code[delta+3]; v <<= 8;
    585    v |= guest_code[delta+2]; v <<= 8;
    586    v |= guest_code[delta+1]; v <<= 8;
    587    v |= guest_code[delta+0];
    588    return v;
    589 }
    590 
    591 /* Note: because AMD64 doesn't allow 64-bit literals, it is an error
    592    if this is called with size==8.  Should not happen. */
    593 static Long getSDisp ( Int size, Long delta )
    594 {
    595    switch (size) {
    596       case 4: return getSDisp32(delta);
    597       case 2: return getSDisp16(delta);
    598       case 1: return getSDisp8(delta);
    599       default: vpanic("getSDisp(amd64)");
    600   }
    601 }
    602 
    603 static ULong mkSizeMask ( Int sz )
    604 {
    605    switch (sz) {
    606       case 1: return 0x00000000000000FFULL;
    607       case 2: return 0x000000000000FFFFULL;
    608       case 4: return 0x00000000FFFFFFFFULL;
    609       case 8: return 0xFFFFFFFFFFFFFFFFULL;
    610       default: vpanic("mkSzMask(amd64)");
    611    }
    612 }
    613 
    614 static Int imin ( Int a, Int b )
    615 {
    616    return (a < b) ? a : b;
    617 }
    618 
    619 static IRType szToITy ( Int n )
    620 {
    621    switch (n) {
    622       case 1: return Ity_I8;
    623       case 2: return Ity_I16;
    624       case 4: return Ity_I32;
    625       case 8: return Ity_I64;
    626       default: vex_printf("\nszToITy(%d)\n", n);
    627                vpanic("szToITy(amd64)");
    628    }
    629 }
    630 
    631 
    632 /*------------------------------------------------------------*/
    633 /*--- For dealing with prefixes.                           ---*/
    634 /*------------------------------------------------------------*/
    635 
    636 /* The idea is to pass around an int holding a bitmask summarising
    637    info from the prefixes seen on the current instruction, including
    638    info from the REX byte.  This info is used in various places, but
    639    most especially when making sense of register fields in
    640    instructions.
    641 
    642    The top 8 bits of the prefix are 0x55, just as a hacky way to
    643    ensure it really is a valid prefix.
    644 
    645    Things you can safely assume about a well-formed prefix:
    646    * at most one segment-override bit (CS,DS,ES,FS,GS,SS) is set.
    647    * if REX is not present then REXW,REXR,REXX,REXB will read
    648      as zero.
    649    * F2 and F3 will not both be 1.
    650 */
    651 
    652 typedef UInt  Prefix;
    653 
    654 #define PFX_ASO    (1<<0)    /* address-size override present (0x67) */
    655 #define PFX_66     (1<<1)    /* operand-size override-to-16 present (0x66) */
    656 #define PFX_REX    (1<<2)    /* REX byte present (0x40 to 0x4F) */
    657 #define PFX_REXW   (1<<3)    /* REX W bit, if REX present, else 0 */
    658 #define PFX_REXR   (1<<4)    /* REX R bit, if REX present, else 0 */
    659 #define PFX_REXX   (1<<5)    /* REX X bit, if REX present, else 0 */
    660 #define PFX_REXB   (1<<6)    /* REX B bit, if REX present, else 0 */
    661 #define PFX_LOCK   (1<<7)    /* bus LOCK prefix present (0xF0) */
#define PFX_F2     (1<<8)    /* REPNE/REPNZ prefix present (0xF2) */
#define PFX_F3     (1<<9)    /* REP/REPE/REPZ prefix present (0xF3) */
    664 #define PFX_CS     (1<<10)   /* CS segment prefix present (0x2E) */
    665 #define PFX_DS     (1<<11)   /* DS segment prefix present (0x3E) */
    666 #define PFX_ES     (1<<12)   /* ES segment prefix present (0x26) */
    667 #define PFX_FS     (1<<13)   /* FS segment prefix present (0x64) */
    668 #define PFX_GS     (1<<14)   /* GS segment prefix present (0x65) */
    669 #define PFX_SS     (1<<15)   /* SS segment prefix present (0x36) */
    670 #define PFX_VEX    (1<<16)   /* VEX prefix present (0xC4 or 0xC5) */
    671 #define PFX_VEXL   (1<<17)   /* VEX L bit, if VEX present, else 0 */
    672 /* The extra register field VEX.vvvv is encoded (after not-ing it) as
    673    PFX_VEXnV3 .. PFX_VEXnV0, so these must occupy adjacent bit
    674    positions. */
    675 #define PFX_VEXnV0 (1<<18)   /* ~VEX vvvv[0], if VEX present, else 0 */
    676 #define PFX_VEXnV1 (1<<19)   /* ~VEX vvvv[1], if VEX present, else 0 */
    677 #define PFX_VEXnV2 (1<<20)   /* ~VEX vvvv[2], if VEX present, else 0 */
    678 #define PFX_VEXnV3 (1<<21)   /* ~VEX vvvv[3], if VEX present, else 0 */
    679 
    680 
    681 #define PFX_EMPTY 0x55000000
    682 
    683 static Bool IS_VALID_PFX ( Prefix pfx ) {
    684    return toBool((pfx & 0xFF000000) == PFX_EMPTY);
    685 }
    686 
    687 static Bool haveREX ( Prefix pfx ) {
    688    return toBool(pfx & PFX_REX);
    689 }
    690 
    691 static Int getRexW ( Prefix pfx ) {
    692    return (pfx & PFX_REXW) ? 1 : 0;
    693 }
    694 static Int getRexR ( Prefix pfx ) {
    695    return (pfx & PFX_REXR) ? 1 : 0;
    696 }
    697 static Int getRexX ( Prefix pfx ) {
    698    return (pfx & PFX_REXX) ? 1 : 0;
    699 }
    700 static Int getRexB ( Prefix pfx ) {
    701    return (pfx & PFX_REXB) ? 1 : 0;
    702 }
    703 
    704 /* Check a prefix doesn't have F2 or F3 set in it, since usually that
    705    completely changes what instruction it really is. */
    706 static Bool haveF2orF3 ( Prefix pfx ) {
    707    return toBool((pfx & (PFX_F2|PFX_F3)) > 0);
    708 }
    709 static Bool haveF2andF3 ( Prefix pfx ) {
    710    return toBool((pfx & (PFX_F2|PFX_F3)) == (PFX_F2|PFX_F3));
    711 }
    712 static Bool haveF2 ( Prefix pfx ) {
    713    return toBool((pfx & PFX_F2) > 0);
    714 }
    715 static Bool haveF3 ( Prefix pfx ) {
    716    return toBool((pfx & PFX_F3) > 0);
    717 }
    718 
    719 static Bool have66 ( Prefix pfx ) {
    720    return toBool((pfx & PFX_66) > 0);
    721 }
    722 static Bool haveASO ( Prefix pfx ) {
    723    return toBool((pfx & PFX_ASO) > 0);
    724 }
    725 static Bool haveLOCK ( Prefix pfx ) {
    726    return toBool((pfx & PFX_LOCK) > 0);
    727 }
    728 
    729 /* Return True iff pfx has 66 set and F2 and F3 clear */
    730 static Bool have66noF2noF3 ( Prefix pfx )
    731 {
    732   return
    733      toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_66);
    734 }
    735 
    736 /* Return True iff pfx has F2 set and 66 and F3 clear */
    737 static Bool haveF2no66noF3 ( Prefix pfx )
    738 {
    739   return
    740      toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_F2);
    741 }
    742 
    743 /* Return True iff pfx has F3 set and 66 and F2 clear */
    744 static Bool haveF3no66noF2 ( Prefix pfx )
    745 {
    746   return
    747      toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_F3);
    748 }
    749 
    750 /* Return True iff pfx has F3 set and F2 clear */
    751 static Bool haveF3noF2 ( Prefix pfx )
    752 {
    753   return
    754      toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F3);
    755 }
    756 
    757 /* Return True iff pfx has F2 set and F3 clear */
    758 static Bool haveF2noF3 ( Prefix pfx )
    759 {
    760   return
    761      toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F2);
    762 }
    763 
    764 /* Return True iff pfx has 66, F2 and F3 clear */
    765 static Bool haveNo66noF2noF3 ( Prefix pfx )
    766 {
    767   return
    768      toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == 0);
    769 }
    770 
    771 /* Return True iff pfx has any of 66, F2 and F3 set */
    772 static Bool have66orF2orF3 ( Prefix pfx )
    773 {
    774   return toBool( ! haveNo66noF2noF3(pfx) );
    775 }
    776 
    777 /* Return True iff pfx has 66 or F3 set */
    778 static Bool have66orF3 ( Prefix pfx )
    779 {
    780    return toBool((pfx & (PFX_66|PFX_F3)) > 0);
    781 }
    782 
    783 /* Clear all the segment-override bits in a prefix. */
    784 static Prefix clearSegBits ( Prefix p )
    785 {
    786    return
    787       p & ~(PFX_CS | PFX_DS | PFX_ES | PFX_FS | PFX_GS | PFX_SS);
    788 }
    789 
    790 /* Get the (inverted, hence back to "normal") VEX.vvvv field. */
    791 static UInt getVexNvvvv ( Prefix pfx ) {
    792    UInt r = (UInt)pfx;
    793    r /= (UInt)PFX_VEXnV0; /* pray this turns into a shift */
    794    return r & 0xF;
    795 }
    796 
    797 static Bool haveVEX ( Prefix pfx ) {
    798    return toBool(pfx & PFX_VEX);
    799 }
    800 
    801 static Int getVexL ( Prefix pfx ) {
    802    return (pfx & PFX_VEXL) ? 1 : 0;
    803 }
    804 
    805 
    806 /*------------------------------------------------------------*/
    807 /*--- For dealing with escapes                             ---*/
    808 /*------------------------------------------------------------*/
    809 
    810 
/* Escapes come after the prefixes, but before the primary opcode
   byte.  They escape the primary opcode byte into a bigger space.
   The 0xF0000000 isn't significant, except so as to make it not
   overlap valid Prefix values, for sanity checking.
*/

typedef
   enum {
      ESC_NONE=0xF0000000, // no escape: single-byte opcode map
      ESC_0F,              // 0F        (two-byte opcode map)
      ESC_0F38,            // 0F 38     (three-byte opcode map)
      ESC_0F3A             // 0F 3A     (three-byte opcode map)
   }
   Escape;
    825 
    826 
    827 /*------------------------------------------------------------*/
    828 /*--- For dealing with integer registers                   ---*/
    829 /*------------------------------------------------------------*/
    830 
    831 /* This is somewhat complex.  The rules are:
    832 
    833    For 64, 32 and 16 bit register references, the e or g fields in the
    834    modrm bytes supply the low 3 bits of the register number.  The
    835    fourth (most-significant) bit of the register number is supplied by
    836    the REX byte, if it is present; else that bit is taken to be zero.
    837 
    838    The REX.R bit supplies the high bit corresponding to the g register
    839    field, and the REX.B bit supplies the high bit corresponding to the
    840    e register field (when the mod part of modrm indicates that modrm's
    841    e component refers to a register and not to memory).
    842 
    843    The REX.X bit supplies a high register bit for certain registers
    844    in SIB address modes, and is generally rarely used.
    845 
    846    For 8 bit register references, the presence of the REX byte itself
    847    has significance.  If there is no REX present, then the 3-bit
    848    number extracted from the modrm e or g field is treated as an index
    849    into the sequence %al %cl %dl %bl %ah %ch %dh %bh -- that is, the
    850    old x86 encoding scheme.
    851 
    852    But if there is a REX present, the register reference is
    853    interpreted in the same way as for 64/32/16-bit references: a high
    854    bit is extracted from REX, giving a 4-bit number, and the denoted
    855    register is the lowest 8 bits of the 16 integer registers denoted
    856    by the number.  In particular, values 3 through 7 of this sequence
    857    do not refer to %ah %ch %dh %bh but instead to the lowest 8 bits of
    858    %rsp %rbp %rsi %rdi.
    859 
    860    The REX.W bit has no bearing at all on register numbers.  Instead
    861    its presence indicates that the operand size is to be overridden
    862    from its default value (32 bits) to 64 bits instead.  This is in
    863    the same fashion that an 0x66 prefix indicates the operand size is
    864    to be overridden from 32 bits down to 16 bits.  When both REX.W and
    865    0x66 are present there is a conflict, and REX.W takes precedence.
    866 
    867    Rather than try to handle this complexity using a single huge
    868    function, several smaller ones are provided.  The aim is to make it
    869    as difficult as possible to screw up register decoding in a subtle
    870    and hard-to-track-down way.
    871 
    872    Because these routines fish around in the host's memory (that is,
    873    in the guest state area) for sub-parts of guest registers, their
    874    correctness depends on the host's endianness.  So far these
    875    routines only work for little-endian hosts.  Those for which
    876    endianness is important have assertions to ensure sanity.
    877 */
    878 
    879 
/* About the simplest question you can ask: where do the 64-bit
   integer registers live (in the guest state) ? */

static Int integerGuestReg64Offset ( UInt reg )
{
   /* Map a 4-bit register number (R_RAX .. R_R15) to the byte offset
      of that register in the guest state.  Any other value indicates
      a decoder bug, hence the panic. */
   switch (reg) {
      case R_RAX: return OFFB_RAX;
      case R_RCX: return OFFB_RCX;
      case R_RDX: return OFFB_RDX;
      case R_RBX: return OFFB_RBX;
      case R_RSP: return OFFB_RSP;
      case R_RBP: return OFFB_RBP;
      case R_RSI: return OFFB_RSI;
      case R_RDI: return OFFB_RDI;
      case R_R8:  return OFFB_R8;
      case R_R9:  return OFFB_R9;
      case R_R10: return OFFB_R10;
      case R_R11: return OFFB_R11;
      case R_R12: return OFFB_R12;
      case R_R13: return OFFB_R13;
      case R_R14: return OFFB_R14;
      case R_R15: return OFFB_R15;
      default: vpanic("integerGuestReg64Offset(amd64)");
   }
}
    905 
    906 
    907 /* Produce the name of an integer register, for printing purposes.
    908    reg is a number in the range 0 .. 15 that has been generated from a
    909    3-bit reg-field number and a REX extension bit.  irregular denotes
    910    the case where sz==1 and no REX byte is present. */
    911 
    912 static
    913 const HChar* nameIReg ( Int sz, UInt reg, Bool irregular )
    914 {
    915    static const HChar* ireg64_names[16]
    916      = { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
    917          "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" };
    918    static const HChar* ireg32_names[16]
    919      = { "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
    920          "%r8d", "%r9d", "%r10d","%r11d","%r12d","%r13d","%r14d","%r15d" };
    921    static const HChar* ireg16_names[16]
    922      = { "%ax",  "%cx",  "%dx",  "%bx",  "%sp",  "%bp",  "%si",  "%di",
    923          "%r8w", "%r9w", "%r10w","%r11w","%r12w","%r13w","%r14w","%r15w" };
    924    static const HChar* ireg8_names[16]
    925      = { "%al",  "%cl",  "%dl",  "%bl",  "%spl", "%bpl", "%sil", "%dil",
    926          "%r8b", "%r9b", "%r10b","%r11b","%r12b","%r13b","%r14b","%r15b" };
    927    static const HChar* ireg8_irregular[8]
    928      = { "%al", "%cl", "%dl", "%bl", "%ah", "%ch", "%dh", "%bh" };
    929 
    930    vassert(reg < 16);
    931    if (sz == 1) {
    932       if (irregular)
    933          vassert(reg < 8);
    934    } else {
    935       vassert(irregular == False);
    936    }
    937 
    938    switch (sz) {
    939       case 8: return ireg64_names[reg];
    940       case 4: return ireg32_names[reg];
    941       case 2: return ireg16_names[reg];
    942       case 1: if (irregular) {
    943                  return ireg8_irregular[reg];
    944               } else {
    945                  return ireg8_names[reg];
    946               }
    947       default: vpanic("nameIReg(amd64)");
    948    }
    949 }
    950 
/* Using the same argument conventions as nameIReg, produce the
   guest state offset of an integer register. */

static
Int offsetIReg ( Int sz, UInt reg, Bool irregular )
{
   vassert(reg < 16);
   if (sz == 1) {
      if (irregular)
         vassert(reg < 8);
   } else {
      vassert(irregular == False);
   }

   /* Deal with irregular case -- sz==1 and no REX present */
   if (sz == 1 && irregular) {
      /* Without a REX prefix, byte-register encodings 4..7 denote
         %ah %ch %dh %bh, i.e. byte 1 of %rax %rcx %rdx %rbx on a
         little-endian host -- hence the "1+" below. */
      switch (reg) {
         case R_RSP: return 1+ OFFB_RAX;
         case R_RBP: return 1+ OFFB_RCX;
         case R_RSI: return 1+ OFFB_RDX;
         case R_RDI: return 1+ OFFB_RBX;
         default:    break; /* use the normal case */
      }
   }

   /* Normal case */
   return integerGuestReg64Offset(reg);
}
    979 
    980 
    981 /* Read the %CL register :: Ity_I8, for shift/rotate operations. */
    982 
    983 static IRExpr* getIRegCL ( void )
    984 {
    985    vassert(host_endness == VexEndnessLE);
    986    return IRExpr_Get( OFFB_RCX, Ity_I8 );
    987 }
    988 
    989 
    990 /* Write to the %AH register. */
    991 
    992 static void putIRegAH ( IRExpr* e )
    993 {
    994    vassert(host_endness == VexEndnessLE);
    995    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I8);
    996    stmt( IRStmt_Put( OFFB_RAX+1, e ) );
    997 }
    998 
    999 
   1000 /* Read/write various widths of %RAX, as it has various
   1001    special-purpose uses. */
   1002 
   1003 static const HChar* nameIRegRAX ( Int sz )
   1004 {
   1005    switch (sz) {
   1006       case 1: return "%al";
   1007       case 2: return "%ax";
   1008       case 4: return "%eax";
   1009       case 8: return "%rax";
   1010       default: vpanic("nameIRegRAX(amd64)");
   1011    }
   1012 }
   1013 
   1014 static IRExpr* getIRegRAX ( Int sz )
   1015 {
   1016    vassert(host_endness == VexEndnessLE);
   1017    switch (sz) {
   1018       case 1: return IRExpr_Get( OFFB_RAX, Ity_I8 );
   1019       case 2: return IRExpr_Get( OFFB_RAX, Ity_I16 );
   1020       case 4: return unop(Iop_64to32, IRExpr_Get( OFFB_RAX, Ity_I64 ));
   1021       case 8: return IRExpr_Get( OFFB_RAX, Ity_I64 );
   1022       default: vpanic("getIRegRAX(amd64)");
   1023    }
   1024 }
   1025 
   1026 static void putIRegRAX ( Int sz, IRExpr* e )
   1027 {
   1028    IRType ty = typeOfIRExpr(irsb->tyenv, e);
   1029    vassert(host_endness == VexEndnessLE);
   1030    switch (sz) {
   1031       case 8: vassert(ty == Ity_I64);
   1032               stmt( IRStmt_Put( OFFB_RAX, e ));
   1033               break;
   1034       case 4: vassert(ty == Ity_I32);
   1035               stmt( IRStmt_Put( OFFB_RAX, unop(Iop_32Uto64,e) ));
   1036               break;
   1037       case 2: vassert(ty == Ity_I16);
   1038               stmt( IRStmt_Put( OFFB_RAX, e ));
   1039               break;
   1040       case 1: vassert(ty == Ity_I8);
   1041               stmt( IRStmt_Put( OFFB_RAX, e ));
   1042               break;
   1043       default: vpanic("putIRegRAX(amd64)");
   1044    }
   1045 }
   1046 
   1047 
   1048 /* Read/write various widths of %RDX, as it has various
   1049    special-purpose uses. */
   1050 
   1051 static const HChar* nameIRegRDX ( Int sz )
   1052 {
   1053    switch (sz) {
   1054       case 1: return "%dl";
   1055       case 2: return "%dx";
   1056       case 4: return "%edx";
   1057       case 8: return "%rdx";
   1058       default: vpanic("nameIRegRDX(amd64)");
   1059    }
   1060 }
   1061 
   1062 static IRExpr* getIRegRDX ( Int sz )
   1063 {
   1064    vassert(host_endness == VexEndnessLE);
   1065    switch (sz) {
   1066       case 1: return IRExpr_Get( OFFB_RDX, Ity_I8 );
   1067       case 2: return IRExpr_Get( OFFB_RDX, Ity_I16 );
   1068       case 4: return unop(Iop_64to32, IRExpr_Get( OFFB_RDX, Ity_I64 ));
   1069       case 8: return IRExpr_Get( OFFB_RDX, Ity_I64 );
   1070       default: vpanic("getIRegRDX(amd64)");
   1071    }
   1072 }
   1073 
   1074 static void putIRegRDX ( Int sz, IRExpr* e )
   1075 {
   1076    vassert(host_endness == VexEndnessLE);
   1077    vassert(typeOfIRExpr(irsb->tyenv, e) == szToITy(sz));
   1078    switch (sz) {
   1079       case 8: stmt( IRStmt_Put( OFFB_RDX, e ));
   1080               break;
   1081       case 4: stmt( IRStmt_Put( OFFB_RDX, unop(Iop_32Uto64,e) ));
   1082               break;
   1083       case 2: stmt( IRStmt_Put( OFFB_RDX, e ));
   1084               break;
   1085       case 1: stmt( IRStmt_Put( OFFB_RDX, e ));
   1086               break;
   1087       default: vpanic("putIRegRDX(amd64)");
   1088    }
   1089 }
   1090 
   1091 
   1092 /* Simplistic functions to deal with the integer registers as a
   1093    straightforward bank of 16 64-bit regs. */
   1094 
   1095 static IRExpr* getIReg64 ( UInt regno )
   1096 {
   1097    return IRExpr_Get( integerGuestReg64Offset(regno),
   1098                       Ity_I64 );
   1099 }
   1100 
   1101 static void putIReg64 ( UInt regno, IRExpr* e )
   1102 {
   1103    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   1104    stmt( IRStmt_Put( integerGuestReg64Offset(regno), e ) );
   1105 }
   1106 
   1107 static const HChar* nameIReg64 ( UInt regno )
   1108 {
   1109    return nameIReg( 8, regno, False );
   1110 }
   1111 
   1112 
   1113 /* Simplistic functions to deal with the lower halves of integer
   1114    registers as a straightforward bank of 16 32-bit regs. */
   1115 
   1116 static IRExpr* getIReg32 ( UInt regno )
   1117 {
   1118    vassert(host_endness == VexEndnessLE);
   1119    return unop(Iop_64to32,
   1120                IRExpr_Get( integerGuestReg64Offset(regno),
   1121                            Ity_I64 ));
   1122 }
   1123 
   1124 static void putIReg32 ( UInt regno, IRExpr* e )
   1125 {
   1126    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   1127    stmt( IRStmt_Put( integerGuestReg64Offset(regno),
   1128                      unop(Iop_32Uto64,e) ) );
   1129 }
   1130 
   1131 static const HChar* nameIReg32 ( UInt regno )
   1132 {
   1133    return nameIReg( 4, regno, False );
   1134 }
   1135 
   1136 
   1137 /* Simplistic functions to deal with the lower quarters of integer
   1138    registers as a straightforward bank of 16 16-bit regs. */
   1139 
   1140 static IRExpr* getIReg16 ( UInt regno )
   1141 {
   1142    vassert(host_endness == VexEndnessLE);
   1143    return IRExpr_Get( integerGuestReg64Offset(regno),
   1144                       Ity_I16 );
   1145 }
   1146 
   1147 static void putIReg16 ( UInt regno, IRExpr* e )
   1148 {
   1149    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
   1150    stmt( IRStmt_Put( integerGuestReg64Offset(regno),
   1151                      unop(Iop_16Uto64,e) ) );
   1152 }
   1153 
   1154 static const HChar* nameIReg16 ( UInt regno )
   1155 {
   1156    return nameIReg( 2, regno, False );
   1157 }
   1158 
   1159 
   1160 /* Sometimes what we know is a 3-bit register number, a REX byte, and
   1161    which field of the REX byte is to be used to extend to a 4-bit
   1162    number.  These functions cater for that situation.
   1163 */
   1164 static IRExpr* getIReg64rexX ( Prefix pfx, UInt lo3bits )
   1165 {
   1166    vassert(lo3bits < 8);
   1167    vassert(IS_VALID_PFX(pfx));
   1168    return getIReg64( lo3bits | (getRexX(pfx) << 3) );
   1169 }
   1170 
   1171 static const HChar* nameIReg64rexX ( Prefix pfx, UInt lo3bits )
   1172 {
   1173    vassert(lo3bits < 8);
   1174    vassert(IS_VALID_PFX(pfx));
   1175    return nameIReg( 8, lo3bits | (getRexX(pfx) << 3), False );
   1176 }
   1177 
   1178 static const HChar* nameIRegRexB ( Int sz, Prefix pfx, UInt lo3bits )
   1179 {
   1180    vassert(lo3bits < 8);
   1181    vassert(IS_VALID_PFX(pfx));
   1182    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   1183    return nameIReg( sz, lo3bits | (getRexB(pfx) << 3),
   1184                         toBool(sz==1 && !haveREX(pfx)) );
   1185 }
   1186 
   1187 static IRExpr* getIRegRexB ( Int sz, Prefix pfx, UInt lo3bits )
   1188 {
   1189    vassert(lo3bits < 8);
   1190    vassert(IS_VALID_PFX(pfx));
   1191    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   1192    if (sz == 4) {
   1193       sz = 8;
   1194       return unop(Iop_64to32,
   1195                   IRExpr_Get(
   1196                      offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
   1197                                      False/*!irregular*/ ),
   1198                      szToITy(sz)
   1199                  )
   1200              );
   1201    } else {
   1202       return IRExpr_Get(
   1203                 offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
   1204                                 toBool(sz==1 && !haveREX(pfx)) ),
   1205                 szToITy(sz)
   1206              );
   1207    }
   1208 }
   1209 
   1210 static void putIRegRexB ( Int sz, Prefix pfx, UInt lo3bits, IRExpr* e )
   1211 {
   1212    vassert(lo3bits < 8);
   1213    vassert(IS_VALID_PFX(pfx));
   1214    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   1215    vassert(typeOfIRExpr(irsb->tyenv, e) == szToITy(sz));
   1216    stmt( IRStmt_Put(
   1217             offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
   1218                             toBool(sz==1 && !haveREX(pfx)) ),
   1219             sz==4 ? unop(Iop_32Uto64,e) : e
   1220    ));
   1221 }
   1222 
   1223 
   1224 /* Functions for getting register numbers from modrm bytes and REX
   1225    when we don't have to consider the complexities of integer subreg
   1226    accesses.
   1227 */
   1228 /* Extract the g reg field from a modRM byte, and augment it using the
   1229    REX.R bit from the supplied REX byte.  The R bit usually is
   1230    associated with the g register field.
   1231 */
   1232 static UInt gregOfRexRM ( Prefix pfx, UChar mod_reg_rm )
   1233 {
   1234    Int reg = (Int)( (mod_reg_rm >> 3) & 7 );
   1235    reg += (pfx & PFX_REXR) ? 8 : 0;
   1236    return reg;
   1237 }
   1238 
   1239 /* Extract the e reg field from a modRM byte, and augment it using the
   1240    REX.B bit from the supplied REX byte.  The B bit usually is
   1241    associated with the e register field (when modrm indicates e is a
   1242    register, that is).
   1243 */
   1244 static UInt eregOfRexRM ( Prefix pfx, UChar mod_reg_rm )
   1245 {
   1246    Int rm;
   1247    vassert(epartIsReg(mod_reg_rm));
   1248    rm = (Int)(mod_reg_rm & 0x7);
   1249    rm += (pfx & PFX_REXB) ? 8 : 0;
   1250    return rm;
   1251 }
   1252 
   1253 
   1254 /* General functions for dealing with integer register access. */
   1255 
   1256 /* Produce the guest state offset for a reference to the 'g' register
   1257    field in a modrm byte, taking into account REX (or its absence),
   1258    and the size of the access.
   1259 */
   1260 static UInt offsetIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
   1261 {
   1262    UInt reg;
   1263    vassert(host_endness == VexEndnessLE);
   1264    vassert(IS_VALID_PFX(pfx));
   1265    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   1266    reg = gregOfRexRM( pfx, mod_reg_rm );
   1267    return offsetIReg( sz, reg, toBool(sz == 1 && !haveREX(pfx)) );
   1268 }
   1269 
   1270 static
   1271 IRExpr* getIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
   1272 {
   1273    if (sz == 4) {
   1274       sz = 8;
   1275       return unop(Iop_64to32,
   1276                   IRExpr_Get( offsetIRegG( sz, pfx, mod_reg_rm ),
   1277                               szToITy(sz) ));
   1278    } else {
   1279       return IRExpr_Get( offsetIRegG( sz, pfx, mod_reg_rm ),
   1280                          szToITy(sz) );
   1281    }
   1282 }
   1283 
   1284 static
   1285 void putIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm, IRExpr* e )
   1286 {
   1287    vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   1288    if (sz == 4) {
   1289       e = unop(Iop_32Uto64,e);
   1290    }
   1291    stmt( IRStmt_Put( offsetIRegG( sz, pfx, mod_reg_rm ), e ) );
   1292 }
   1293 
   1294 static
   1295 const HChar* nameIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
   1296 {
   1297    return nameIReg( sz, gregOfRexRM(pfx,mod_reg_rm),
   1298                         toBool(sz==1 && !haveREX(pfx)) );
   1299 }
   1300 
   1301 
   1302 static
   1303 IRExpr* getIRegV ( Int sz, Prefix pfx )
   1304 {
   1305    if (sz == 4) {
   1306       sz = 8;
   1307       return unop(Iop_64to32,
   1308                   IRExpr_Get( offsetIReg( sz, getVexNvvvv(pfx), False ),
   1309                               szToITy(sz) ));
   1310    } else {
   1311       return IRExpr_Get( offsetIReg( sz, getVexNvvvv(pfx), False ),
   1312                          szToITy(sz) );
   1313    }
   1314 }
   1315 
   1316 static
   1317 void putIRegV ( Int sz, Prefix pfx, IRExpr* e )
   1318 {
   1319    vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   1320    if (sz == 4) {
   1321       e = unop(Iop_32Uto64,e);
   1322    }
   1323    stmt( IRStmt_Put( offsetIReg( sz, getVexNvvvv(pfx), False ), e ) );
   1324 }
   1325 
   1326 static
   1327 const HChar* nameIRegV ( Int sz, Prefix pfx )
   1328 {
   1329    return nameIReg( sz, getVexNvvvv(pfx), False );
   1330 }
   1331 
   1332 
   1333 
   1334 /* Produce the guest state offset for a reference to the 'e' register
   1335    field in a modrm byte, taking into account REX (or its absence),
   1336    and the size of the access.  eregOfRexRM will assert if mod_reg_rm
   1337    denotes a memory access rather than a register access.
   1338 */
   1339 static UInt offsetIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
   1340 {
   1341    UInt reg;
   1342    vassert(host_endness == VexEndnessLE);
   1343    vassert(IS_VALID_PFX(pfx));
   1344    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   1345    reg = eregOfRexRM( pfx, mod_reg_rm );
   1346    return offsetIReg( sz, reg, toBool(sz == 1 && !haveREX(pfx)) );
   1347 }
   1348 
   1349 static
   1350 IRExpr* getIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
   1351 {
   1352    if (sz == 4) {
   1353       sz = 8;
   1354       return unop(Iop_64to32,
   1355                   IRExpr_Get( offsetIRegE( sz, pfx, mod_reg_rm ),
   1356                               szToITy(sz) ));
   1357    } else {
   1358       return IRExpr_Get( offsetIRegE( sz, pfx, mod_reg_rm ),
   1359                          szToITy(sz) );
   1360    }
   1361 }
   1362 
   1363 static
   1364 void putIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm, IRExpr* e )
   1365 {
   1366    vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   1367    if (sz == 4) {
   1368       e = unop(Iop_32Uto64,e);
   1369    }
   1370    stmt( IRStmt_Put( offsetIRegE( sz, pfx, mod_reg_rm ), e ) );
   1371 }
   1372 
   1373 static
   1374 const HChar* nameIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
   1375 {
   1376    return nameIReg( sz, eregOfRexRM(pfx,mod_reg_rm),
   1377                         toBool(sz==1 && !haveREX(pfx)) );
   1378 }
   1379 
   1380 
   1381 /*------------------------------------------------------------*/
   1382 /*--- For dealing with XMM registers                       ---*/
   1383 /*------------------------------------------------------------*/
   1384 
   1385 static Int ymmGuestRegOffset ( UInt ymmreg )
   1386 {
   1387    switch (ymmreg) {
   1388       case 0:  return OFFB_YMM0;
   1389       case 1:  return OFFB_YMM1;
   1390       case 2:  return OFFB_YMM2;
   1391       case 3:  return OFFB_YMM3;
   1392       case 4:  return OFFB_YMM4;
   1393       case 5:  return OFFB_YMM5;
   1394       case 6:  return OFFB_YMM6;
   1395       case 7:  return OFFB_YMM7;
   1396       case 8:  return OFFB_YMM8;
   1397       case 9:  return OFFB_YMM9;
   1398       case 10: return OFFB_YMM10;
   1399       case 11: return OFFB_YMM11;
   1400       case 12: return OFFB_YMM12;
   1401       case 13: return OFFB_YMM13;
   1402       case 14: return OFFB_YMM14;
   1403       case 15: return OFFB_YMM15;
   1404       default: vpanic("ymmGuestRegOffset(amd64)");
   1405    }
   1406 }
   1407 
   1408 static Int xmmGuestRegOffset ( UInt xmmreg )
   1409 {
   1410    /* Correct for little-endian host only. */
   1411    vassert(host_endness == VexEndnessLE);
   1412    return ymmGuestRegOffset( xmmreg );
   1413 }
   1414 
   1415 /* Lanes of vector registers are always numbered from zero being the
   1416    least significant lane (rightmost in the register).  */
   1417 
   1418 static Int xmmGuestRegLane16offset ( UInt xmmreg, Int laneno )
   1419 {
   1420    /* Correct for little-endian host only. */
   1421    vassert(host_endness == VexEndnessLE);
   1422    vassert(laneno >= 0 && laneno < 8);
   1423    return xmmGuestRegOffset( xmmreg ) + 2 * laneno;
   1424 }
   1425 
   1426 static Int xmmGuestRegLane32offset ( UInt xmmreg, Int laneno )
   1427 {
   1428    /* Correct for little-endian host only. */
   1429    vassert(host_endness == VexEndnessLE);
   1430    vassert(laneno >= 0 && laneno < 4);
   1431    return xmmGuestRegOffset( xmmreg ) + 4 * laneno;
   1432 }
   1433 
   1434 static Int xmmGuestRegLane64offset ( UInt xmmreg, Int laneno )
   1435 {
   1436    /* Correct for little-endian host only. */
   1437    vassert(host_endness == VexEndnessLE);
   1438    vassert(laneno >= 0 && laneno < 2);
   1439    return xmmGuestRegOffset( xmmreg ) + 8 * laneno;
   1440 }
   1441 
   1442 static Int ymmGuestRegLane128offset ( UInt ymmreg, Int laneno )
   1443 {
   1444    /* Correct for little-endian host only. */
   1445    vassert(host_endness == VexEndnessLE);
   1446    vassert(laneno >= 0 && laneno < 2);
   1447    return ymmGuestRegOffset( ymmreg ) + 16 * laneno;
   1448 }
   1449 
   1450 static Int ymmGuestRegLane64offset ( UInt ymmreg, Int laneno )
   1451 {
   1452    /* Correct for little-endian host only. */
   1453    vassert(host_endness == VexEndnessLE);
   1454    vassert(laneno >= 0 && laneno < 4);
   1455    return ymmGuestRegOffset( ymmreg ) + 8 * laneno;
   1456 }
   1457 
   1458 static Int ymmGuestRegLane32offset ( UInt ymmreg, Int laneno )
   1459 {
   1460    /* Correct for little-endian host only. */
   1461    vassert(host_endness == VexEndnessLE);
   1462    vassert(laneno >= 0 && laneno < 8);
   1463    return ymmGuestRegOffset( ymmreg ) + 4 * laneno;
   1464 }
   1465 
/* Read a whole XMM register as a 128-bit IR value. */
static IRExpr* getXMMReg ( UInt xmmreg )
{
   return IRExpr_Get( xmmGuestRegOffset(xmmreg), Ity_V128 );
}

/* Read 64-bit lane 'laneno' of an XMM register, as an integer. */
static IRExpr* getXMMRegLane64 ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_I64 );
}

/* Same lane, but typed as a 64-bit float. */
static IRExpr* getXMMRegLane64F ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_F64 );
}

/* Read 32-bit lane 'laneno' of an XMM register, as an integer. */
static IRExpr* getXMMRegLane32 ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_I32 );
}

/* Same lane, but typed as a 32-bit float. */
static IRExpr* getXMMRegLane32F ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_F32 );
}

/* Read 16-bit lane 'laneno' of an XMM register. */
static IRExpr* getXMMRegLane16 ( UInt xmmreg, Int laneno )
{
  return IRExpr_Get( xmmGuestRegLane16offset(xmmreg,laneno), Ity_I16 );
}

/* Write a whole XMM register (128 bits). */
static void putXMMReg ( UInt xmmreg, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
   stmt( IRStmt_Put( xmmGuestRegOffset(xmmreg), e ) );
}

/* Write 64-bit integer lane 'laneno' of an XMM register. */
static void putXMMRegLane64 ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
}

/* Write 64-bit float lane 'laneno' of an XMM register. */
static void putXMMRegLane64F ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
   stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
}

/* Write 32-bit float lane 'laneno' of an XMM register. */
static void putXMMRegLane32F ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
   stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
}

/* Write 32-bit integer lane 'laneno' of an XMM register. */
static void putXMMRegLane32 ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
}
   1525 
/* Read a whole YMM register as a 256-bit IR value. */
static IRExpr* getYMMReg ( UInt xmmreg )
{
   return IRExpr_Get( ymmGuestRegOffset(xmmreg), Ity_V256 );
}

/* Read 128-bit lane 'laneno' of a YMM register. */
static IRExpr* getYMMRegLane128 ( UInt ymmreg, Int laneno )
{
   return IRExpr_Get( ymmGuestRegLane128offset(ymmreg,laneno), Ity_V128 );
}

/* Read 64-bit lane 'laneno' of a YMM register, as an integer. */
static IRExpr* getYMMRegLane64 ( UInt ymmreg, Int laneno )
{
   return IRExpr_Get( ymmGuestRegLane64offset(ymmreg,laneno), Ity_I64 );
}

/* Read 32-bit lane 'laneno' of a YMM register, as an integer. */
static IRExpr* getYMMRegLane32 ( UInt ymmreg, Int laneno )
{
   return IRExpr_Get( ymmGuestRegLane32offset(ymmreg,laneno), Ity_I32 );
}

/* Write a whole YMM register (256 bits). */
static void putYMMReg ( UInt ymmreg, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V256);
   stmt( IRStmt_Put( ymmGuestRegOffset(ymmreg), e ) );
}

/* Write 128-bit lane 'laneno' of a YMM register. */
static void putYMMRegLane128 ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
   stmt( IRStmt_Put( ymmGuestRegLane128offset(ymmreg,laneno), e ) );
}

/* Write 64-bit float lane 'laneno' of a YMM register. */
static void putYMMRegLane64F ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
   stmt( IRStmt_Put( ymmGuestRegLane64offset(ymmreg,laneno), e ) );
}

/* Write 64-bit integer lane 'laneno' of a YMM register. */
static void putYMMRegLane64 ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   stmt( IRStmt_Put( ymmGuestRegLane64offset(ymmreg,laneno), e ) );
}

/* Write 32-bit float lane 'laneno' of a YMM register. */
static void putYMMRegLane32F ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
   stmt( IRStmt_Put( ymmGuestRegLane32offset(ymmreg,laneno), e ) );
}

/* Write 32-bit integer lane 'laneno' of a YMM register. */
static void putYMMRegLane32 ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   stmt( IRStmt_Put( ymmGuestRegLane32offset(ymmreg,laneno), e ) );
}
   1581 
/* Build a 128-bit vector constant from a 16-bit mask (see
   IRConst_V128 for the mask encoding). */
static IRExpr* mkV128 ( UShort mask )
{
   return IRExpr_Const(IRConst_V128(mask));
}

/* Write the low half of a YMM reg and zero out the upper half. */
static void putYMMRegLoAndZU ( UInt ymmreg, IRExpr* e )
{
   /* Low lane first, then zero the high lane -- emission order is
      part of the generated IR. */
   putYMMRegLane128( ymmreg, 0, e );
   putYMMRegLane128( ymmreg, 1, mkV128(0) );
}
   1593 
   1594 static IRExpr* mkAnd1 ( IRExpr* x, IRExpr* y )
   1595 {
   1596    vassert(typeOfIRExpr(irsb->tyenv,x) == Ity_I1);
   1597    vassert(typeOfIRExpr(irsb->tyenv,y) == Ity_I1);
   1598    return unop(Iop_64to1,
   1599                binop(Iop_And64,
   1600                      unop(Iop_1Uto64,x),
   1601                      unop(Iop_1Uto64,y)));
   1602 }
   1603 
/* Generate a compare-and-swap operation, operating on memory at
   'addr'.  The expected value is 'expVal' and the new value is
   'newVal'.  If the operation fails, then transfer control (with a
   no-redir jump (XXX no -- see comment at top of this file)) to
   'restart_point', which is presumably the address of the guest
   instruction again -- retrying, essentially. */
static void casLE ( IRExpr* addr, IRExpr* expVal, IRExpr* newVal,
                    Addr64 restart_point )
{
   IRCAS* cas;
   IRType tyE    = typeOfIRExpr(irsb->tyenv, expVal);
   IRType tyN    = typeOfIRExpr(irsb->tyenv, newVal);
   IRTemp oldTmp = newTemp(tyE);
   IRTemp expTmp = newTemp(tyE);
   /* Expected and new values must have the same (integer) type. */
   vassert(tyE == tyN);
   vassert(tyE == Ity_I64 || tyE == Ity_I32
           || tyE == Ity_I16 || tyE == Ity_I8);
   assign(expTmp, expVal);
   /* Single-element (not double) little-endian CAS: the old memory
      value lands in oldTmp. */
   cas = mkIRCAS( IRTemp_INVALID, oldTmp, Iend_LE, addr,
                  NULL, mkexpr(expTmp), NULL, newVal );
   stmt( IRStmt_CAS(cas) );
   /* If the old value differed from the expected one, the CAS did not
      take effect; jump back to restart_point to retry. */
   stmt( IRStmt_Exit(
            binop( mkSizedOp(tyE,Iop_CasCmpNE8),
                   mkexpr(oldTmp), mkexpr(expTmp) ),
            Ijk_Boring, /*Ijk_NoRedir*/
            IRConst_U64( restart_point ),
            OFFB_RIP
         ));
}
   1633 
   1634 
   1635 /*------------------------------------------------------------*/
   1636 /*--- Helpers for %rflags.                                 ---*/
   1637 /*------------------------------------------------------------*/
   1638 
   1639 /* -------------- Evaluating the flags-thunk. -------------- */
   1640 
   1641 /* Build IR to calculate all the eflags from stored
   1642    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
   1643    Ity_I64. */
   1644 static IRExpr* mk_amd64g_calculate_rflags_all ( void )
   1645 {
   1646    IRExpr** args
   1647       = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
   1648                        IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
   1649                        IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
   1650                        IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   1651    IRExpr* call
   1652       = mkIRExprCCall(
   1653            Ity_I64,
   1654            0/*regparm*/,
   1655            "amd64g_calculate_rflags_all", &amd64g_calculate_rflags_all,
   1656            args
   1657         );
   1658    /* Exclude OP and NDEP from definedness checking.  We're only
   1659       interested in DEP1 and DEP2. */
   1660    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   1661    return call;
   1662 }
   1663 
   1664 /* Build IR to calculate some particular condition from stored
   1665    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
   1666    Ity_Bit. */
   1667 static IRExpr* mk_amd64g_calculate_condition ( AMD64Condcode cond )
   1668 {
   1669    IRExpr** args
   1670       = mkIRExprVec_5( mkU64(cond),
   1671                        IRExpr_Get(OFFB_CC_OP,   Ity_I64),
   1672                        IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
   1673                        IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
   1674                        IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   1675    IRExpr* call
   1676       = mkIRExprCCall(
   1677            Ity_I64,
   1678            0/*regparm*/,
   1679            "amd64g_calculate_condition", &amd64g_calculate_condition,
   1680            args
   1681         );
   1682    /* Exclude the requested condition, OP and NDEP from definedness
   1683       checking.  We're only interested in DEP1 and DEP2. */
   1684    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<1) | (1<<4);
   1685    return unop(Iop_64to1, call);
   1686 }
   1687 
   1688 /* Build IR to calculate just the carry flag from stored
   1689    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression :: Ity_I64. */
   1690 static IRExpr* mk_amd64g_calculate_rflags_c ( void )
   1691 {
   1692    IRExpr** args
   1693       = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
   1694                        IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
   1695                        IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
   1696                        IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   1697    IRExpr* call
   1698       = mkIRExprCCall(
   1699            Ity_I64,
   1700            0/*regparm*/,
   1701            "amd64g_calculate_rflags_c", &amd64g_calculate_rflags_c,
   1702            args
   1703         );
   1704    /* Exclude OP and NDEP from definedness checking.  We're only
   1705       interested in DEP1 and DEP2. */
   1706    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   1707    return call;
   1708 }
   1709 
   1710 
   1711 /* -------------- Building the flags-thunk. -------------- */
   1712 
   1713 /* The machinery in this section builds the flag-thunk following a
   1714    flag-setting operation.  Hence the various setFlags_* functions.
   1715 */
   1716 
   1717 static Bool isAddSub ( IROp op8 )
   1718 {
   1719    return toBool(op8 == Iop_Add8 || op8 == Iop_Sub8);
   1720 }
   1721 
   1722 static Bool isLogic ( IROp op8 )
   1723 {
   1724    return toBool(op8 == Iop_And8 || op8 == Iop_Or8 || op8 == Iop_Xor8);
   1725 }
   1726 
   1727 /* U-widen 1/8/16/32/64 bit int expr to 64. */
   1728 static IRExpr* widenUto64 ( IRExpr* e )
   1729 {
   1730    switch (typeOfIRExpr(irsb->tyenv,e)) {
   1731       case Ity_I64: return e;
   1732       case Ity_I32: return unop(Iop_32Uto64, e);
   1733       case Ity_I16: return unop(Iop_16Uto64, e);
   1734       case Ity_I8:  return unop(Iop_8Uto64, e);
   1735       case Ity_I1:  return unop(Iop_1Uto64, e);
   1736       default: vpanic("widenUto64");
   1737    }
   1738 }
   1739 
/* S-widen 8/16/32/64 bit int expr to 64. */
static IRExpr* widenSto64 ( IRExpr* e )
{
   switch (typeOfIRExpr(irsb->tyenv,e)) {
      case Ity_I64: return e;
      case Ity_I32: return unop(Iop_32Sto64, e);
      case Ity_I16: return unop(Iop_16Sto64, e);
      case Ity_I8:  return unop(Iop_8Sto64, e);
      default: vpanic("widenSto64");
   }
}
   1751 
   1752 /* Narrow 8/16/32/64 bit int expr to 8/16/32/64.  Clearly only some
   1753    of these combinations make sense. */
   1754 static IRExpr* narrowTo ( IRType dst_ty, IRExpr* e )
   1755 {
   1756    IRType src_ty = typeOfIRExpr(irsb->tyenv,e);
   1757    if (src_ty == dst_ty)
   1758       return e;
   1759    if (src_ty == Ity_I32 && dst_ty == Ity_I16)
   1760       return unop(Iop_32to16, e);
   1761    if (src_ty == Ity_I32 && dst_ty == Ity_I8)
   1762       return unop(Iop_32to8, e);
   1763    if (src_ty == Ity_I64 && dst_ty == Ity_I32)
   1764       return unop(Iop_64to32, e);
   1765    if (src_ty == Ity_I64 && dst_ty == Ity_I16)
   1766       return unop(Iop_64to16, e);
   1767    if (src_ty == Ity_I64 && dst_ty == Ity_I8)
   1768       return unop(Iop_64to8, e);
   1769 
   1770    vex_printf("\nsrc, dst tys are: ");
   1771    ppIRType(src_ty);
   1772    vex_printf(", ");
   1773    ppIRType(dst_ty);
   1774    vex_printf("\n");
   1775    vpanic("narrowTo(amd64)");
   1776 }
   1777 
   1778 
   1779 /* Set the flags thunk OP, DEP1 and DEP2 fields.  The supplied op is
   1780    auto-sized up to the real op. */
   1781 
   1782 static
   1783 void setFlags_DEP1_DEP2 ( IROp op8, IRTemp dep1, IRTemp dep2, IRType ty )
   1784 {
   1785    Int ccOp = 0;
   1786    switch (ty) {
   1787       case Ity_I8:  ccOp = 0; break;
   1788       case Ity_I16: ccOp = 1; break;
   1789       case Ity_I32: ccOp = 2; break;
   1790       case Ity_I64: ccOp = 3; break;
   1791       default: vassert(0);
   1792    }
   1793    switch (op8) {
   1794       case Iop_Add8: ccOp += AMD64G_CC_OP_ADDB;   break;
   1795       case Iop_Sub8: ccOp += AMD64G_CC_OP_SUBB;   break;
   1796       default:       ppIROp(op8);
   1797                      vpanic("setFlags_DEP1_DEP2(amd64)");
   1798    }
   1799    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   1800    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dep1))) );
   1801    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(dep2))) );
   1802 }
   1803 
   1804 
   1805 /* Set the OP and DEP1 fields only, and write zero to DEP2. */
   1806 
   1807 static
   1808 void setFlags_DEP1 ( IROp op8, IRTemp dep1, IRType ty )
   1809 {
   1810    Int ccOp = 0;
   1811    switch (ty) {
   1812       case Ity_I8:  ccOp = 0; break;
   1813       case Ity_I16: ccOp = 1; break;
   1814       case Ity_I32: ccOp = 2; break;
   1815       case Ity_I64: ccOp = 3; break;
   1816       default: vassert(0);
   1817    }
   1818    switch (op8) {
   1819       case Iop_Or8:
   1820       case Iop_And8:
   1821       case Iop_Xor8: ccOp += AMD64G_CC_OP_LOGICB; break;
   1822       default:       ppIROp(op8);
   1823                      vpanic("setFlags_DEP1(amd64)");
   1824    }
   1825    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   1826    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dep1))) );
   1827    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
   1828 }
   1829 
   1830 
   1831 /* For shift operations, we put in the result and the undershifted
   1832    result.  Except if the shift amount is zero, the thunk is left
   1833    unchanged. */
   1834 
static void setFlags_DEP1_DEP2_shift ( IROp    op64,
                                       IRTemp  res,
                                       IRTemp  resUS,
                                       IRType  ty,
                                       IRTemp  guard )
{
   /* Size class 0=B, 1=W, 2=L, 3=Q; added to the base thunk op. */
   Int ccOp = 0;
   switch (ty) {
      case Ity_I8:  ccOp = 0; break;
      case Ity_I16: ccOp = 1; break;
      case Ity_I32: ccOp = 2; break;
      case Ity_I64: ccOp = 3; break;
      default: vassert(0);
   }

   vassert(guard);

   /* Both kinds of right shifts are handled by the same thunk
      operation. */
   switch (op64) {
      case Iop_Shr64:
      case Iop_Sar64: ccOp += AMD64G_CC_OP_SHRB; break;
      case Iop_Shl64: ccOp += AMD64G_CC_OP_SHLB; break;
      default:        ppIROp(op64);
                      vpanic("setFlags_DEP1_DEP2_shift(amd64)");
   }

   /* guard :: Ity_I8.  We need to convert it to I1. */
   IRTemp guardB = newTemp(Ity_I1);
   assign( guardB, binop(Iop_CmpNE8, mkexpr(guard), mkU8(0)) );

   /* DEP1 contains the result, DEP2 contains the undershifted value.
      Each Put is predicated on guardB: if the shift amount is zero,
      the previous thunk field value is written back, leaving the
      thunk unchanged as required. */
   stmt( IRStmt_Put( OFFB_CC_OP,
                     IRExpr_ITE( mkexpr(guardB),
                                 mkU64(ccOp),
                                 IRExpr_Get(OFFB_CC_OP,Ity_I64) ) ));
   stmt( IRStmt_Put( OFFB_CC_DEP1,
                     IRExpr_ITE( mkexpr(guardB),
                                 widenUto64(mkexpr(res)),
                                 IRExpr_Get(OFFB_CC_DEP1,Ity_I64) ) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2,
                     IRExpr_ITE( mkexpr(guardB),
                                 widenUto64(mkexpr(resUS)),
                                 IRExpr_Get(OFFB_CC_DEP2,Ity_I64) ) ));
}
   1880 
   1881 
   1882 /* For the inc/dec case, we store in DEP1 the result value and in NDEP
   1883    the former value of the carry flag, which unfortunately we have to
   1884    compute. */
   1885 
static void setFlags_INC_DEC ( Bool inc, IRTemp res, IRType ty )
{
   /* Base op plus size class 0=B, 1=W, 2=L, 3=Q. */
   Int ccOp = inc ? AMD64G_CC_OP_INCB : AMD64G_CC_OP_DECB;

   switch (ty) {
      case Ity_I8:  ccOp += 0; break;
      case Ity_I16: ccOp += 1; break;
      case Ity_I32: ccOp += 2; break;
      case Ity_I64: ccOp += 3; break;
      default: vassert(0);
   }

   /* This has to come first, because calculating the C flag
      may require reading all four thunk fields. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mk_amd64g_calculate_rflags_c()) );
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(res))) );
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
}
   1905 
   1906 
   1907 /* Multiplies are pretty much like add and sub: DEP1 and DEP2 hold the
   1908    two arguments. */
   1909 
   1910 static
   1911 void setFlags_MUL ( IRType ty, IRTemp arg1, IRTemp arg2, ULong base_op )
   1912 {
   1913    switch (ty) {
   1914       case Ity_I8:
   1915          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+0) ) );
   1916          break;
   1917       case Ity_I16:
   1918          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+1) ) );
   1919          break;
   1920       case Ity_I32:
   1921          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+2) ) );
   1922          break;
   1923       case Ity_I64:
   1924          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+3) ) );
   1925          break;
   1926       default:
   1927          vpanic("setFlags_MUL(amd64)");
   1928    }
   1929    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(arg1)) ));
   1930    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(arg2)) ));
   1931 }
   1932 
   1933 
   1934 /* -------------- Condition codes. -------------- */
   1935 
   1936 /* Condition codes, using the AMD encoding.  */
   1937 
static const HChar* name_AMD64Condcode ( AMD64Condcode cond )
{
   /* Disassembly mnemonic for a condition code.  Where x86 has two
      spellings for the same code, the commented-out string is the
      alternative. */
   switch (cond) {
      case AMD64CondO:      return "o";
      case AMD64CondNO:     return "no";
      case AMD64CondB:      return "b";
      case AMD64CondNB:     return "ae"; /*"nb";*/
      case AMD64CondZ:      return "e"; /*"z";*/
      case AMD64CondNZ:     return "ne"; /*"nz";*/
      case AMD64CondBE:     return "be";
      case AMD64CondNBE:    return "a"; /*"nbe";*/
      case AMD64CondS:      return "s";
      case AMD64CondNS:     return "ns";
      case AMD64CondP:      return "p";
      case AMD64CondNP:     return "np";
      case AMD64CondL:      return "l";
      case AMD64CondNL:     return "ge"; /*"nl";*/
      case AMD64CondLE:     return "le";
      case AMD64CondNLE:    return "g"; /*"nle";*/
      case AMD64CondAlways: return "ALWAYS";
      default: vpanic("name_AMD64Condcode");
   }
}
   1961 
   1962 static
   1963 AMD64Condcode positiveIse_AMD64Condcode ( AMD64Condcode  cond,
   1964                                           /*OUT*/Bool*   needInvert )
   1965 {
   1966    vassert(cond >= AMD64CondO && cond <= AMD64CondNLE);
   1967    if (cond & 1) {
   1968       *needInvert = True;
   1969       return cond-1;
   1970    } else {
   1971       *needInvert = False;
   1972       return cond;
   1973    }
   1974 }
   1975 
   1976 
   1977 /* -------------- Helpers for ADD/SUB with carry. -------------- */
   1978 
   1979 /* Given ta1, ta2 and tres, compute tres = ADC(ta1,ta2) and set flags
   1980    appropriately.
   1981 
   1982    Optionally, generate a store for the 'tres' value.  This can either
   1983    be a normal store, or it can be a cas-with-possible-failure style
   1984    store:
   1985 
   1986    if taddr is IRTemp_INVALID, then no store is generated.
   1987 
   1988    if taddr is not IRTemp_INVALID, then a store (using taddr as
   1989    the address) is generated:
   1990 
   1991      if texpVal is IRTemp_INVALID then a normal store is
   1992      generated, and restart_point must be zero (it is irrelevant).
   1993 
   1994      if texpVal is not IRTemp_INVALID then a cas-style store is
   1995      generated.  texpVal is the expected value, restart_point
   1996      is the restart point if the store fails, and texpVal must
   1997      have the same type as tres.
   1998 
   1999 */
static void helper_ADC ( Int sz,
                         IRTemp tres, IRTemp ta1, IRTemp ta2,
                         /* info about optional store: */
                         IRTemp taddr, IRTemp texpVal, Addr64 restart_point )
{
   UInt    thunkOp;
   IRType  ty    = szToITy(sz);
   IRTemp  oldc  = newTemp(Ity_I64);   /* old carry flag, as I64 */
   IRTemp  oldcn = newTemp(ty);        /* old carry, narrowed to ty */
   IROp    plus  = mkSizedOp(ty, Iop_Add8);
   IROp    xor   = mkSizedOp(ty, Iop_Xor8);

   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);

   /* Select the flags-thunk op for this operand size. */
   switch (sz) {
      case 8:  thunkOp = AMD64G_CC_OP_ADCQ; break;
      case 4:  thunkOp = AMD64G_CC_OP_ADCL; break;
      case 2:  thunkOp = AMD64G_CC_OP_ADCW; break;
      case 1:  thunkOp = AMD64G_CC_OP_ADCB; break;
      default: vassert(0);
   }

   /* oldc = old carry flag, 0 or 1 */
   assign( oldc,  binop(Iop_And64,
                        mk_amd64g_calculate_rflags_c(),
                        mkU64(1)) );

   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );

   /* tres = ta1 + ta2 + old carry */
   assign( tres, binop(plus,
                       binop(plus,mkexpr(ta1),mkexpr(ta2)),
                       mkexpr(oldcn)) );

   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
      start of this function. */
   if (taddr != IRTemp_INVALID) {
      if (texpVal == IRTemp_INVALID) {
         /* Plain store; no retry semantics apply. */
         vassert(restart_point == 0);
         storeLE( mkexpr(taddr), mkexpr(tres) );
      } else {
         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
         /* .. and hence 'texpVal' has the same type as 'tres'. */
         casLE( mkexpr(taddr),
                mkexpr(texpVal), mkexpr(tres), restart_point );
      }
   }

   /* Build the flags thunk: DEP1 = first arg, DEP2 = second arg
      XORed with the old carry, NDEP = the old carry itself. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(thunkOp) ) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1))  ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
                                                         mkexpr(oldcn)) )) );
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
}
   2053 
   2054 
   2055 /* Given ta1, ta2 and tres, compute tres = SBB(ta1,ta2) and set flags
   2056    appropriately.  As with helper_ADC, possibly generate a store of
   2057    the result -- see comments on helper_ADC for details.
   2058 */
static void helper_SBB ( Int sz,
                         IRTemp tres, IRTemp ta1, IRTemp ta2,
                         /* info about optional store: */
                         IRTemp taddr, IRTemp texpVal, Addr64 restart_point )
{
   UInt    thunkOp;
   IRType  ty    = szToITy(sz);
   IRTemp  oldc  = newTemp(Ity_I64);   /* old carry flag, as I64 */
   IRTemp  oldcn = newTemp(ty);        /* old carry, narrowed to ty */
   IROp    minus = mkSizedOp(ty, Iop_Sub8);
   IROp    xor   = mkSizedOp(ty, Iop_Xor8);

   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);

   /* Select the flags-thunk op for this operand size. */
   switch (sz) {
      case 8:  thunkOp = AMD64G_CC_OP_SBBQ; break;
      case 4:  thunkOp = AMD64G_CC_OP_SBBL; break;
      case 2:  thunkOp = AMD64G_CC_OP_SBBW; break;
      case 1:  thunkOp = AMD64G_CC_OP_SBBB; break;
      default: vassert(0);
   }

   /* oldc = old carry flag, 0 or 1 */
   assign( oldc, binop(Iop_And64,
                       mk_amd64g_calculate_rflags_c(),
                       mkU64(1)) );

   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );

   /* tres = ta1 - ta2 - old carry (borrow) */
   assign( tres, binop(minus,
                       binop(minus,mkexpr(ta1),mkexpr(ta2)),
                       mkexpr(oldcn)) );

   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
      start of this function. */
   if (taddr != IRTemp_INVALID) {
      if (texpVal == IRTemp_INVALID) {
         /* Plain store; no retry semantics apply. */
         vassert(restart_point == 0);
         storeLE( mkexpr(taddr), mkexpr(tres) );
      } else {
         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
         /* .. and hence 'texpVal' has the same type as 'tres'. */
         casLE( mkexpr(taddr),
                mkexpr(texpVal), mkexpr(tres), restart_point );
      }
   }

   /* Build the flags thunk: DEP1 = first arg, DEP2 = second arg
      XORed with the old carry, NDEP = the old carry itself. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(thunkOp) ) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1) )) );
   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
                                                         mkexpr(oldcn)) )) );
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
}
   2112 
   2113 
   2114 /* -------------- Helpers for disassembly printing. -------------- */
   2115 
   2116 static const HChar* nameGrp1 ( Int opc_aux )
   2117 {
   2118    static const HChar* grp1_names[8]
   2119      = { "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" };
   2120    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp1(amd64)");
   2121    return grp1_names[opc_aux];
   2122 }
   2123 
   2124 static const HChar* nameGrp2 ( Int opc_aux )
   2125 {
   2126    static const HChar* grp2_names[8]
   2127      = { "rol", "ror", "rcl", "rcr", "shl", "shr", "shl", "sar" };
   2128    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp2(amd64)");
   2129    return grp2_names[opc_aux];
   2130 }
   2131 
   2132 static const HChar* nameGrp4 ( Int opc_aux )
   2133 {
   2134    static const HChar* grp4_names[8]
   2135      = { "inc", "dec", "???", "???", "???", "???", "???", "???" };
   2136    if (opc_aux < 0 || opc_aux > 1) vpanic("nameGrp4(amd64)");
   2137    return grp4_names[opc_aux];
   2138 }
   2139 
   2140 static const HChar* nameGrp5 ( Int opc_aux )
   2141 {
   2142    static const HChar* grp5_names[8]
   2143      = { "inc", "dec", "call*", "call*", "jmp*", "jmp*", "push", "???" };
   2144    if (opc_aux < 0 || opc_aux > 6) vpanic("nameGrp5(amd64)");
   2145    return grp5_names[opc_aux];
   2146 }
   2147 
   2148 static const HChar* nameGrp8 ( Int opc_aux )
   2149 {
   2150    static const HChar* grp8_names[8]
   2151       = { "???", "???", "???", "???", "bt", "bts", "btr", "btc" };
   2152    if (opc_aux < 4 || opc_aux > 7) vpanic("nameGrp8(amd64)");
   2153    return grp8_names[opc_aux];
   2154 }
   2155 
static const HChar* nameSReg ( UInt sreg )
{
   /* Printable name of a segment register. */
   switch (sreg) {
      case R_ES: return "%es";
      case R_CS: return "%cs";
      case R_SS: return "%ss";
      case R_DS: return "%ds";
      case R_FS: return "%fs";
      case R_GS: return "%gs";
      default: vpanic("nameSReg(amd64)");
   }
}
   2168 
   2169 static const HChar* nameMMXReg ( Int mmxreg )
   2170 {
   2171    static const HChar* mmx_names[8]
   2172      = { "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" };
   2173    if (mmxreg < 0 || mmxreg > 7) vpanic("nameMMXReg(amd64,guest)");
   2174    return mmx_names[mmxreg];
   2175 }
   2176 
   2177 static const HChar* nameXMMReg ( Int xmmreg )
   2178 {
   2179    static const HChar* xmm_names[16]
   2180      = { "%xmm0",  "%xmm1",  "%xmm2",  "%xmm3",
   2181          "%xmm4",  "%xmm5",  "%xmm6",  "%xmm7",
   2182          "%xmm8",  "%xmm9",  "%xmm10", "%xmm11",
   2183          "%xmm12", "%xmm13", "%xmm14", "%xmm15" };
   2184    if (xmmreg < 0 || xmmreg > 15) vpanic("nameXMMReg(amd64)");
   2185    return xmm_names[xmmreg];
   2186 }
   2187 
   2188 static const HChar* nameMMXGran ( Int gran )
   2189 {
   2190    switch (gran) {
   2191       case 0: return "b";
   2192       case 1: return "w";
   2193       case 2: return "d";
   2194       case 3: return "q";
   2195       default: vpanic("nameMMXGran(amd64,guest)");
   2196    }
   2197 }
   2198 
   2199 static HChar nameISize ( Int size )
   2200 {
   2201    switch (size) {
   2202       case 8: return 'q';
   2203       case 4: return 'l';
   2204       case 2: return 'w';
   2205       case 1: return 'b';
   2206       default: vpanic("nameISize(amd64)");
   2207    }
   2208 }
   2209 
   2210 static const HChar* nameYMMReg ( Int ymmreg )
   2211 {
   2212    static const HChar* ymm_names[16]
   2213      = { "%ymm0",  "%ymm1",  "%ymm2",  "%ymm3",
   2214          "%ymm4",  "%ymm5",  "%ymm6",  "%ymm7",
   2215          "%ymm8",  "%ymm9",  "%ymm10", "%ymm11",
   2216          "%ymm12", "%ymm13", "%ymm14", "%ymm15" };
   2217    if (ymmreg < 0 || ymmreg > 15) vpanic("nameYMMReg(amd64)");
   2218    return ymm_names[ymmreg];
   2219 }
   2220 
   2221 
   2222 /*------------------------------------------------------------*/
   2223 /*--- JMP helpers                                          ---*/
   2224 /*------------------------------------------------------------*/
   2225 
   2226 static void jmp_lit( /*MOD*/DisResult* dres,
   2227                      IRJumpKind kind, Addr64 d64 )
   2228 {
   2229    vassert(dres->whatNext    == Dis_Continue);
   2230    vassert(dres->len         == 0);
   2231    vassert(dres->continueAt  == 0);
   2232    vassert(dres->jk_StopHere == Ijk_INVALID);
   2233    dres->whatNext    = Dis_StopHere;
   2234    dres->jk_StopHere = kind;
   2235    stmt( IRStmt_Put( OFFB_RIP, mkU64(d64) ) );
   2236 }
   2237 
   2238 static void jmp_treg( /*MOD*/DisResult* dres,
   2239                       IRJumpKind kind, IRTemp t )
   2240 {
   2241    vassert(dres->whatNext    == Dis_Continue);
   2242    vassert(dres->len         == 0);
   2243    vassert(dres->continueAt  == 0);
   2244    vassert(dres->jk_StopHere == Ijk_INVALID);
   2245    dres->whatNext    = Dis_StopHere;
   2246    dres->jk_StopHere = kind;
   2247    stmt( IRStmt_Put( OFFB_RIP, mkexpr(t) ) );
   2248 }
   2249 
static
void jcc_01 ( /*MOD*/DisResult* dres,
              AMD64Condcode cond, Addr64 d64_false, Addr64 d64_true )
{
   /* Terminate the block with a two-way conditional branch: d64_true
      if 'cond' holds, else d64_false.  The condition is first
      canonicalised so the generated side exit always tests a
      positive condition. */
   Bool          invert;
   AMD64Condcode condPos;
   vassert(dres->whatNext    == Dis_Continue);
   vassert(dres->len         == 0);
   vassert(dres->continueAt  == 0);
   vassert(dres->jk_StopHere == Ijk_INVALID);
   dres->whatNext    = Dis_StopHere;
   dres->jk_StopHere = Ijk_Boring;
   condPos = positiveIse_AMD64Condcode ( cond, &invert );
   if (invert) {
      /* Inverted: side-exit to d64_false when the positive condition
         holds, otherwise fall through to d64_true. */
      stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
                         Ijk_Boring,
                         IRConst_U64(d64_false),
                         OFFB_RIP ) );
      stmt( IRStmt_Put( OFFB_RIP, mkU64(d64_true) ) );
   } else {
      /* Side-exit to d64_true when the condition holds, otherwise
         fall through to d64_false. */
      stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
                         Ijk_Boring,
                         IRConst_U64(d64_true),
                         OFFB_RIP ) );
      stmt( IRStmt_Put( OFFB_RIP, mkU64(d64_false) ) );
   }
}
   2277 
   2278 /* Let new_rsp be the %rsp value after a call/return.  Let nia be the
   2279    guest address of the next instruction to be executed.
   2280 
   2281    This function generates an AbiHint to say that -128(%rsp)
   2282    .. -1(%rsp) should now be regarded as uninitialised.
   2283 */
static
void make_redzone_AbiHint ( const VexAbiInfo* vbi,
                            IRTemp new_rsp, IRTemp nia, const HChar* who )
{
   Int szB = vbi->guest_stack_redzone_size;
   vassert(szB >= 0);

   /* A bit of a kludge.  Currently the only AbI we've guested AMD64
      for is ELF.  So just check it's the expected 128 value
      (paranoia). */
   vassert(szB == 128);

   if (0) vex_printf("AbiHint: %s\n", who);   /* debug tracing, normally off */
   vassert(typeOfIRTemp(irsb->tyenv, new_rsp) == Ity_I64);
   vassert(typeOfIRTemp(irsb->tyenv, nia) == Ity_I64);
   /* Hint that [new_rsp - szB, new_rsp) -- the red zone -- should now
      be regarded as undefined. */
   if (szB > 0)
      stmt( IRStmt_AbiHint(
               binop(Iop_Sub64, mkexpr(new_rsp), mkU64(szB)),
               szB,
               mkexpr(nia)
            ));
}
   2306 
   2307 
   2308 /*------------------------------------------------------------*/
   2309 /*--- Disassembling addressing modes                       ---*/
   2310 /*------------------------------------------------------------*/
   2311 
   2312 static
   2313 const HChar* segRegTxt ( Prefix pfx )
   2314 {
   2315    if (pfx & PFX_CS) return "%cs:";
   2316    if (pfx & PFX_DS) return "%ds:";
   2317    if (pfx & PFX_ES) return "%es:";
   2318    if (pfx & PFX_FS) return "%fs:";
   2319    if (pfx & PFX_GS) return "%gs:";
   2320    if (pfx & PFX_SS) return "%ss:";
   2321    return ""; /* no override */
   2322 }
   2323 
   2324 
   2325 /* 'virtual' is an IRExpr* holding a virtual address.  Convert it to a
   2326    linear address by adding any required segment override as indicated
   2327    by sorb, and also dealing with any address size override
   2328    present. */
   2329 static
   2330 IRExpr* handleAddrOverrides ( const VexAbiInfo* vbi,
   2331                               Prefix pfx, IRExpr* virtual )
   2332 {
   2333    /* Note that the below are hacks that relies on the assumption
   2334       that %fs or %gs are constant.
   2335       Typically, %fs is always 0x63 on linux (in the main thread, it
   2336       stays at value 0), %gs always 0x60 on Darwin, ... */
   2337    /* --- segment overrides --- */
   2338    if (pfx & PFX_FS) {
   2339       if (vbi->guest_amd64_assume_fs_is_const) {
   2340          /* return virtual + guest_FS_CONST. */
   2341          virtual = binop(Iop_Add64, virtual,
   2342                                     IRExpr_Get(OFFB_FS_CONST, Ity_I64));
   2343       } else {
   2344          unimplemented("amd64 %fs segment override");
   2345       }
   2346    }
   2347 
   2348    if (pfx & PFX_GS) {
   2349       if (vbi->guest_amd64_assume_gs_is_const) {
   2350          /* return virtual + guest_GS_CONST. */
   2351          virtual = binop(Iop_Add64, virtual,
   2352                                     IRExpr_Get(OFFB_GS_CONST, Ity_I64));
   2353       } else {
   2354          unimplemented("amd64 %gs segment override");
   2355       }
   2356    }
   2357 
   2358    /* cs, ds, es and ss are simply ignored in 64-bit mode. */
   2359 
   2360    /* --- address size override --- */
   2361    if (haveASO(pfx))
   2362       virtual = unop(Iop_32Uto64, unop(Iop_64to32, virtual));
   2363 
   2364    return virtual;
   2365 }
   2366 
   2367 //.. {
   2368 //..    Int    sreg;
   2369 //..    IRType hWordTy;
   2370 //..    IRTemp ldt_ptr, gdt_ptr, seg_selector, r64;
   2371 //..
   2372 //..    if (sorb == 0)
   2373 //..       /* the common case - no override */
   2374 //..       return virtual;
   2375 //..
   2376 //..    switch (sorb) {
   2377 //..       case 0x3E: sreg = R_DS; break;
   2378 //..       case 0x26: sreg = R_ES; break;
   2379 //..       case 0x64: sreg = R_FS; break;
   2380 //..       case 0x65: sreg = R_GS; break;
   2381 //..       default: vpanic("handleAddrOverrides(x86,guest)");
   2382 //..    }
   2383 //..
   2384 //..    hWordTy = sizeof(HWord)==4 ? Ity_I32 : Ity_I64;
   2385 //..
   2386 //..    seg_selector = newTemp(Ity_I32);
   2387 //..    ldt_ptr      = newTemp(hWordTy);
   2388 //..    gdt_ptr      = newTemp(hWordTy);
   2389 //..    r64          = newTemp(Ity_I64);
   2390 //..
   2391 //..    assign( seg_selector, unop(Iop_16Uto32, getSReg(sreg)) );
   2392 //..    assign( ldt_ptr, IRExpr_Get( OFFB_LDT, hWordTy ));
   2393 //..    assign( gdt_ptr, IRExpr_Get( OFFB_GDT, hWordTy ));
   2394 //..
   2395 //..    /*
   2396 //..    Call this to do the translation and limit checks:
   2397 //..    ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
   2398 //..                                  UInt seg_selector, UInt virtual_addr )
   2399 //..    */
   2400 //..    assign(
   2401 //..       r64,
   2402 //..       mkIRExprCCall(
   2403 //..          Ity_I64,
   2404 //..          0/*regparms*/,
   2405 //..          "x86g_use_seg_selector",
   2406 //..          &x86g_use_seg_selector,
   2407 //..          mkIRExprVec_4( mkexpr(ldt_ptr), mkexpr(gdt_ptr),
   2408 //..                         mkexpr(seg_selector), virtual)
   2409 //..       )
   2410 //..    );
   2411 //..
   2412 //..    /* If the high 32 of the result are non-zero, there was a
   2413 //..       failure in address translation.  In which case, make a
   2414 //..       quick exit.
   2415 //..    */
   2416 //..    stmt(
   2417 //..       IRStmt_Exit(
   2418 //..          binop(Iop_CmpNE32, unop(Iop_64HIto32, mkexpr(r64)), mkU32(0)),
   2419 //..          Ijk_MapFail,
   2420 //..          IRConst_U32( guest_eip_curr_instr )
   2421 //..       )
   2422 //..    );
   2423 //..
   2424 //..    /* otherwise, here's the translated result. */
   2425 //..    return unop(Iop_64to32, mkexpr(r64));
   2426 //.. }
   2427 
   2428 
   2429 /* Generate IR to calculate an address indicated by a ModRM and
   2430    following SIB bytes.  The expression, and the number of bytes in
   2431    the address mode, are returned (the latter in *len).  Note that
   2432    this fn should not be called if the R/M part of the address denotes
   2433    a register instead of memory.  If print_codegen is true, text of
   2434    the addressing mode is placed in buf.
   2435 
   2436    The computed address is stored in a new tempreg, and the
   2437    identity of the tempreg is returned.
   2438 
   2439    extra_bytes holds the number of bytes after the amode, as supplied
   2440    by the caller.  This is needed to make sense of %rip-relative
   2441    addresses.  Note that the value that *len is set to is only the
   2442    length of the amode itself and does not include the value supplied
   2443    in extra_bytes.
   2444  */
   2445 
   2446 static IRTemp disAMode_copy2tmp ( IRExpr* addr64 )
   2447 {
   2448    IRTemp tmp = newTemp(Ity_I64);
   2449    assign( tmp, addr64 );
   2450    return tmp;
   2451 }
   2452 
static
IRTemp disAMode ( /*OUT*/Int* len,
                  const VexAbiInfo* vbi, Prefix pfx, Long delta,
                  /*OUT*/HChar* buf, Int extra_bytes )
{
   /* First byte of the amode is the ModRM byte. */
   UChar mod_reg_rm = getUChar(delta);
   delta++;

   buf[0] = (UChar)0;
   vassert(extra_bytes >= 0 && extra_bytes < 10);

   /* squeeze out the reg field from mod_reg_rm, since a 256-entry
      jump table seems a bit excessive.
   */
   mod_reg_rm &= 0xC7;                         /* is now XX000YYY */
   mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
                                               /* is now XX0XXYYY */
   mod_reg_rm &= 0x1F;                         /* is now 000XXYYY */
   switch (mod_reg_rm) {

      /* REX.B==0: (%rax) .. (%rdi), not including (%rsp) or (%rbp).
         REX.B==1: (%r8)  .. (%r15), not including (%r12) or (%r13).
      */
      case 0x00: case 0x01: case 0x02: case 0x03:
      /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
         { UChar rm = toUChar(mod_reg_rm & 7);
           DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,rm));
           *len = 1;                 /* amode is the ModRM byte alone */
           return disAMode_copy2tmp(
                  handleAddrOverrides(vbi, pfx, getIRegRexB(8,pfx,rm)));
         }

      /* REX.B==0: d8(%rax) ... d8(%rdi), not including d8(%rsp)
         REX.B==1: d8(%r8)  ... d8(%r15), not including d8(%r12)
      */
      case 0x08: case 0x09: case 0x0A: case 0x0B:
      /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
         { UChar rm = toUChar(mod_reg_rm & 7);
           Long d   = getSDisp8(delta);
           if (d == 0) {
              DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,rm));
           } else {
              DIS(buf, "%s%lld(%s)", segRegTxt(pfx), d, nameIRegRexB(8,pfx,rm));
           }
           *len = 2;                 /* ModRM + disp8 */
           return disAMode_copy2tmp(
                  handleAddrOverrides(vbi, pfx,
                     binop(Iop_Add64,getIRegRexB(8,pfx,rm),mkU64(d))));
         }

      /* REX.B==0: d32(%rax) ... d32(%rdi), not including d32(%rsp)
         REX.B==1: d32(%r8)  ... d32(%r15), not including d32(%r12)
      */
      case 0x10: case 0x11: case 0x12: case 0x13:
      /* ! 14 */ case 0x15: case 0x16: case 0x17:
         { UChar rm = toUChar(mod_reg_rm & 7);
           Long  d  = getSDisp32(delta);
           DIS(buf, "%s%lld(%s)", segRegTxt(pfx), d, nameIRegRexB(8,pfx,rm));
           *len = 5;                 /* ModRM + disp32 */
           return disAMode_copy2tmp(
                  handleAddrOverrides(vbi, pfx,
                     binop(Iop_Add64,getIRegRexB(8,pfx,rm),mkU64(d))));
         }

      /* REX.B==0: a register, %rax .. %rdi.  This shouldn't happen. */
      /* REX.B==1: a register, %r8  .. %r15.  This shouldn't happen. */
      case 0x18: case 0x19: case 0x1A: case 0x1B:
      case 0x1C: case 0x1D: case 0x1E: case 0x1F:
         vpanic("disAMode(amd64): not an addr!");

      /* RIP + disp32.  This assumes that guest_RIP_curr_instr is set
         correctly at the start of handling each instruction. */
      case 0x05:
         { Long d = getSDisp32(delta);
           *len = 5;                 /* ModRM + disp32 */
           DIS(buf, "%s%lld(%%rip)", segRegTxt(pfx), d);
           /* We need to know the next instruction's start address.
              Try and figure out what it is, record the guess, and ask
              the top-level driver logic (bbToIR_AMD64) to check we
              guessed right, after the instruction is completely
              decoded. */
           guest_RIP_next_mustcheck = True;
           /* Guess: next insn starts after the 4 disp bytes plus
              whatever trailing bytes (e.g. immediates) the caller
              told us about via extra_bytes. */
           guest_RIP_next_assumed = guest_RIP_bbstart
                                    + delta+4 + extra_bytes;
           return disAMode_copy2tmp(
                     handleAddrOverrides(vbi, pfx,
                        binop(Iop_Add64, mkU64(guest_RIP_next_assumed),
                                         mkU64(d))));
         }

      case 0x04: {
         /* SIB, with no displacement.  Special cases:
            -- %rsp cannot act as an index value.
               If index_r indicates %rsp, zero is used for the index.
            -- when mod is zero and base indicates RBP or R13, base is
               instead a 32-bit sign-extended literal.
            It's all madness, I tell you.  Extract %index, %base and
            scale from the SIB byte.  The value denoted is then:
               | %index == %RSP && (%base == %RBP || %base == %R13)
               = d32 following SIB byte
               | %index == %RSP && !(%base == %RBP || %base == %R13)
               = %base
               | %index != %RSP && (%base == %RBP || %base == %R13)
               = d32 following SIB byte + (%index << scale)
               | %index != %RSP && !(%base == %RBP || %base == %R13)
               = %base + (%index << scale)
         */
         UChar sib     = getUChar(delta);
         UChar scale   = toUChar((sib >> 6) & 3);
         UChar index_r = toUChar((sib >> 3) & 7);
         UChar base_r  = toUChar(sib & 7);
         /* correct since #(R13) == 8 + #(RBP) */
         Bool  base_is_BPor13 = toBool(base_r == R_RBP);
         /* Only a REX.X==0 RSP index is "no index"; with REX.X==1 the
            encoding denotes %r12, which is a legal index. */
         Bool  index_is_SP    = toBool(index_r == R_RSP && 0==getRexX(pfx));
         delta++;

         if ((!index_is_SP) && (!base_is_BPor13)) {
            if (scale == 0) {
               DIS(buf, "%s(%s,%s)", segRegTxt(pfx),
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r));
            } else {
               DIS(buf, "%s(%s,%s,%d)", segRegTxt(pfx),
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r), 1<<scale);
            }
            *len = 2;                /* ModRM + SIB */
            return
               disAMode_copy2tmp(
               handleAddrOverrides(vbi, pfx,
                  binop(Iop_Add64,
                        getIRegRexB(8,pfx,base_r),
                        binop(Iop_Shl64, getIReg64rexX(pfx,index_r),
                              mkU8(scale)))));
         }

         if ((!index_is_SP) && base_is_BPor13) {
            Long d = getSDisp32(delta);
            DIS(buf, "%s%lld(,%s,%d)", segRegTxt(pfx), d,
                      nameIReg64rexX(pfx,index_r), 1<<scale);
            *len = 6;                /* ModRM + SIB + disp32 */
            return
               disAMode_copy2tmp(
               handleAddrOverrides(vbi, pfx,
                  binop(Iop_Add64,
                        binop(Iop_Shl64, getIReg64rexX(pfx,index_r),
                                         mkU8(scale)),
                        mkU64(d))));
         }

         if (index_is_SP && (!base_is_BPor13)) {
            DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,base_r));
            *len = 2;                /* ModRM + SIB */
            return disAMode_copy2tmp(
                   handleAddrOverrides(vbi, pfx, getIRegRexB(8,pfx,base_r)));
         }

         if (index_is_SP && base_is_BPor13) {
            Long d = getSDisp32(delta);
            DIS(buf, "%s%lld", segRegTxt(pfx), d);
            *len = 6;                /* ModRM + SIB + disp32 */
            return disAMode_copy2tmp(
                   handleAddrOverrides(vbi, pfx, mkU64(d)));
         }

         vassert(0);
      }

      /* SIB, with 8-bit displacement.  Special cases:
         -- %esp cannot act as an index value.
            If index_r indicates %esp, zero is used for the index.
         Denoted value is:
            | %index == %ESP
            = d8 + %base
            | %index != %ESP
            = d8 + %base + (%index << scale)
      */
      case 0x0C: {
         UChar sib     = getUChar(delta);
         UChar scale   = toUChar((sib >> 6) & 3);
         UChar index_r = toUChar((sib >> 3) & 7);
         UChar base_r  = toUChar(sib & 7);
         Long d        = getSDisp8(delta+1);

         if (index_r == R_RSP && 0==getRexX(pfx)) {
            /* index is "none": just base + d8 */
            DIS(buf, "%s%lld(%s)", segRegTxt(pfx),
                                   d, nameIRegRexB(8,pfx,base_r));
            *len = 3;                /* ModRM + SIB + disp8 */
            return disAMode_copy2tmp(
                   handleAddrOverrides(vbi, pfx,
                      binop(Iop_Add64, getIRegRexB(8,pfx,base_r), mkU64(d)) ));
         } else {
            if (scale == 0) {
               DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r));
            } else {
               DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r), 1<<scale);
            }
            *len = 3;                /* ModRM + SIB + disp8 */
            return
                disAMode_copy2tmp(
                handleAddrOverrides(vbi, pfx,
                  binop(Iop_Add64,
                        binop(Iop_Add64,
                              getIRegRexB(8,pfx,base_r),
                              binop(Iop_Shl64,
                                    getIReg64rexX(pfx,index_r), mkU8(scale))),
                        mkU64(d))));
         }
         vassert(0); /*NOTREACHED*/
      }

      /* SIB, with 32-bit displacement.  Special cases:
         -- %rsp cannot act as an index value.
            If index_r indicates %rsp, zero is used for the index.
         Denoted value is:
            | %index == %RSP
            = d32 + %base
            | %index != %RSP
            = d32 + %base + (%index << scale)
      */
      case 0x14: {
         UChar sib     = getUChar(delta);
         UChar scale   = toUChar((sib >> 6) & 3);
         UChar index_r = toUChar((sib >> 3) & 7);
         UChar base_r  = toUChar(sib & 7);
         Long d        = getSDisp32(delta+1);

         if (index_r == R_RSP && 0==getRexX(pfx)) {
            /* index is "none": just base + d32 */
            DIS(buf, "%s%lld(%s)", segRegTxt(pfx),
                                   d, nameIRegRexB(8,pfx,base_r));
            *len = 6;                /* ModRM + SIB + disp32 */
            return disAMode_copy2tmp(
                   handleAddrOverrides(vbi, pfx,
                      binop(Iop_Add64, getIRegRexB(8,pfx,base_r), mkU64(d)) ));
         } else {
            if (scale == 0) {
               DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r));
            } else {
               DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r), 1<<scale);
            }
            *len = 6;                /* ModRM + SIB + disp32 */
            return
                disAMode_copy2tmp(
                handleAddrOverrides(vbi, pfx,
                  binop(Iop_Add64,
                        binop(Iop_Add64,
                              getIRegRexB(8,pfx,base_r),
                              binop(Iop_Shl64,
                                    getIReg64rexX(pfx,index_r), mkU8(scale))),
                        mkU64(d))));
         }
         vassert(0); /*NOTREACHED*/
      }

      default:
         vpanic("disAMode(amd64)");
         return 0; /*notreached*/
   }
}
   2720 
   2721 
   2722 /* Similarly for VSIB addressing.  This returns just the addend,
   2723    and fills in *rI and *vscale with the register number of the vector
   2724    index and its multiplicand.  */
static
IRTemp disAVSIBMode ( /*OUT*/Int* len,
                      const VexAbiInfo* vbi, Prefix pfx, Long delta,
                      /*OUT*/HChar* buf, /*OUT*/UInt* rI,
                      IRType ty, /*OUT*/Int* vscale )
{
   UChar mod_reg_rm = getUChar(delta);
   const HChar *vindex;

   *len = 0;
   *rI = 0;
   *vscale = 0;
   buf[0] = (UChar)0;
   /* VSIB requires a SIB byte (r/m == 4) and a memory operand;
      anything else is not a valid VSIB amode. */
   if ((mod_reg_rm & 7) != 4 || epartIsReg(mod_reg_rm))
      return IRTemp_INVALID;

   UChar sib     = getUChar(delta+1);
   UChar scale   = toUChar((sib >> 6) & 3);
   UChar index_r = toUChar((sib >> 3) & 7);
   UChar base_r  = toUChar(sib & 7);
   Long  d       = 0;
   /* correct since #(R13) == 8 + #(RBP) */
   Bool  base_is_BPor13 = toBool(base_r == R_RBP);
   delta += 2;
   *len = 2;    /* ModRM + SIB; displacement bytes are added below */

   /* Full 4-bit vector-index register number, including REX.X. */
   *rI = index_r | (getRexX(pfx) << 3);
   if (ty == Ity_V128)
      vindex = nameXMMReg(*rI);
   else
      vindex = nameYMMReg(*rI);
   *vscale = 1<<scale;

   switch (mod_reg_rm >> 6) {
   case 0:
      if (base_is_BPor13) {
         /* mod==0 with base RBP/R13: base is a disp32 literal. */
         d = getSDisp32(delta);
         *len += 4;
         if (scale == 0) {
            DIS(buf, "%s%lld(,%s)", segRegTxt(pfx), d, vindex);
         } else {
            DIS(buf, "%s%lld(,%s,%d)", segRegTxt(pfx), d, vindex, 1<<scale);
         }
         return disAMode_copy2tmp( mkU64(d) );
      } else {
         if (scale == 0) {
            DIS(buf, "%s(%s,%s)", segRegTxt(pfx),
                     nameIRegRexB(8,pfx,base_r), vindex);
         } else {
            DIS(buf, "%s(%s,%s,%d)", segRegTxt(pfx),
                     nameIRegRexB(8,pfx,base_r), vindex, 1<<scale);
         }
      }
      break;
   case 1:
      d = getSDisp8(delta);
      *len += 1;
      goto have_disp;
   case 2:
      d = getSDisp32(delta);
      *len += 4;
      /* fall through into the shared disassembly-text code */
   have_disp:
      if (scale == 0) {
         DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
                  nameIRegRexB(8,pfx,base_r), vindex);
      } else {
         DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
                  nameIRegRexB(8,pfx,base_r), vindex, 1<<scale);
      }
      break;
   }

   /* Return just the scalar addend (base, or base+disp); the vector
      index and scale have been reported via *rI and *vscale. */
   if (!d)
      return disAMode_copy2tmp( getIRegRexB(8,pfx,base_r) );
   return disAMode_copy2tmp( binop(Iop_Add64, getIRegRexB(8,pfx,base_r),
                                   mkU64(d)) );
}
   2802 
   2803 
   2804 /* Figure out the number of (insn-stream) bytes constituting the amode
   2805    beginning at delta.  Is useful for getting hold of literals beyond
   2806    the end of the amode before it has been disassembled.  */
   2807 
   2808 static UInt lengthAMode ( Prefix pfx, Long delta )
   2809 {
   2810    UChar mod_reg_rm = getUChar(delta);
   2811    delta++;
   2812 
   2813    /* squeeze out the reg field from mod_reg_rm, since a 256-entry
   2814       jump table seems a bit excessive.
   2815    */
   2816    mod_reg_rm &= 0xC7;                         /* is now XX000YYY */
   2817    mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
   2818                                                /* is now XX0XXYYY */
   2819    mod_reg_rm &= 0x1F;                         /* is now 000XXYYY */
   2820    switch (mod_reg_rm) {
   2821 
   2822       /* REX.B==0: (%rax) .. (%rdi), not including (%rsp) or (%rbp).
   2823          REX.B==1: (%r8)  .. (%r15), not including (%r12) or (%r13).
   2824       */
   2825       case 0x00: case 0x01: case 0x02: case 0x03:
   2826       /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
   2827          return 1;
   2828 
   2829       /* REX.B==0: d8(%rax) ... d8(%rdi), not including d8(%rsp)
   2830          REX.B==1: d8(%r8)  ... d8(%r15), not including d8(%r12)
   2831       */
   2832       case 0x08: case 0x09: case 0x0A: case 0x0B:
   2833       /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
   2834          return 2;
   2835 
   2836       /* REX.B==0: d32(%rax) ... d32(%rdi), not including d32(%rsp)
   2837          REX.B==1: d32(%r8)  ... d32(%r15), not including d32(%r12)
   2838       */
   2839       case 0x10: case 0x11: case 0x12: case 0x13:
   2840       /* ! 14 */ case 0x15: case 0x16: case 0x17:
   2841          return 5;
   2842 
   2843       /* REX.B==0: a register, %rax .. %rdi.  This shouldn't happen. */
   2844       /* REX.B==1: a register, %r8  .. %r16.  This shouldn't happen. */
   2845       /* Not an address, but still handled. */
   2846       case 0x18: case 0x19: case 0x1A: case 0x1B:
   2847       case 0x1C: case 0x1D: case 0x1E: case 0x1F:
   2848          return 1;
   2849 
   2850       /* RIP + disp32. */
   2851       case 0x05:
   2852          return 5;
   2853 
   2854       case 0x04: {
   2855          /* SIB, with no displacement. */
   2856          UChar sib     = getUChar(delta);
   2857          UChar base_r  = toUChar(sib & 7);
   2858          /* correct since #(R13) == 8 + #(RBP) */
   2859          Bool  base_is_BPor13 = toBool(base_r == R_RBP);
   2860 
   2861          if (base_is_BPor13) {
   2862             return 6;
   2863          } else {
   2864             return 2;
   2865          }
   2866       }
   2867 
   2868       /* SIB, with 8-bit displacement. */
   2869       case 0x0C:
   2870          return 3;
   2871 
   2872       /* SIB, with 32-bit displacement. */
   2873       case 0x14:
   2874          return 6;
   2875 
   2876       default:
   2877          vpanic("lengthAMode(amd64)");
   2878          return 0; /*notreached*/
   2879    }
   2880 }
   2881 
   2882 
   2883 /*------------------------------------------------------------*/
   2884 /*--- Disassembling common idioms                          ---*/
   2885 /*------------------------------------------------------------*/
   2886 
   2887 /* Handle binary integer instructions of the form
   2888       op E, G  meaning
   2889       op reg-or-mem, reg
   Is passed a ptr to the modRM byte, the actual operation, and the
   2891    data size.  Returns the address advanced completely over this
   2892    instruction.
   2893 
   2894    E(src) is reg-or-mem
   2895    G(dst) is reg.
   2896 
   2897    If E is reg, -->    GET %G,  tmp
   2898                        OP %E,   tmp
   2899                        PUT tmp, %G
   2900 
   2901    If E is mem and OP is not reversible,
   2902                 -->    (getAddr E) -> tmpa
   2903                        LD (tmpa), tmpa
   2904                        GET %G, tmp2
   2905                        OP tmpa, tmp2
   2906                        PUT tmp2, %G
   2907 
   2908    If E is mem and OP is reversible
   2909                 -->    (getAddr E) -> tmpa
   2910                        LD (tmpa), tmpa
   2911                        OP %G, tmpa
   2912                        PUT tmpa, %G
   2913 */
static
ULong dis_op2_E_G ( const VexAbiInfo* vbi,
                    Prefix      pfx,
                    Bool        addSubCarry,
                    IROp        op8,
                    Bool        keep,
                    Int         size,
                    Long        delta0,
                    const HChar* t_amd64opc )
{
   HChar   dis_buf[50];
   Int     len;
   IRType  ty   = szToITy(size);
   IRTemp  dst1 = newTemp(ty);      /* result of the operation */
   IRTemp  src  = newTemp(ty);      /* the E operand */
   IRTemp  dst0 = newTemp(ty);      /* original value of G */
   UChar   rm   = getUChar(delta0);
   IRTemp  addr = IRTemp_INVALID;

   /* addSubCarry == True indicates the intended operation is
      add-with-carry or subtract-with-borrow. */
   if (addSubCarry) {
      vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
      vassert(keep);
   }

   if (epartIsReg(rm)) {
      /* Specially handle XOR reg,reg, because that doesn't really
         depend on reg, and doing the obvious thing potentially
         generates a spurious value check failure due to the bogus
         dependency. */
      if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
          && offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
         if (False && op8 == Iop_Sub8)
            vex_printf("vex amd64->IR: sbb %%r,%%r optimisation(1)\n");
         /* Zero the register first; the getIReg reads below then pick
            up this dependency-free zero. */
         putIRegG(size,pfx,rm, mkU(ty,0));
      }

      assign( dst0, getIRegG(size,pfx,rm) );
      assign( src,  getIRegE(size,pfx,rm) );

      if (addSubCarry && op8 == Iop_Add8) {
         /* ADC: helper computes result and flag thunk together. */
         helper_ADC( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         /* SBB: likewise. */
         helper_SBB( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else {
         assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
         /* keep==False: result is discarded; only the flags are
            updated. */
         if (keep)
            putIRegG(size, pfx, rm, mkexpr(dst1));
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          nameIRegE(size,pfx,rm),
                          nameIRegG(size,pfx,rm));
      return 1+delta0;
   } else {
      /* E refers to memory */
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign( dst0, getIRegG(size,pfx,rm) );
      assign( src,  loadLE(szToITy(size), mkexpr(addr)) );

      if (addSubCarry && op8 == Iop_Add8) {
         helper_ADC( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         helper_SBB( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else {
         assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
         if (keep)
            putIRegG(size, pfx, rm, mkexpr(dst1));
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          dis_buf, nameIRegG(size, pfx, rm));
      return len+delta0;
   }
}
   3008 
   3009 
   3010 
   3011 /* Handle binary integer instructions of the form
   3012       op G, E  meaning
   3013       op reg, reg-or-mem
   Is passed a ptr to the modRM byte, the actual operation, and the
   3015    data size.  Returns the address advanced completely over this
   3016    instruction.
   3017 
   3018    G(src) is reg.
   3019    E(dst) is reg-or-mem
   3020 
   3021    If E is reg, -->    GET %E,  tmp
   3022                        OP %G,   tmp
   3023                        PUT tmp, %E
   3024 
   3025    If E is mem, -->    (getAddr E) -> tmpa
   3026                        LD (tmpa), tmpv
   3027                        OP %G, tmpv
   3028                        ST tmpv, (tmpa)
   3029 */
static
ULong dis_op2_G_E ( const VexAbiInfo* vbi,
                    Prefix      pfx,
                    Bool        addSubCarry,
                    IROp        op8,
                    Bool        keep,
                    Int         size,
                    Long        delta0,
                    const HChar* t_amd64opc )
{
   HChar   dis_buf[50];
   Int     len;
   IRType  ty   = szToITy(size);
   IRTemp  dst1 = newTemp(ty);      /* result of the operation */
   IRTemp  src  = newTemp(ty);      /* the G operand */
   IRTemp  dst0 = newTemp(ty);      /* original value of E */
   UChar   rm   = getUChar(delta0);
   IRTemp  addr = IRTemp_INVALID;

   /* addSubCarry == True indicates the intended operation is
      add-with-carry or subtract-with-borrow. */
   if (addSubCarry) {
      vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
      vassert(keep);
   }

   if (epartIsReg(rm)) {
      /* Specially handle XOR reg,reg, because that doesn't really
         depend on reg, and doing the obvious thing potentially
         generates a spurious value check failure due to the bogus
         dependency.  Ditto SBB reg,reg. */
      if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
          && offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
         /* Zero the register first; the getIReg reads below then pick
            up this dependency-free zero. */
         putIRegE(size,pfx,rm, mkU(ty,0));
      }

      assign(dst0, getIRegE(size,pfx,rm));
      assign(src,  getIRegG(size,pfx,rm));

      if (addSubCarry && op8 == Iop_Add8) {
         /* ADC: helper computes result and flag thunk together. */
         helper_ADC( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegE(size, pfx, rm, mkexpr(dst1));
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         /* SBB: likewise. */
         helper_SBB( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegE(size, pfx, rm, mkexpr(dst1));
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
         /* keep==False: result is discarded; only the flags are
            updated. */
         if (keep)
            putIRegE(size, pfx, rm, mkexpr(dst1));
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          nameIRegG(size,pfx,rm),
                          nameIRegE(size,pfx,rm));
      return 1+delta0;
   }

   /* E refers to memory */
   {
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign(dst0, loadLE(ty,mkexpr(addr)));
      assign(src,  getIRegG(size,pfx,rm));

      if (addSubCarry && op8 == Iop_Add8) {
         if (haveLOCK(pfx)) {
            /* cas-style store */
            helper_ADC( size, dst1, dst0, src,
                        /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_ADC( size, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         if (haveLOCK(pfx)) {
            /* cas-style store */
            helper_SBB( size, dst1, dst0, src,
                        /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_SBB( size, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (keep) {
            if (haveLOCK(pfx)) {
               /* LOCK prefix: store via compare-and-swap so the whole
                  read-modify-write appears atomic. */
               if (0) vex_printf("locked case\n" );
               casLE( mkexpr(addr),
                      mkexpr(dst0)/*expval*/,
                      mkexpr(dst1)/*newval*/, guest_RIP_curr_instr );
            } else {
               if (0) vex_printf("nonlocked case\n");
               storeLE(mkexpr(addr), mkexpr(dst1));
            }
         }
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          nameIRegG(size,pfx,rm), dis_buf);
      return len+delta0;
   }
}
   3145 
   3146 
   3147 /* Handle move instructions of the form
   3148       mov E, G  meaning
   3149       mov reg-or-mem, reg
   Is passed a ptr to the modRM byte, and the data size.  Returns
   3151    the address advanced completely over this instruction.
   3152 
   3153    E(src) is reg-or-mem
   3154    G(dst) is reg.
   3155 
   3156    If E is reg, -->    GET %E,  tmpv
   3157                        PUT tmpv, %G
   3158 
   3159    If E is mem  -->    (getAddr E) -> tmpa
   3160                        LD (tmpa), tmpb
   3161                        PUT tmpb, %G
   3162 */
   3163 static
   3164 ULong dis_mov_E_G ( const VexAbiInfo* vbi,
   3165                     Prefix      pfx,
   3166                     Int         size,
   3167                     Long        delta0 )
   3168 {
   3169    Int len;
   3170    UChar rm = getUChar(delta0);
   3171    HChar dis_buf[50];
   3172 
   3173    if (epartIsReg(rm)) {
   3174       putIRegG(size, pfx, rm, getIRegE(size, pfx, rm));
   3175       DIP("mov%c %s,%s\n", nameISize(size),
   3176                            nameIRegE(size,pfx,rm),
   3177                            nameIRegG(size,pfx,rm));
   3178       return 1+delta0;
   3179    }
   3180 
   3181    /* E refers to memory */
   3182    {
   3183       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   3184       putIRegG(size, pfx, rm, loadLE(szToITy(size), mkexpr(addr)));
   3185       DIP("mov%c %s,%s\n", nameISize(size),
   3186                            dis_buf,
   3187                            nameIRegG(size,pfx,rm));
   3188       return delta0+len;
   3189    }
   3190 }
   3191 
   3192 
   3193 /* Handle move instructions of the form
   3194       mov G, E  meaning
   3195       mov reg, reg-or-mem
   3196    Is passed the a ptr to the modRM byte, and the data size.  Returns
   3197    the address advanced completely over this instruction.
   3198    We have to decide here whether F2 or F3 are acceptable.  F2 never is.
   3199 
   3200    G(src) is reg.
   3201    E(dst) is reg-or-mem
   3202 
   3203    If E is reg, -->    GET %G,  tmp
   3204                        PUT tmp, %E
   3205 
   3206    If E is mem, -->    (getAddr E) -> tmpa
   3207                        GET %G, tmpv
   3208                        ST tmpv, (tmpa)
   3209 */
   3210 static
   3211 ULong dis_mov_G_E ( const VexAbiInfo*  vbi,
   3212                     Prefix       pfx,
   3213                     Int          size,
   3214                     Long         delta0,
   3215                     /*OUT*/Bool* ok )
   3216 {
   3217    Int   len;
   3218    UChar rm = getUChar(delta0);
   3219    HChar dis_buf[50];
   3220 
   3221    *ok = True;
   3222 
   3223    if (epartIsReg(rm)) {
   3224       if (haveF2orF3(pfx)) { *ok = False; return delta0; }
   3225       putIRegE(size, pfx, rm, getIRegG(size, pfx, rm));
   3226       DIP("mov%c %s,%s\n", nameISize(size),
   3227                            nameIRegG(size,pfx,rm),
   3228                            nameIRegE(size,pfx,rm));
   3229       return 1+delta0;
   3230    }
   3231 
   3232    /* E refers to memory */
   3233    {
   3234       if (haveF2(pfx)) { *ok = False; return delta0; }
   3235       /* F3(XRELEASE) is acceptable, though. */
   3236       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   3237       storeLE( mkexpr(addr), getIRegG(size, pfx, rm) );
   3238       DIP("mov%c %s,%s\n", nameISize(size),
   3239                            nameIRegG(size,pfx,rm),
   3240                            dis_buf);
   3241       return len+delta0;
   3242    }
   3243 }
   3244 
   3245 
   3246 /* op $immediate, AL/AX/EAX/RAX. */
   3247 static
   3248 ULong dis_op_imm_A ( Int    size,
   3249                      Bool   carrying,
   3250                      IROp   op8,
   3251                      Bool   keep,
   3252                      Long   delta,
   3253                      const HChar* t_amd64opc )
   3254 {
   3255    Int    size4 = imin(size,4);
   3256    IRType ty    = szToITy(size);
   3257    IRTemp dst0  = newTemp(ty);
   3258    IRTemp src   = newTemp(ty);
   3259    IRTemp dst1  = newTemp(ty);
   3260    Long  lit    = getSDisp(size4,delta);
   3261    assign(dst0, getIRegRAX(size));
   3262    assign(src,  mkU(ty,lit & mkSizeMask(size)));
   3263 
   3264    if (isAddSub(op8) && !carrying) {
   3265       assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   3266       setFlags_DEP1_DEP2(op8, dst0, src, ty);
   3267    }
   3268    else
   3269    if (isLogic(op8)) {
   3270       vassert(!carrying);
   3271       assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   3272       setFlags_DEP1(op8, dst1, ty);
   3273    }
   3274    else
   3275    if (op8 == Iop_Add8 && carrying) {
   3276       helper_ADC( size, dst1, dst0, src,
   3277                   /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   3278    }
   3279    else
   3280    if (op8 == Iop_Sub8 && carrying) {
   3281       helper_SBB( size, dst1, dst0, src,
   3282                   /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   3283    }
   3284    else
   3285       vpanic("dis_op_imm_A(amd64,guest)");
   3286 
   3287    if (keep)
   3288       putIRegRAX(size, mkexpr(dst1));
   3289 
   3290    DIP("%s%c $%lld, %s\n", t_amd64opc, nameISize(size),
   3291                            lit, nameIRegRAX(size));
   3292    return delta+size4;
   3293 }
   3294 
   3295 
   3296 /* Sign- and Zero-extending moves. */
   3297 static
   3298 ULong dis_movx_E_G ( const VexAbiInfo* vbi,
   3299                      Prefix pfx,
   3300                      Long delta, Int szs, Int szd, Bool sign_extend )
   3301 {
   3302    UChar rm = getUChar(delta);
   3303    if (epartIsReg(rm)) {
   3304       putIRegG(szd, pfx, rm,
   3305                     doScalarWidening(
   3306                        szs,szd,sign_extend,
   3307                        getIRegE(szs,pfx,rm)));
   3308       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
   3309                                nameISize(szs),
   3310                                nameISize(szd),
   3311                                nameIRegE(szs,pfx,rm),
   3312                                nameIRegG(szd,pfx,rm));
   3313       return 1+delta;
   3314    }
   3315 
   3316    /* E refers to memory */
   3317    {
   3318       Int    len;
   3319       HChar  dis_buf[50];
   3320       IRTemp addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
   3321       putIRegG(szd, pfx, rm,
   3322                     doScalarWidening(
   3323                        szs,szd,sign_extend,
   3324                        loadLE(szToITy(szs),mkexpr(addr))));
   3325       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
   3326                                nameISize(szs),
   3327                                nameISize(szd),
   3328                                dis_buf,
   3329                                nameIRegG(szd,pfx,rm));
   3330       return len+delta;
   3331    }
   3332 }
   3333 
   3334 
   3335 /* Generate code to divide ArchRegs RDX:RAX / EDX:EAX / DX:AX / AX by
   3336    the 64 / 32 / 16 / 8 bit quantity in the given IRTemp.  */
/* Generate code to divide ArchRegs RDX:RAX / EDX:EAX / DX:AX / AX by
   the 64 / 32 / 16 / 8 bit quantity in the given IRTemp.
   Following the x86 DIV/IDIV convention, the quotient lands in
   RAX/EAX/AX/AL and the remainder in RDX/EDX/DX/AH.  The DivMod IR
   ops produce quotient in the low half and remainder in the high
   half of their result (so 'to64' / '32to16' / '16to8' extract the
   quotient and the 'HI' variants extract the remainder). */
static
void codegen_div ( Int sz, IRTemp t, Bool signed_divide )
{
   /* special-case the 64-bit case */
   if (sz == 8) {
      IROp   op     = signed_divide ? Iop_DivModS128to64
                                    : Iop_DivModU128to64;
      IRTemp src128 = newTemp(Ity_I128);
      IRTemp dst128 = newTemp(Ity_I128);
      /* 128-bit dividend is RDX:RAX. */
      assign( src128, binop(Iop_64HLto128,
                            getIReg64(R_RDX),
                            getIReg64(R_RAX)) );
      assign( dst128, binop(op, mkexpr(src128), mkexpr(t)) );
      putIReg64( R_RAX, unop(Iop_128to64,mkexpr(dst128)) );
      putIReg64( R_RDX, unop(Iop_128HIto64,mkexpr(dst128)) );
   } else {
      /* 8/16/32-bit cases are all funnelled through the 64/32-bit
         DivMod op, widening operands as needed. */
      IROp   op    = signed_divide ? Iop_DivModS64to32
                                   : Iop_DivModU64to32;
      IRTemp src64 = newTemp(Ity_I64);
      IRTemp dst64 = newTemp(Ity_I64);
      switch (sz) {
      case 4:
         /* 64-bit dividend is EDX:EAX; results go back to EAX/EDX. */
         assign( src64,
                 binop(Iop_32HLto64, getIRegRDX(4), getIRegRAX(4)) );
         assign( dst64,
                 binop(op, mkexpr(src64), mkexpr(t)) );
         putIRegRAX( 4, unop(Iop_64to32,mkexpr(dst64)) );
         putIRegRDX( 4, unop(Iop_64HIto32,mkexpr(dst64)) );
         break;
      case 2: {
         /* Dividend DX:AX, widened (signed or unsigned, matching the
            division) to 64 bits; divisor widened 16->32. */
         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
         assign( src64, unop(widen3264,
                             binop(Iop_16HLto32,
                                   getIRegRDX(2),
                                   getIRegRAX(2))) );
         assign( dst64, binop(op, mkexpr(src64), unop(widen1632,mkexpr(t))) );
         putIRegRAX( 2, unop(Iop_32to16,unop(Iop_64to32,mkexpr(dst64))) );
         putIRegRDX( 2, unop(Iop_32to16,unop(Iop_64HIto32,mkexpr(dst64))) );
         break;
      }
      case 1: {
         /* 8-bit divide: the 16-bit dividend is the whole of AX
            (fetched via getIRegRAX(2)); quotient goes to AL and
            remainder to AH. */
         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
         IROp widen816  = signed_divide ? Iop_8Sto16  : Iop_8Uto16;
         assign( src64, unop(widen3264,
                        unop(widen1632, getIRegRAX(2))) );
         assign( dst64,
                 binop(op, mkexpr(src64),
                           unop(widen1632, unop(widen816, mkexpr(t)))) );
         putIRegRAX( 1, unop(Iop_16to8,
                        unop(Iop_32to16,
                        unop(Iop_64to32,mkexpr(dst64)))) );
         putIRegAH( unop(Iop_16to8,
                    unop(Iop_32to16,
                    unop(Iop_64HIto32,mkexpr(dst64)))) );
         break;
      }
      default:
         vpanic("codegen_div(amd64)");
      }
   }
}
   3400 
/* Group 1 extended opcodes: immediate-operand ADD/OR/ADC/SBB/AND/
   SUB/XOR/CMP, selected by the reg field of the modRM byte
   (0=ADD 1=OR 2=ADC 3=SBB 4=AND 5=SUB 6=XOR 7=CMP).  CMP (case 7)
   is SUB with the result discarded -- see the "< 7" guards below.
   d64 is the immediate value; am_sz and d_sz are the byte sizes of
   the address-mode and immediate fields; sz is the operand size.
   Returns delta advanced completely over the instruction. */
static
ULong dis_Grp1 ( const VexAbiInfo* vbi,
                 Prefix pfx,
                 Long delta, UChar modrm,
                 Int am_sz, Int d_sz, Int sz, Long d64 )
{
   Int     len;
   HChar   dis_buf[50];
   IRType  ty   = szToITy(sz);
   IRTemp  dst1 = newTemp(ty);
   IRTemp  src  = newTemp(ty);
   IRTemp  dst0 = newTemp(ty);
   IRTemp  addr = IRTemp_INVALID;
   IROp    op8  = Iop_INVALID;
   ULong   mask = mkSizeMask(sz);

   /* ADC (2) and SBB (3) leave op8 as Iop_INVALID; they are handled
      by dedicated helpers below rather than by a sized binop. */
   switch (gregLO3ofRM(modrm)) {
      case 0: op8 = Iop_Add8; break;  case 1: op8 = Iop_Or8;  break;
      case 2: break;  // ADC
      case 3: break;  // SBB
      case 4: op8 = Iop_And8; break;  case 5: op8 = Iop_Sub8; break;
      case 6: op8 = Iop_Xor8; break;  case 7: op8 = Iop_Sub8; break;
      /*NOTREACHED*/
      default: vpanic("dis_Grp1(amd64): unhandled case");
   }

   if (epartIsReg(modrm)) {
      vassert(am_sz == 1);

      /* Register destination.  The immediate is masked to the
         operand size. */
      assign(dst0, getIRegE(sz,pfx,modrm));
      assign(src,  mkU(ty,d64 & mask));

      if (gregLO3ofRM(modrm) == 2 /* ADC */) {
         helper_ADC( sz, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
      } else
      if (gregLO3ofRM(modrm) == 3 /* SBB */) {
         helper_SBB( sz, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      /* CMP (7) sets flags only; everything else writes the result. */
      if (gregLO3ofRM(modrm) < 7)
         putIRegE(sz, pfx, modrm, mkexpr(dst1));

      delta += (am_sz + d_sz);
      DIP("%s%c $%lld, %s\n",
          nameGrp1(gregLO3ofRM(modrm)), nameISize(sz), d64,
          nameIRegE(sz,pfx,modrm));
   } else {
      /* Memory destination. */
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /*xtra*/d_sz );

      assign(dst0, loadLE(ty,mkexpr(addr)));
      assign(src, mkU(ty,d64 & mask));

      if (gregLO3ofRM(modrm) == 2 /* ADC */) {
         if (haveLOCK(pfx)) {
            /* cas-style store: the helper re-checks that memory still
               holds dst0 before committing, restarting at
               guest_RIP_curr_instr otherwise. */
            helper_ADC( sz, dst1, dst0, src,
                       /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_ADC( sz, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else
      if (gregLO3ofRM(modrm) == 3 /* SBB */) {
         if (haveLOCK(pfx)) {
            /* cas-style store */
            helper_SBB( sz, dst1, dst0, src,
                       /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_SBB( sz, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         /* CMP (7) never writes back; LOCKed forms go through CAS. */
         if (gregLO3ofRM(modrm) < 7) {
            if (haveLOCK(pfx)) {
               casLE( mkexpr(addr), mkexpr(dst0)/*expVal*/,
                                    mkexpr(dst1)/*newVal*/,
                                    guest_RIP_curr_instr );
            } else {
               storeLE(mkexpr(addr), mkexpr(dst1));
            }
         }
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      delta += (len+d_sz);
      DIP("%s%c $%lld, %s\n",
          nameGrp1(gregLO3ofRM(modrm)), nameISize(sz),
          d64, dis_buf);
   }
   return delta;
}
   3506 
   3507 
   3508 /* Group 2 extended opcodes.  shift_expr must be an 8-bit typed
   3509    expression. */
   3510 
/* Group 2 extended opcodes: rotates and shifts selected by the reg
   field of the modRM byte (0=ROL 1=ROR 2=RCL 3=RCR 4=SHL 5=SHR
   6=SAL(==SHL) 7=SAR).  shift_expr must be an 8-bit typed expression
   giving the shift/rotate amount; shift_expr_txt, if non-NULL, is
   its printable form for tracing.  Returns delta advanced over the
   instruction; *decode_OK is always set True here (kept for
   interface symmetry with other dis_Grp* routines). */
static
ULong dis_Grp2 ( const VexAbiInfo* vbi,
                 Prefix pfx,
                 Long delta, UChar modrm,
                 Int am_sz, Int d_sz, Int sz, IRExpr* shift_expr,
                 const HChar* shift_expr_txt, Bool* decode_OK )
{
   /* delta on entry points at the modrm byte. */
   HChar  dis_buf[50];
   Int    len;
   Bool   isShift, isRotate, isRotateC;
   IRType ty    = szToITy(sz);
   IRTemp dst0  = newTemp(ty);
   IRTemp dst1  = newTemp(ty);
   IRTemp addr  = IRTemp_INVALID;

   *decode_OK = True;

   vassert(sz == 1 || sz == 2 || sz == 4 || sz == 8);

   /* Put value to shift/rotate in dst0. */
   if (epartIsReg(modrm)) {
      assign(dst0, getIRegE(sz, pfx, modrm));
      delta += (am_sz + d_sz);
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /*xtra*/d_sz );
      assign(dst0, loadLE(ty,mkexpr(addr)));
      delta += len + d_sz;
   }

   /* Classify the sub-opcode; the three categories are mutually
      exclusive. */
   isShift = False;
   switch (gregLO3ofRM(modrm)) { case 4: case 5: case 6: case 7: isShift = True; }

   isRotate = False;
   switch (gregLO3ofRM(modrm)) { case 0: case 1: isRotate = True; }

   isRotateC = False;
   switch (gregLO3ofRM(modrm)) { case 2: case 3: isRotateC = True; }

   if (!isShift && !isRotate && !isRotateC) {
      /*NOTREACHED*/
      vpanic("dis_Grp2(Reg): unhandled case(amd64)");
   }

   if (isRotateC) {
      /* Call a helper; this insn is so ridiculous it does not deserve
         better.  One problem is, the helper has to calculate both the
         new value and the new flags.  This is more than 64 bits, and
         there is no way to return more than 64 bits from the helper.
         Hence the crude and obvious solution is to call it twice,
         using the sign of the sz field to indicate whether it is the
         value or rflags result we want.
      */
      Bool     left = toBool(gregLO3ofRM(modrm) == 2);
      IRExpr** argsVALUE;
      IRExpr** argsRFLAGS;

      IRTemp new_value  = newTemp(Ity_I64);
      IRTemp new_rflags = newTemp(Ity_I64);
      IRTemp old_rflags = newTemp(Ity_I64);

      /* RCL/RCR rotate through the carry, so the old flags are an
         input to the helper. */
      assign( old_rflags, widenUto64(mk_amd64g_calculate_rflags_all()) );

      /* First call: positive sz selects the rotated value. */
      argsVALUE
         = mkIRExprVec_4( widenUto64(mkexpr(dst0)), /* thing to rotate */
                          widenUto64(shift_expr),   /* rotate amount */
                          mkexpr(old_rflags),
                          mkU64(sz) );
      assign( new_value,
                 mkIRExprCCall(
                    Ity_I64,
                    0/*regparm*/,
                    left ? "amd64g_calculate_RCL" : "amd64g_calculate_RCR",
                    left ? &amd64g_calculate_RCL  : &amd64g_calculate_RCR,
                    argsVALUE
                 )
            );

      /* Second call: negative sz selects the resulting rflags. */
      argsRFLAGS
         = mkIRExprVec_4( widenUto64(mkexpr(dst0)), /* thing to rotate */
                          widenUto64(shift_expr),   /* rotate amount */
                          mkexpr(old_rflags),
                          mkU64(-sz) );
      assign( new_rflags,
                 mkIRExprCCall(
                    Ity_I64,
                    0/*regparm*/,
                    left ? "amd64g_calculate_RCL" : "amd64g_calculate_RCR",
                    left ? &amd64g_calculate_RCL  : &amd64g_calculate_RCR,
                    argsRFLAGS
                 )
            );

      /* Flags are fully computed, so install them with OP_COPY. */
      assign( dst1, narrowTo(ty, mkexpr(new_value)) );
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(new_rflags) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   }

   else
   if (isShift) {

      IRTemp pre64     = newTemp(Ity_I64);
      IRTemp res64     = newTemp(Ity_I64);
      IRTemp res64ss   = newTemp(Ity_I64);
      IRTemp shift_amt = newTemp(Ity_I8);
      UChar  mask      = toUChar(sz==8 ? 63 : 31);
      IROp   op64;

      switch (gregLO3ofRM(modrm)) {
         case 4: op64 = Iop_Shl64; break;
         case 5: op64 = Iop_Shr64; break;
         case 6: op64 = Iop_Shl64; break;
         case 7: op64 = Iop_Sar64; break;
         /*NOTREACHED*/
         default: vpanic("dis_Grp2:shift"); break;
      }

      /* Widen the value to be shifted to 64 bits, do the shift, and
         narrow back down.  This seems surprisingly long-winded, but
         unfortunately the AMD semantics requires that 8/16/32-bit
         shifts give defined results for shift values all the way up
         to 32, and this seems the simplest way to do it.  It has the
         advantage that the only IR level shifts generated are of 64
         bit values, and the shift amount is guaranteed to be in the
         range 0 .. 63, thereby observing the IR semantics requiring
         all shift values to be in the range 0 .. 2^word_size-1.

         Therefore the shift amount is masked with 63 for 64-bit shifts
         and 31 for all others.
      */
      /* shift_amt = shift_expr & MASK, regardless of operation size */
      assign( shift_amt, binop(Iop_And8, shift_expr, mkU8(mask)) );

      /* suitably widen the value to be shifted to 64 bits.  SAR needs
         sign-extension so the shifted-in bits are copies of the
         original sign bit. */
      assign( pre64, op64==Iop_Sar64 ? widenSto64(mkexpr(dst0))
                                     : widenUto64(mkexpr(dst0)) );

      /* res64 = pre64 `shift` shift_amt */
      assign( res64, binop(op64, mkexpr(pre64), mkexpr(shift_amt)) );

      /* res64ss = pre64 `shift` ((shift_amt - 1) & MASK)
         -- the "shifted one less" value, needed by the flags thunk to
         recover the last bit shifted out (the new carry). */
      assign( res64ss,
              binop(op64,
                    mkexpr(pre64),
                    binop(Iop_And8,
                          binop(Iop_Sub8,
                                mkexpr(shift_amt), mkU8(1)),
                          mkU8(mask))) );

      /* Build the flags thunk. */
      setFlags_DEP1_DEP2_shift(op64, res64, res64ss, ty, shift_amt);

      /* Narrow the result back down. */
      assign( dst1, narrowTo(ty, mkexpr(res64)) );

   } /* if (isShift) */

   else
   if (isRotate) {
      /* ccOp indexes the size-variant of the ROL/ROR thunk op:
         0=byte, 1=word, 2=dword, 3=qword. */
      Int    ccOp      = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1
                                        : (ty==Ity_I32 ? 2 : 3));
      Bool   left      = toBool(gregLO3ofRM(modrm) == 0);
      IRTemp rot_amt   = newTemp(Ity_I8);
      IRTemp rot_amt64 = newTemp(Ity_I8);
      IRTemp oldFlags  = newTemp(Ity_I64);
      UChar  mask      = toUChar(sz==8 ? 63 : 31);

      /* rot_amt = shift_expr & mask */
      /* By masking the rotate amount thusly, the IR-level Shl/Shr
         expressions never shift beyond the word size and thus remain
         well defined. */
      assign(rot_amt64, binop(Iop_And8, shift_expr, mkU8(mask)));

      /* For sub-64-bit sizes, further reduce the amount modulo the
         operand width so the Sub8 below never goes negative. */
      if (ty == Ity_I64)
         assign(rot_amt, mkexpr(rot_amt64));
      else
         assign(rot_amt, binop(Iop_And8, mkexpr(rot_amt64), mkU8(8*sz-1)));

      if (left) {

         /* dst1 = (dst0 << rot_amt) | (dst0 >>u (wordsize-rot_amt)) */
         assign(dst1,
            binop( mkSizedOp(ty,Iop_Or8),
                   binop( mkSizedOp(ty,Iop_Shl8),
                          mkexpr(dst0),
                          mkexpr(rot_amt)
                   ),
                   binop( mkSizedOp(ty,Iop_Shr8),
                          mkexpr(dst0),
                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
                   )
            )
         );
         ccOp += AMD64G_CC_OP_ROLB;

      } else { /* right */

         /* dst1 = (dst0 >>u rot_amt) | (dst0 << (wordsize-rot_amt)) */
         assign(dst1,
            binop( mkSizedOp(ty,Iop_Or8),
                   binop( mkSizedOp(ty,Iop_Shr8),
                          mkexpr(dst0),
                          mkexpr(rot_amt)
                   ),
                   binop( mkSizedOp(ty,Iop_Shl8),
                          mkexpr(dst0),
                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
                   )
            )
         );
         ccOp += AMD64G_CC_OP_RORB;

      }

      /* dst1 now holds the rotated value.  Build flag thunk.  We
         need the resulting value for this, and the previous flags.
         Except don't set it if the rotate count is zero. */

      assign(oldFlags, mk_amd64g_calculate_rflags_all());

      /* rot_amt64 :: Ity_I8.  We need to convert it to I1. */
      IRTemp rot_amt64b = newTemp(Ity_I1);
      assign(rot_amt64b, binop(Iop_CmpNE8, mkexpr(rot_amt64), mkU8(0)) );

      /* CC_DEP1 is the rotated value.  CC_NDEP is flags before.
         Each field is updated conditionally (via ITE on the non-zero
         rotate count) so a zero-count rotate leaves flags intact. */
      stmt( IRStmt_Put( OFFB_CC_OP,
                        IRExpr_ITE( mkexpr(rot_amt64b),
                                    mkU64(ccOp),
                                    IRExpr_Get(OFFB_CC_OP,Ity_I64) ) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1,
                        IRExpr_ITE( mkexpr(rot_amt64b),
                                    widenUto64(mkexpr(dst1)),
                                    IRExpr_Get(OFFB_CC_DEP1,Ity_I64) ) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2,
                        IRExpr_ITE( mkexpr(rot_amt64b),
                                    mkU64(0),
                                    IRExpr_Get(OFFB_CC_DEP2,Ity_I64) ) ));
      stmt( IRStmt_Put( OFFB_CC_NDEP,
                        IRExpr_ITE( mkexpr(rot_amt64b),
                                    mkexpr(oldFlags),
                                    IRExpr_Get(OFFB_CC_NDEP,Ity_I64) ) ));
   } /* if (isRotate) */

   /* Save result, and finish up.  NOTE(review): the memory form does
      a plain storeLE with no LOCK/CAS handling here -- presumably
      LOCKed Grp2 forms are rejected before reaching this routine;
      confirm against the caller. */
   if (epartIsReg(modrm)) {
      putIRegE(sz, pfx, modrm, mkexpr(dst1));
      if (vex_traceflags & VEX_TRACE_FE) {
         vex_printf("%s%c ",
                    nameGrp2(gregLO3ofRM(modrm)), nameISize(sz) );
         if (shift_expr_txt)
            vex_printf("%s", shift_expr_txt);
         else
            ppIRExpr(shift_expr);
         vex_printf(", %s\n", nameIRegE(sz,pfx,modrm));
      }
   } else {
      storeLE(mkexpr(addr), mkexpr(dst1));
      if (vex_traceflags & VEX_TRACE_FE) {
         vex_printf("%s%c ",
                    nameGrp2(gregLO3ofRM(modrm)), nameISize(sz) );
         if (shift_expr_txt)
            vex_printf("%s", shift_expr_txt);
         else
            ppIRExpr(shift_expr);
         vex_printf(", %s\n", dis_buf);
      }
   }
   return delta;
}
   3782 
   3783 
   3784 /* Group 8 extended opcodes (but BT/BTS/BTC/BTR only). */
/* Group 8 extended opcodes (but BT/BTS/BTC/BTR only), with an
   immediate bit-number operand.  The sub-opcode comes from the reg
   field of the modRM byte (4=BT 5=BTS 6=BTR 7=BTC); src_val is the
   immediate.  Returns delta advanced over the instruction; on an
   undecodable combination, *decode_OK is set False and delta is
   returned unchanged. */
static
ULong dis_Grp8_Imm ( const VexAbiInfo* vbi,
                     Prefix pfx,
                     Long delta, UChar modrm,
                     Int am_sz, Int sz, ULong src_val,
                     Bool* decode_OK )
{
   /* src_val denotes a d8.
      And delta on entry points at the modrm byte. */

   IRType ty     = szToITy(sz);
   IRTemp t2     = newTemp(Ity_I64);   /* original value, widened to 64 bits */
   IRTemp t2m    = newTemp(Ity_I64);   /* modified value (BTS/BTR/BTC only) */
   IRTemp t_addr = IRTemp_INVALID;
   HChar  dis_buf[50];
   ULong  mask;

   /* we're optimists :-) */
   *decode_OK = True;

   /* Check whether F2 or F3 are acceptable. */
   if (epartIsReg(modrm)) {
      /* F2 or F3 are not allowed in the register case. */
      if (haveF2orF3(pfx)) {
         *decode_OK = False;
         return delta;
      }
   } else {
      /* F2 or F3 (but not both) are allowable provided LOCK is also
         present. */
      if (haveF2orF3(pfx)) {
         if (haveF2andF3(pfx) || !haveLOCK(pfx)) {
            *decode_OK = False;
            return delta;
         }
      }
   }

   /* Limit src_val -- the bit offset -- to something within a word.
      The Intel docs say that literal offsets larger than a word are
      masked in this way. */
   switch (sz) {
      case 2:  src_val &= 15; break;
      case 4:  src_val &= 31; break;
      case 8:  src_val &= 63; break;
      default: *decode_OK = False; return delta;
   }

   /* Invent a mask suitable for the operation. */
   switch (gregLO3ofRM(modrm)) {
      case 4: /* BT */  mask = 0;                  break;
      case 5: /* BTS */ mask = 1ULL << src_val;    break;
      case 6: /* BTR */ mask = ~(1ULL << src_val); break;
      case 7: /* BTC */ mask = 1ULL << src_val;    break;
         /* If this needs to be extended, probably simplest to make a
            new function to handle the other cases (0 .. 3).  The
            Intel docs do however not indicate any use for 0 .. 3, so
            we don't expect this to happen. */
      default: *decode_OK = False; return delta;
   }

   /* Fetch the value to be tested and modified into t2, which is
      64-bits wide regardless of sz. */
   if (epartIsReg(modrm)) {
      vassert(am_sz == 1);
      assign( t2, widenUto64(getIRegE(sz, pfx, modrm)) );
      delta += (am_sz + 1);   /* modrm byte + the d8 immediate */
      DIP("%s%c $0x%llx, %s\n", nameGrp8(gregLO3ofRM(modrm)),
                                nameISize(sz),
                                src_val, nameIRegE(sz,pfx,modrm));
   } else {
      Int len;
      t_addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 1 );
      delta  += (len+1);
      assign( t2, widenUto64(loadLE(ty, mkexpr(t_addr))) );
      DIP("%s%c $0x%llx, %s\n", nameGrp8(gregLO3ofRM(modrm)),
                                nameISize(sz),
                                src_val, dis_buf);
   }

   /* Compute the new value into t2m, if non-BT. */
   switch (gregLO3ofRM(modrm)) {
      case 4: /* BT */
         break;
      case 5: /* BTS */
         assign( t2m, binop(Iop_Or64, mkU64(mask), mkexpr(t2)) );
         break;
      case 6: /* BTR */
         assign( t2m, binop(Iop_And64, mkU64(mask), mkexpr(t2)) );
         break;
      case 7: /* BTC */
         assign( t2m, binop(Iop_Xor64, mkU64(mask), mkexpr(t2)) );
         break;
      default:
         /*NOTREACHED*/ /*the previous switch guards this*/
         vassert(0);
   }

   /* Write the result back, if non-BT. */
   if (gregLO3ofRM(modrm) != 4 /* BT */) {
      if (epartIsReg(modrm)) {
         putIRegE(sz, pfx, modrm, narrowTo(ty, mkexpr(t2m)));
      } else {
         if (haveLOCK(pfx)) {
            /* LOCKed memory form: commit via compare-and-swap,
               restarting the instruction if memory changed. */
            casLE( mkexpr(t_addr),
                   narrowTo(ty, mkexpr(t2))/*expd*/,
                   narrowTo(ty, mkexpr(t2m))/*new*/,
                   guest_RIP_curr_instr );
         } else {
            storeLE(mkexpr(t_addr), narrowTo(ty, mkexpr(t2m)));
         }
      }
   }

   /* Copy relevant bit from t2 into the carry flag. */
   /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop(Iop_And64,
                  binop(Iop_Shr64, mkexpr(t2), mkU8(src_val)),
                  mkU64(1))
       ));
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

   return delta;
}
   3915 
   3916 
/* Signed/unsigned widening multiply.  Generate IR to multiply the
   value in RAX/EAX/AX/AL by the given IRTemp, and park the result in
   RDX:RAX/EDX:EAX/DX:AX/AX.

   sz      operand size in bytes (8/4/2/1); selects the accumulator pair
   syned   True for signed (IMUL), False for unsigned (MUL)
   tmp     the multiplier operand, already evaluated by the caller
   tmp_txt text naming the multiplier; used only for DIP disassembly
           printing
*/
static void codegen_mulL_A_D ( Int sz, Bool syned,
                               IRTemp tmp, const HChar* tmp_txt )
{
   IRType ty = szToITy(sz);
   IRTemp t1 = newTemp(ty);

   /* The multiplicand always comes from the accumulator register of
      the operand size (RAX/EAX/AX/AL). */
   assign( t1, getIRegRAX(sz) );

   switch (ty) {
      case Ity_I64: {
         /* 64 x 64 -> 128 bit multiply; RDX:RAX receives the product. */
         IRTemp res128  = newTemp(Ity_I128);
         IRTemp resHi   = newTemp(Ity_I64);
         IRTemp resLo   = newTemp(Ity_I64);
         IROp   mulOp   = syned ? Iop_MullS64 : Iop_MullU64;
         /* The byte-sized thunk opcode is passed as a base value;
            setFlags_MUL presumably adjusts it for the actual operand
            width -- TODO confirm against its definition. */
         UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
         setFlags_MUL ( Ity_I64, t1, tmp, tBaseOp );
         assign( res128, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
         assign( resHi, unop(Iop_128HIto64,mkexpr(res128)));
         assign( resLo, unop(Iop_128to64,mkexpr(res128)));
         putIReg64(R_RDX, mkexpr(resHi));
         putIReg64(R_RAX, mkexpr(resLo));
         break;
      }
      case Ity_I32: {
         /* 32 x 32 -> 64 bit multiply; EDX:EAX receives the product. */
         IRTemp res64   = newTemp(Ity_I64);
         IRTemp resHi   = newTemp(Ity_I32);
         IRTemp resLo   = newTemp(Ity_I32);
         IROp   mulOp   = syned ? Iop_MullS32 : Iop_MullU32;
         UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
         setFlags_MUL ( Ity_I32, t1, tmp, tBaseOp );
         assign( res64, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
         assign( resHi, unop(Iop_64HIto32,mkexpr(res64)));
         assign( resLo, unop(Iop_64to32,mkexpr(res64)));
         putIRegRDX(4, mkexpr(resHi));
         putIRegRAX(4, mkexpr(resLo));
         break;
      }
      case Ity_I16: {
         /* 16 x 16 -> 32 bit multiply; DX:AX receives the product. */
         IRTemp res32   = newTemp(Ity_I32);
         IRTemp resHi   = newTemp(Ity_I16);
         IRTemp resLo   = newTemp(Ity_I16);
         IROp   mulOp   = syned ? Iop_MullS16 : Iop_MullU16;
         UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
         setFlags_MUL ( Ity_I16, t1, tmp, tBaseOp );
         assign( res32, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
         assign( resHi, unop(Iop_32HIto16,mkexpr(res32)));
         assign( resLo, unop(Iop_32to16,mkexpr(res32)));
         putIRegRDX(2, mkexpr(resHi));
         putIRegRAX(2, mkexpr(resLo));
         break;
      }
      case Ity_I8: {
         /* 8 x 8 -> 16 bit multiply; the whole 16-bit product lands
            in AX, so DX is not touched.  Note resHi/resLo are
            assigned but unused on this path (dead IR). */
         IRTemp res16   = newTemp(Ity_I16);
         IRTemp resHi   = newTemp(Ity_I8);
         IRTemp resLo   = newTemp(Ity_I8);
         IROp   mulOp   = syned ? Iop_MullS8 : Iop_MullU8;
         UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
         setFlags_MUL ( Ity_I8, t1, tmp, tBaseOp );
         assign( res16, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
         assign( resHi, unop(Iop_16HIto8,mkexpr(res16)));
         assign( resLo, unop(Iop_16to8,mkexpr(res16)));
         putIRegRAX(2, mkexpr(res16));
         break;
      }
      default:
         /* szToITy can only produce the four cases above for valid sz. */
         ppIRType(ty);
         vpanic("codegen_mulL_A_D(amd64)");
   }
   DIP("%s%c %s\n", syned ? "imul" : "mul", nameISize(sz), tmp_txt);
}
   3991 
   3992 
/* Group 3 extended opcodes: TEST, NOT, NEG, MUL, IMUL, DIV and IDIV,
   selected by the reg field (bits 5:3) of the modrm byte.  We have to
   decide here whether F2 and F3 might be valid.

   Returns the updated instruction offset; on an encoding this decoder
   does not handle, sets *decode_OK to False and returns. */
static
ULong dis_Grp3 ( const VexAbiInfo* vbi,
                 Prefix pfx, Int sz, Long delta, Bool* decode_OK )
{
   Long    d64;
   UChar   modrm;
   HChar   dis_buf[50];
   Int     len;
   IRTemp  addr;
   IRType  ty = szToITy(sz);
   IRTemp  t1 = newTemp(ty);
   IRTemp dst1, src, dst0;
   *decode_OK = True;
   modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      /* F2/XACQ and F3/XREL are always invalid in the non-mem case. */
      if (haveF2orF3(pfx)) goto unhandled;
      switch (gregLO3ofRM(modrm)) {
         case 0: { /* TEST */
            /* Immediate follows the modrm byte; at most 4 bytes even
               for 64-bit operand size (read sign-extended). */
            delta++;
            d64 = getSDisp(imin(4,sz), delta);
            delta += imin(4,sz);
            dst1 = newTemp(ty);
            assign(dst1, binop(mkSizedOp(ty,Iop_And8),
                               getIRegE(sz,pfx,modrm),
                               mkU(ty, d64 & mkSizeMask(sz))));
            /* Flags from the AND result; the result itself is discarded. */
            setFlags_DEP1( Iop_And8, dst1, ty );
            DIP("test%c $%lld, %s\n",
                nameISize(sz), d64,
                nameIRegE(sz, pfx, modrm));
            break;
         }
         case 1:
            /* /1 is not handled by this decoder. */
            *decode_OK = False;
            return delta;
         case 2: /* NOT */
            delta++;
            putIRegE(sz, pfx, modrm,
                              unop(mkSizedOp(ty,Iop_Not8),
                                   getIRegE(sz, pfx, modrm)));
            DIP("not%c %s\n", nameISize(sz),
                              nameIRegE(sz, pfx, modrm));
            break;
         case 3: /* NEG */
            /* NEG is computed as 0 - operand, with SUB-style flags. */
            delta++;
            dst0 = newTemp(ty);
            src  = newTemp(ty);
            dst1 = newTemp(ty);
            assign(dst0, mkU(ty,0));
            assign(src,  getIRegE(sz, pfx, modrm));
            assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0),
                                                       mkexpr(src)));
            setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
            putIRegE(sz, pfx, modrm, mkexpr(dst1));
            DIP("neg%c %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm));
            break;
         case 4: /* MUL (unsigned widening) */
            delta++;
            src = newTemp(ty);
            assign(src, getIRegE(sz,pfx,modrm));
            codegen_mulL_A_D ( sz, False, src,
                               nameIRegE(sz,pfx,modrm) );
            break;
         case 5: /* IMUL (signed widening) */
            delta++;
            src = newTemp(ty);
            assign(src, getIRegE(sz,pfx,modrm));
            codegen_mulL_A_D ( sz, True, src,
                               nameIRegE(sz,pfx,modrm) );
            break;
         case 6: /* DIV */
            delta++;
            assign( t1, getIRegE(sz, pfx, modrm) );
            codegen_div ( sz, t1, False );
            DIP("div%c %s\n", nameISize(sz),
                              nameIRegE(sz, pfx, modrm));
            break;
         case 7: /* IDIV */
            delta++;
            assign( t1, getIRegE(sz, pfx, modrm) );
            codegen_div ( sz, t1, True );
            DIP("idiv%c %s\n", nameISize(sz),
                               nameIRegE(sz, pfx, modrm));
            break;
         default:
            /*NOTREACHED*/
            vpanic("Grp3(amd64,R)");
      }
   } else {
      /* Decide if F2/XACQ or F3/XREL might be valid: allowed only
         singly (not both), together with LOCK, on NOT and NEG. */
      Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
      if ((gregLO3ofRM(modrm) == 3/*NEG*/ || gregLO3ofRM(modrm) == 2/*NOT*/)
          && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
         validF2orF3 = True;
      }
      if (!validF2orF3) goto unhandled;
      /* */
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf,
                        /* we have to inform disAMode of any immediate
                           bytes used */
                        gregLO3ofRM(modrm)==0/*TEST*/
                           ? imin(4,sz)
                           : 0
                      );
      /* Fresh temp for the loaded memory operand; the t1 created at
         function entry is only used on the register path above. */
      t1   = newTemp(ty);
      delta += len;
      assign(t1, loadLE(ty,mkexpr(addr)));
      switch (gregLO3ofRM(modrm)) {
         case 0: { /* TEST */
            d64 = getSDisp(imin(4,sz), delta);
            delta += imin(4,sz);
            dst1 = newTemp(ty);
            assign(dst1, binop(mkSizedOp(ty,Iop_And8),
                               mkexpr(t1),
                               mkU(ty, d64 & mkSizeMask(sz))));
            setFlags_DEP1( Iop_And8, dst1, ty );
            DIP("test%c $%lld, %s\n", nameISize(sz), d64, dis_buf);
            break;
         }
         case 1:
            /* /1 is not handled by this decoder. */
            *decode_OK = False;
            return delta;
         case 2: /* NOT */
            dst1 = newTemp(ty);
            assign(dst1, unop(mkSizedOp(ty,Iop_Not8), mkexpr(t1)));
            /* Under LOCK, write back with a compare-and-swap so the
               read-modify-write is atomic. */
            if (haveLOCK(pfx)) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
                                    guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(dst1) );
            }
            DIP("not%c %s\n", nameISize(sz), dis_buf);
            break;
         case 3: /* NEG */
            dst0 = newTemp(ty);
            src  = newTemp(ty);
            dst1 = newTemp(ty);
            assign(dst0, mkU(ty,0));
            assign(src,  mkexpr(t1));
            assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0),
                                                       mkexpr(src)));
            if (haveLOCK(pfx)) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
                                    guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(dst1) );
            }
            setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
            DIP("neg%c %s\n", nameISize(sz), dis_buf);
            break;
         case 4: /* MUL (unsigned widening) */
            codegen_mulL_A_D ( sz, False, t1, dis_buf );
            break;
         case 5: /* IMUL */
            codegen_mulL_A_D ( sz, True, t1, dis_buf );
            break;
         case 6: /* DIV */
            codegen_div ( sz, t1, False );
            DIP("div%c %s\n", nameISize(sz), dis_buf);
            break;
         case 7: /* IDIV */
            codegen_div ( sz, t1, True );
            DIP("idiv%c %s\n", nameISize(sz), dis_buf);
            break;
         default:
            /*NOTREACHED*/
            vpanic("Grp3(amd64,M)");
      }
   }
   return delta;
  unhandled:
   *decode_OK = False;
   return delta;
}
   4169 
   4170 
/* Group 4 extended opcodes: byte-sized INC (/0) and DEC (/1),
   selected by the reg field of the modrm byte.  We have to decide
   here whether F2 and F3 might be valid. */
static
ULong dis_Grp4 ( const VexAbiInfo* vbi,
                 Prefix pfx, Long delta, Bool* decode_OK )
{
   Int   alen;
   UChar modrm;
   HChar dis_buf[50];
   IRType ty = Ity_I8;
   IRTemp t1 = newTemp(ty);   /* original operand value */
   IRTemp t2 = newTemp(ty);   /* incremented/decremented value */

   *decode_OK = True;

   modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      /* F2/XACQ and F3/XREL are always invalid in the non-mem case. */
      if (haveF2orF3(pfx)) goto unhandled;
      assign(t1, getIRegE(1, pfx, modrm));
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
            putIRegE(1, pfx, modrm, mkexpr(t2));
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
            putIRegE(1, pfx, modrm, mkexpr(t2));
            setFlags_INC_DEC( False, t2, ty );
            break;
         default:
            /* Only /0 and /1 are handled for this group. */
            *decode_OK = False;
            return delta;
      }
      delta++;
      DIP("%sb %s\n", nameGrp4(gregLO3ofRM(modrm)),
                      nameIRegE(1, pfx, modrm));
   } else {
      /* Decide if F2/XACQ or F3/XREL might be valid: allowed only
         singly (not both), together with LOCK, on INC/DEC. */
      Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
      if ((gregLO3ofRM(modrm) == 0/*INC*/ || gregLO3ofRM(modrm) == 1/*DEC*/)
          && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
         validF2orF3 = True;
      }
      if (!validF2orF3) goto unhandled;
      /* */
      IRTemp addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( t1, loadLE(ty, mkexpr(addr)) );
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
            /* Under LOCK, write back with a compare-and-swap so the
               read-modify-write is atomic. */
            if (haveLOCK(pfx)) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
                      guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(t2) );
            }
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
            if (haveLOCK(pfx)) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
                      guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(t2) );
            }
            setFlags_INC_DEC( False, t2, ty );
            break;
         default:
            *decode_OK = False;
            return delta;
      }
      delta += alen;
      DIP("%sb %s\n", nameGrp4(gregLO3ofRM(modrm)), dis_buf);
   }
   return delta;
  unhandled:
   *decode_OK = False;
   return delta;
}
   4253 
   4254 
/* Group 5 extended opcodes: INC (/0), DEC (/1), CALL (/2), JMP (/4)
   and PUSH (/6) on an Ev operand, selected by the reg field of the
   modrm byte.  We have to decide here whether F2 and F3 might be
   valid.  CALL and JMP end the block (dres set to Dis_StopHere). */
static
ULong dis_Grp5 ( const VexAbiInfo* vbi,
                 Prefix pfx, Int sz, Long delta,
                 /*MOD*/DisResult* dres, /*OUT*/Bool* decode_OK )
{
   Int     len;
   UChar   modrm;
   HChar   dis_buf[50];
   IRTemp  addr = IRTemp_INVALID;
   IRType  ty = szToITy(sz);
   IRTemp  t1 = newTemp(ty);       /* original operand value */
   IRTemp  t2 = IRTemp_INVALID;    /* result, or the updated RSP */
   IRTemp  t3 = IRTemp_INVALID;    /* branch target / value pushed */
   Bool    showSz = True;          /* print a size suffix in DIP? */

   *decode_OK = True;

   modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      /* F2/XACQ and F3/XREL are always invalid in the non-mem case.
         F2/CALL and F2/JMP may have bnd prefix. */
     if (haveF2orF3(pfx)
         && ! (haveF2(pfx)
               && (gregLO3ofRM(modrm) == 2 || gregLO3ofRM(modrm) == 4)))
        goto unhandledR;
      assign(t1, getIRegE(sz,pfx,modrm));
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Add8),
                             mkexpr(t1), mkU(ty,1)));
            setFlags_INC_DEC( True, t2, ty );
            putIRegE(sz,pfx,modrm, mkexpr(t2));
            break;
         case 1: /* DEC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
                             mkexpr(t1), mkU(ty,1)));
            setFlags_INC_DEC( False, t2, ty );
            putIRegE(sz,pfx,modrm, mkexpr(t2));
            break;
         case 2: /* call Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandledR;
            if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
            sz = 8;
            /* Read the target before RSP is modified. */
            t3 = newTemp(Ity_I64);
            assign(t3, getIRegE(sz,pfx,modrm));
            t2 = newTemp(Ity_I64);
            assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
            putIReg64(R_RSP, mkexpr(t2));
            /* Push the return address: delta+1 skips the modrm byte. */
            storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+1));
            make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(reg)");
            jmp_treg(dres, Ijk_Call, t3);
            vassert(dres->whatNext == Dis_StopHere);
            showSz = False;
            break;
         case 4: /* jmp Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandledR;
            if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, getIRegE(sz,pfx,modrm));
            jmp_treg(dres, Ijk_Boring, t3);
            vassert(dres->whatNext == Dis_StopHere);
            showSz = False;
            break;
         case 6: /* PUSH Ev */
            /* There is no encoding for 32-bit operand size; hence ... */
            if (sz == 4) sz = 8;
            if (sz == 8 || sz == 2) {
               ty = szToITy(sz); /* redo it, since sz might have changed */
               t3 = newTemp(ty);
               assign(t3, getIRegE(sz,pfx,modrm));
               t2 = newTemp(Ity_I64);
               assign( t2, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
               putIReg64(R_RSP, mkexpr(t2) );
               storeLE( mkexpr(t2), mkexpr(t3) );
               break;
            } else {
               goto unhandledR; /* awaiting test case */
            }
         default:
         unhandledR:
            *decode_OK = False;
            return delta;
      }
      delta++;
      DIP("%s%c %s\n", nameGrp5(gregLO3ofRM(modrm)),
                       showSz ? nameISize(sz) : ' ',
                       nameIRegE(sz, pfx, modrm));
   } else {
      /* Decide if F2/XACQ, F3/XREL, F2/CALL or F2/JMP might be valid. */
      Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
      if ((gregLO3ofRM(modrm) == 0/*INC*/ || gregLO3ofRM(modrm) == 1/*DEC*/)
          && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
         validF2orF3 = True;
      } else if ((gregLO3ofRM(modrm) == 2 || gregLO3ofRM(modrm) == 4)
                 && (haveF2(pfx) && !haveF3(pfx))) {
         validF2orF3 = True;
      }
      if (!validF2orF3) goto unhandledM;
      /* */
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
      /* Only INC/DEC need the operand pre-loaded into t1; CALL, JMP
         and PUSH load it themselves below. */
      if (gregLO3ofRM(modrm) != 2 && gregLO3ofRM(modrm) != 4
                                  && gregLO3ofRM(modrm) != 6) {
         assign(t1, loadLE(ty,mkexpr(addr)));
      }
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Add8),
                             mkexpr(t1), mkU(ty,1)));
            /* Under LOCK, write back atomically via compare-and-swap. */
            if (haveLOCK(pfx)) {
               casLE( mkexpr(addr),
                      mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
            } else {
               storeLE(mkexpr(addr),mkexpr(t2));
            }
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
                             mkexpr(t1), mkU(ty,1)));
            if (haveLOCK(pfx)) {
               casLE( mkexpr(addr),
                      mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
            } else {
               storeLE(mkexpr(addr),mkexpr(t2));
            }
            setFlags_INC_DEC( False, t2, ty );
            break;
         case 2: /* call Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandledM;
            if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, loadLE(Ity_I64,mkexpr(addr)));
            t2 = newTemp(Ity_I64);
            assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
            putIReg64(R_RSP, mkexpr(t2));
            /* Push the return address: delta+len skips the amode bytes. */
            storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+len));
            make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(mem)");
            jmp_treg(dres, Ijk_Call, t3);
            vassert(dres->whatNext == Dis_StopHere);
            showSz = False;
            break;
         case 4: /* JMP Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandledM;
            if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, loadLE(Ity_I64,mkexpr(addr)));
            jmp_treg(dres, Ijk_Boring, t3);
            vassert(dres->whatNext == Dis_StopHere);
            showSz = False;
            break;
         case 6: /* PUSH Ev */
            /* There is no encoding for 32-bit operand size; hence ... */
            if (sz == 4) sz = 8;
            if (sz == 8 || sz == 2) {
               ty = szToITy(sz); /* redo it, since sz might have changed */
               t3 = newTemp(ty);
               assign(t3, loadLE(ty,mkexpr(addr)));
               t2 = newTemp(Ity_I64);
               assign( t2, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
               putIReg64(R_RSP, mkexpr(t2) );
               storeLE( mkexpr(t2), mkexpr(t3) );
               break;
            } else {
               goto unhandledM; /* awaiting test case */
            }
         default:
         unhandledM:
            *decode_OK = False;
            return delta;
      }
      delta += len;
      DIP("%s%c %s\n", nameGrp5(gregLO3ofRM(modrm)),
                       showSz ? nameISize(sz) : ' ',
                       dis_buf);
   }
   return delta;
}
   4445 
   4446 
   4447 /*------------------------------------------------------------*/
   4448 /*--- Disassembling string ops (including REP prefixes)    ---*/
   4449 /*------------------------------------------------------------*/
   4450 
   4451 /* Code shared by all the string ops */
   4452 static
   4453 void dis_string_op_increment ( Int sz, IRTemp t_inc )
   4454 {
   4455    UChar logSz;
   4456    if (sz == 8 || sz == 4 || sz == 2) {
   4457       logSz = 1;
   4458       if (sz == 4) logSz = 2;
   4459       if (sz == 8) logSz = 3;
   4460       assign( t_inc,
   4461               binop(Iop_Shl64, IRExpr_Get( OFFB_DFLAG, Ity_I64 ),
   4462                                mkU8(logSz) ) );
   4463    } else {
   4464       assign( t_inc,
   4465               IRExpr_Get( OFFB_DFLAG, Ity_I64 ) );
   4466    }
   4467 }
   4468 
   4469 static
   4470 void dis_string_op( void (*dis_OP)( Int, IRTemp, Prefix pfx ),
   4471                     Int sz, const HChar* name, Prefix pfx )
   4472 {
   4473    IRTemp t_inc = newTemp(Ity_I64);
   4474    /* Really we ought to inspect the override prefixes, but we don't.
   4475       The following assertion catches any resulting sillyness. */
   4476    vassert(pfx == clearSegBits(pfx));
   4477    dis_string_op_increment(sz, t_inc);
   4478    dis_OP( sz, t_inc, pfx );
   4479    DIP("%s%c\n", name, nameISize(sz));
   4480 }
   4481 
   4482 static
   4483 void dis_MOVS ( Int sz, IRTemp t_inc, Prefix pfx )
   4484 {
   4485    IRType ty = szToITy(sz);
   4486    IRTemp td = newTemp(Ity_I64);   /* RDI */
   4487    IRTemp ts = newTemp(Ity_I64);   /* RSI */
   4488    IRExpr *incd, *incs;
   4489 
   4490    if (haveASO(pfx)) {
   4491       assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   4492       assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
   4493    } else {
   4494       assign( td, getIReg64(R_RDI) );
   4495       assign( ts, getIReg64(R_RSI) );
   4496    }
   4497 
   4498    storeLE( mkexpr(td), loadLE(ty,mkexpr(ts)) );
   4499 
   4500    incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   4501    incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
   4502    if (haveASO(pfx)) {
   4503       incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   4504       incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
   4505    }
   4506    putIReg64( R_RDI, incd );
   4507    putIReg64( R_RSI, incs );
   4508 }
   4509 
   4510 static
   4511 void dis_LODS ( Int sz, IRTemp t_inc, Prefix pfx )
   4512 {
   4513    IRType ty = szToITy(sz);
   4514    IRTemp ts = newTemp(Ity_I64);   /* RSI */
   4515    IRExpr *incs;
   4516 
   4517    if (haveASO(pfx))
   4518       assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
   4519    else
   4520       assign( ts, getIReg64(R_RSI) );
   4521 
   4522    putIRegRAX ( sz, loadLE(ty, mkexpr(ts)) );
   4523 
   4524    incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
   4525    if (haveASO(pfx))
   4526       incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
   4527    putIReg64( R_RSI, incs );
   4528 }
   4529 
   4530 static
   4531 void dis_STOS ( Int sz, IRTemp t_inc, Prefix pfx )
   4532 {
   4533    IRType ty = szToITy(sz);
   4534    IRTemp ta = newTemp(ty);        /* rAX */
   4535    IRTemp td = newTemp(Ity_I64);   /* RDI */
   4536    IRExpr *incd;
   4537 
   4538    assign( ta, getIRegRAX(sz) );
   4539 
   4540    if (haveASO(pfx))
   4541       assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   4542    else
   4543       assign( td, getIReg64(R_RDI) );
   4544 
   4545    storeLE( mkexpr(td), mkexpr(ta) );
   4546 
   4547    incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   4548    if (haveASO(pfx))
   4549       incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   4550    putIReg64( R_RDI, incd );
   4551 }
   4552 
   4553 static
   4554 void dis_CMPS ( Int sz, IRTemp t_inc, Prefix pfx )
   4555 {
   4556    IRType ty  = szToITy(sz);
   4557    IRTemp tdv = newTemp(ty);      /* (RDI) */
   4558    IRTemp tsv = newTemp(ty);      /* (RSI) */
   4559    IRTemp td  = newTemp(Ity_I64); /*  RDI  */
   4560    IRTemp ts  = newTemp(Ity_I64); /*  RSI  */
   4561    IRExpr *incd, *incs;
   4562 
   4563    if (haveASO(pfx)) {
   4564       assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   4565       assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
   4566    } else {
   4567       assign( td, getIReg64(R_RDI) );
   4568       assign( ts, getIReg64(R_RSI) );
   4569    }
   4570 
   4571    assign( tdv, loadLE(ty,mkexpr(td)) );
   4572 
   4573    assign( tsv, loadLE(ty,mkexpr(ts)) );
   4574 
   4575    setFlags_DEP1_DEP2 ( Iop_Sub8, tsv, tdv, ty );
   4576 
   4577    incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   4578    incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
   4579    if (haveASO(pfx)) {
   4580       incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   4581       incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
   4582    }
   4583    putIReg64( R_RDI, incd );
   4584    putIReg64( R_RSI, incs );
   4585 }
   4586 
   4587 static
   4588 void dis_SCAS ( Int sz, IRTemp t_inc, Prefix pfx )
   4589 {
   4590    IRType ty  = szToITy(sz);
   4591    IRTemp ta  = newTemp(ty);       /*  rAX  */
   4592    IRTemp td  = newTemp(Ity_I64);  /*  RDI  */
   4593    IRTemp tdv = newTemp(ty);       /* (RDI) */
   4594    IRExpr *incd;
   4595 
   4596    assign( ta, getIRegRAX(sz) );
   4597 
   4598    if (haveASO(pfx))
   4599       assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   4600    else
   4601       assign( td, getIReg64(R_RDI) );
   4602 
   4603    assign( tdv, loadLE(ty,mkexpr(td)) );
   4604 
   4605    setFlags_DEP1_DEP2 ( Iop_Sub8, ta, tdv, ty );
   4606 
   4607    incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   4608    if (haveASO(pfx))
   4609       incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   4610    putIReg64( R_RDI, incd );
   4611 }
   4612 
   4613 
/* Wrap the appropriate string op inside a REP/REPE/REPNE.  We assume
   the insn is the last one in the basic block, and so emit a jump to
   the next insn, rather than just falling through.

   cond      AMD64CondAlways for plain REP; otherwise the condition
             under which the loop repeats (REPE/REPNE)
   dis_OP    emits the IR for one iteration of the string op
   sz        operand size in bytes
   rip       guest address of this instruction (loop-back target)
   rip_next  guest address of the next instruction (loop-exit target)
*/
static
void dis_REP_op ( /*MOD*/DisResult* dres,
                  AMD64Condcode cond,
                  void (*dis_OP)(Int, IRTemp, Prefix),
                  Int sz, Addr64 rip, Addr64 rip_next, const HChar* name,
                  Prefix pfx )
{
   IRTemp t_inc = newTemp(Ity_I64);
   IRTemp tc;
   IRExpr* cmp;

   /* Really we ought to inspect the override prefixes, but we don't.
      The following assertion catches any resulting sillyness. */
   vassert(pfx == clearSegBits(pfx));

   /* Fetch the iteration count: ECX under an address-size override,
      RCX otherwise. */
   if (haveASO(pfx)) {
      tc = newTemp(Ity_I32);  /*  ECX  */
      assign( tc, getIReg32(R_RCX) );
      cmp = binop(Iop_CmpEQ32, mkexpr(tc), mkU32(0));
   } else {
      tc = newTemp(Ity_I64);  /*  RCX  */
      assign( tc, getIReg64(R_RCX) );
      cmp = binop(Iop_CmpEQ64, mkexpr(tc), mkU64(0));
   }

   /* Count already zero?  Then skip the whole op and fall through to
      the next instruction. */
   stmt( IRStmt_Exit( cmp, Ijk_Boring,
                      IRConst_U64(rip_next), OFFB_RIP ) );

   /* Count down by one for this iteration. */
   if (haveASO(pfx))
      putIReg32(R_RCX, binop(Iop_Sub32, mkexpr(tc), mkU32(1)) );
  else
      putIReg64(R_RCX, binop(Iop_Sub64, mkexpr(tc), mkU64(1)) );

   /* Emit exactly one iteration of the string op. */
   dis_string_op_increment(sz, t_inc);
   dis_OP (sz, t_inc, pfx);

   if (cond == AMD64CondAlways) {
      /* Plain REP: unconditionally loop back to this instruction. */
      jmp_lit(dres, Ijk_Boring, rip);
      vassert(dres->whatNext == Dis_StopHere);
   } else {
      /* REPE/REPNE: loop back only while 'cond' holds; otherwise fall
         through to the next instruction. */
      stmt( IRStmt_Exit( mk_amd64g_calculate_condition(cond),
                         Ijk_Boring,
                         IRConst_U64(rip),
                         OFFB_RIP ) );
      jmp_lit(dres, Ijk_Boring, rip_next);
      vassert(dres->whatNext == Dis_StopHere);
   }
   DIP("%s%c\n", name, nameISize(sz));
}
   4666 
   4667 
   4668 /*------------------------------------------------------------*/
   4669 /*--- Arithmetic, etc.                                     ---*/
   4670 /*------------------------------------------------------------*/
   4671 
   4672 /* IMUL E, G.  Supplied eip points to the modR/M byte. */
   4673 static
   4674 ULong dis_mul_E_G ( const VexAbiInfo* vbi,
   4675                     Prefix      pfx,
   4676                     Int         size,
   4677                     Long        delta0 )
   4678 {
   4679    Int    alen;
   4680    HChar  dis_buf[50];
   4681    UChar  rm = getUChar(delta0);
   4682    IRType ty = szToITy(size);
   4683    IRTemp te = newTemp(ty);
   4684    IRTemp tg = newTemp(ty);
   4685    IRTemp resLo = newTemp(ty);
   4686 
   4687    assign( tg, getIRegG(size, pfx, rm) );
   4688    if (epartIsReg(rm)) {
   4689       assign( te, getIRegE(size, pfx, rm) );
   4690    } else {
   4691       IRTemp addr = disAMode( &alen, vbi, pfx, delta0, dis_buf, 0 );
   4692       assign( te, loadLE(ty,mkexpr(addr)) );
   4693    }
   4694 
   4695    setFlags_MUL ( ty, te, tg, AMD64G_CC_OP_SMULB );
   4696 
   4697    assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tg) ) );
   4698 
   4699    putIRegG(size, pfx, rm, mkexpr(resLo) );
   4700 
   4701    if (epartIsReg(rm)) {
   4702       DIP("imul%c %s, %s\n", nameISize(size),
   4703                              nameIRegE(size,pfx,rm),
   4704                              nameIRegG(size,pfx,rm));
   4705       return 1+delta0;
   4706    } else {
   4707       DIP("imul%c %s, %s\n", nameISize(size),
   4708                              dis_buf,
   4709                              nameIRegG(size,pfx,rm));
   4710       return alen+delta0;
   4711    }
   4712 }
   4713 
   4714 
   4715 /* IMUL I * E -> G.  Supplied rip points to the modR/M byte. */
static
ULong dis_imul_I_E_G ( const VexAbiInfo* vbi,
                       Prefix      pfx,
                       Int         size,
                       Long        delta,
                       Int         litsize )
{
   /* IMUL imm, r/m, r: signed multiply of E by an immediate, low half
      of the product written to G.  litsize is the nominal immediate
      size; at most 4 bytes are actually fetched and sign-extended. */
   Long   d64;                 /* the immediate, after masking */
   Int    alen;
   HChar  dis_buf[50];
   UChar  rm = getUChar(delta);
   IRType ty = szToITy(size);
   IRTemp te = newTemp(ty);    /* the E (reg-or-mem) operand */
   IRTemp tl = newTemp(ty);    /* the immediate as an IR value */
   IRTemp resLo = newTemp(ty); /* low half of the product */

   vassert(/*size == 1 ||*/ size == 2 || size == 4 || size == 8);

   if (epartIsReg(rm)) {
      assign(te, getIRegE(size, pfx, rm));
      delta++;
   } else {
      /* Tell disAMode how many immediate bytes follow the amode, so
         RIP-relative addresses are computed correctly. */
      IRTemp addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
                                     imin(4,litsize) );
      assign(te, loadLE(ty, mkexpr(addr)));
      delta += alen;
   }
   /* Fetch the sign-extended immediate (never more than 4 bytes,
      even for 8-byte operand sizes). */
   d64 = getSDisp(imin(4,litsize),delta);
   delta += imin(4,litsize);

   /* Truncate the immediate to the operand width. */
   d64 &= mkSizeMask(size);
   assign(tl, mkU(ty,d64));

   assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tl) ));

   /* Record the flag thunk for a signed multiply at this width. */
   setFlags_MUL ( ty, te, tl, AMD64G_CC_OP_SMULB );

   putIRegG(size, pfx, rm, mkexpr(resLo));

   DIP("imul%c $%lld, %s, %s\n",
       nameISize(size), d64,
       ( epartIsReg(rm) ? nameIRegE(size,pfx,rm) : dis_buf ),
       nameIRegG(size,pfx,rm) );
   return delta;
}
   4761 
   4762 
   4763 /* Generate an IR sequence to do a popcount operation on the supplied
   4764    IRTemp, and return a new IRTemp holding the result.  'ty' may be
   4765    Ity_I16, Ity_I32 or Ity_I64 only. */
   4766 static IRTemp gen_POPCOUNT ( IRType ty, IRTemp src )
   4767 {
   4768    Int i;
   4769    if (ty == Ity_I16) {
   4770       IRTemp old = IRTemp_INVALID;
   4771       IRTemp nyu = IRTemp_INVALID;
   4772       IRTemp mask[4], shift[4];
   4773       for (i = 0; i < 4; i++) {
   4774          mask[i]  = newTemp(ty);
   4775          shift[i] = 1 << i;
   4776       }
   4777       assign(mask[0], mkU16(0x5555));
   4778       assign(mask[1], mkU16(0x3333));
   4779       assign(mask[2], mkU16(0x0F0F));
   4780       assign(mask[3], mkU16(0x00FF));
   4781       old = src;
   4782       for (i = 0; i < 4; i++) {
   4783          nyu = newTemp(ty);
   4784          assign(nyu,
   4785                 binop(Iop_Add16,
   4786                       binop(Iop_And16,
   4787                             mkexpr(old),
   4788                             mkexpr(mask[i])),
   4789                       binop(Iop_And16,
   4790                             binop(Iop_Shr16, mkexpr(old), mkU8(shift[i])),
   4791                             mkexpr(mask[i]))));
   4792          old = nyu;
   4793       }
   4794       return nyu;
   4795    }
   4796    if (ty == Ity_I32) {
   4797       IRTemp old = IRTemp_INVALID;
   4798       IRTemp nyu = IRTemp_INVALID;
   4799       IRTemp mask[5], shift[5];
   4800       for (i = 0; i < 5; i++) {
   4801          mask[i]  = newTemp(ty);
   4802          shift[i] = 1 << i;
   4803       }
   4804       assign(mask[0], mkU32(0x55555555));
   4805       assign(mask[1], mkU32(0x33333333));
   4806       assign(mask[2], mkU32(0x0F0F0F0F));
   4807       assign(mask[3], mkU32(0x00FF00FF));
   4808       assign(mask[4], mkU32(0x0000FFFF));
   4809       old = src;
   4810       for (i = 0; i < 5; i++) {
   4811          nyu = newTemp(ty);
   4812          assign(nyu,
   4813                 binop(Iop_Add32,
   4814                       binop(Iop_And32,
   4815                             mkexpr(old),
   4816                             mkexpr(mask[i])),
   4817                       binop(Iop_And32,
   4818                             binop(Iop_Shr32, mkexpr(old), mkU8(shift[i])),
   4819                             mkexpr(mask[i]))));
   4820          old = nyu;
   4821       }
   4822       return nyu;
   4823    }
   4824    if (ty == Ity_I64) {
   4825       IRTemp old = IRTemp_INVALID;
   4826       IRTemp nyu = IRTemp_INVALID;
   4827       IRTemp mask[6], shift[6];
   4828       for (i = 0; i < 6; i++) {
   4829          mask[i]  = newTemp(ty);
   4830          shift[i] = 1 << i;
   4831       }
   4832       assign(mask[0], mkU64(0x5555555555555555ULL));
   4833       assign(mask[1], mkU64(0x3333333333333333ULL));
   4834       assign(mask[2], mkU64(0x0F0F0F0F0F0F0F0FULL));
   4835       assign(mask[3], mkU64(0x00FF00FF00FF00FFULL));
   4836       assign(mask[4], mkU64(0x0000FFFF0000FFFFULL));
   4837       assign(mask[5], mkU64(0x00000000FFFFFFFFULL));
   4838       old = src;
   4839       for (i = 0; i < 6; i++) {
   4840          nyu = newTemp(ty);
   4841          assign(nyu,
   4842                 binop(Iop_Add64,
   4843                       binop(Iop_And64,
   4844                             mkexpr(old),
   4845                             mkexpr(mask[i])),
   4846                       binop(Iop_And64,
   4847                             binop(Iop_Shr64, mkexpr(old), mkU8(shift[i])),
   4848                             mkexpr(mask[i]))));
   4849          old = nyu;
   4850       }
   4851       return nyu;
   4852    }
   4853    /*NOTREACHED*/
   4854    vassert(0);
   4855 }
   4856 
   4857 
   4858 /* Generate an IR sequence to do a count-leading-zeroes operation on
   4859    the supplied IRTemp, and return a new IRTemp holding the result.
   4860    'ty' may be Ity_I16, Ity_I32 or Ity_I64 only.  In the case where
   4861    the argument is zero, return the number of bits in the word (the
   4862    natural semantics). */
   4863 static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
   4864 {
   4865    vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16);
   4866 
   4867    IRTemp src64 = newTemp(Ity_I64);
   4868    assign(src64, widenUto64( mkexpr(src) ));
   4869 
   4870    IRTemp src64x = newTemp(Ity_I64);
   4871    assign(src64x,
   4872           binop(Iop_Shl64, mkexpr(src64),
   4873                            mkU8(64 - 8 * sizeofIRType(ty))));
   4874 
   4875    // Clz64 has undefined semantics when its input is zero, so
   4876    // special-case around that.
   4877    IRTemp res64 = newTemp(Ity_I64);
   4878    assign(res64,
   4879           IRExpr_ITE(
   4880              binop(Iop_CmpEQ64, mkexpr(src64x), mkU64(0)),
   4881              mkU64(8 * sizeofIRType(ty)),
   4882              unop(Iop_Clz64, mkexpr(src64x))
   4883    ));
   4884 
   4885    IRTemp res = newTemp(ty);
   4886    assign(res, narrowTo(ty, mkexpr(res64)));
   4887    return res;
   4888 }
   4889 
   4890 
   4891 /* Generate an IR sequence to do a count-trailing-zeroes operation on
   4892    the supplied IRTemp, and return a new IRTemp holding the result.
   4893    'ty' may be Ity_I16, Ity_I32 or Ity_I64 only.  In the case where
   4894    the argument is zero, return the number of bits in the word (the
   4895    natural semantics). */
   4896 static IRTemp gen_TZCNT ( IRType ty, IRTemp src )
   4897 {
   4898    vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16);
   4899 
   4900    IRTemp src64 = newTemp(Ity_I64);
   4901    assign(src64, widenUto64( mkexpr(src) ));
   4902 
   4903    // Ctz64 has undefined semantics when its input is zero, so
   4904    // special-case around that.
   4905    IRTemp res64 = newTemp(Ity_I64);
   4906    assign(res64,
   4907           IRExpr_ITE(
   4908              binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0)),
   4909              mkU64(8 * sizeofIRType(ty)),
   4910              unop(Iop_Ctz64, mkexpr(src64))
   4911    ));
   4912 
   4913    IRTemp res = newTemp(ty);
   4914    assign(res, narrowTo(ty, mkexpr(res64)));
   4915    return res;
   4916 }
   4917 
   4918 
   4919 /*------------------------------------------------------------*/
   4920 /*---                                                      ---*/
   4921 /*--- x87 FLOATING POINT INSTRUCTIONS                      ---*/
   4922 /*---                                                      ---*/
   4923 /*------------------------------------------------------------*/
   4924 
   4925 /* --- Helper functions for dealing with the register stack. --- */
   4926 
   4927 /* --- Set the emulation-warning pseudo-register. --- */
   4928 
static void put_emwarn ( IRExpr* e /* :: Ity_I32 */ )
{
   /* The emulation-note word lives at OFFB_EMNOTE in the guest state
      and is always 32 bits wide. */
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   stmt( IRStmt_Put( OFFB_EMNOTE, e ) );
}
   4934 
   4935 /* --- Produce an IRExpr* denoting a 64-bit QNaN. --- */
   4936 
static IRExpr* mkQNaN64 ( void )
{
  /* QNaN is 0 2047 1 0(51times)
     == 0b 11111111111b 1 0(51times)
     == 0x7FF8 0000 0000 0000
   */
   /* IRConst_F64i carries the raw IEEE754 bit pattern directly, so
      this is exact and host-FP independent. */
   return IRExpr_Const(IRConst_F64i(0x7FF8000000000000ULL));
}
   4945 
   4946 /* --------- Get/put the top-of-stack pointer :: Ity_I32 --------- */
   4947 
static IRExpr* get_ftop ( void )
{
   /* FTOP (the x87 top-of-stack index) is modelled as a 32-bit guest
      state word. */
   return IRExpr_Get( OFFB_FTOP, Ity_I32 );
}
   4952 
static void put_ftop ( IRExpr* e )
{
   /* Write a new 32-bit value of FTOP. */
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   stmt( IRStmt_Put( OFFB_FTOP, e ) );
}
   4958 
   4959 /* --------- Get/put the C3210 bits. --------- */
   4960 
static IRExpr*  /* :: Ity_I64 */ get_C3210 ( void )
{
   /* The FPU C3..C0 condition bits, kept as a 64-bit guest word. */
   return IRExpr_Get( OFFB_FC3210, Ity_I64 );
}
   4965 
static void put_C3210 ( IRExpr* e  /* :: Ity_I64 */ )
{
   /* Write a new value of the C3..C0 condition bits. */
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
   stmt( IRStmt_Put( OFFB_FC3210, e ) );
}
   4971 
   4972 /* --------- Get/put the FPU rounding mode. --------- */
static IRExpr* /* :: Ity_I32 */ get_fpround ( void )
{
   /* FPROUND is stored as 64 bits in the guest state; narrow it to
      32 for use. */
   return unop(Iop_64to32, IRExpr_Get( OFFB_FPROUND, Ity_I64 ));
}
   4977 
static void put_fpround ( IRExpr* /* :: Ity_I32 */ e )
{
   /* Widen back to the 64-bit guest-state representation. */
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   stmt( IRStmt_Put( OFFB_FPROUND, unop(Iop_32Uto64,e) ) );
}
   4983 
   4984 
   4985 /* --------- Synthesise a 2-bit FPU rounding mode. --------- */
   4986 /* Produces a value in 0 .. 3, which is encoded as per the type
   4987    IRRoundingMode.  Since the guest_FPROUND value is also encoded as
   4988    per IRRoundingMode, we merely need to get it and mask it for
   4989    safety.
   4990 */
static IRExpr* /* :: Ity_I32 */ get_roundingmode ( void )
{
   /* guest_FPROUND is already encoded per IRRoundingMode (see the
      comment above); mask to 2 bits purely for safety. */
   return binop( Iop_And32, get_fpround(), mkU32(3) );
}
   4995 
static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
{
   /* Placeholder used where rounding-mode handling is not yet done
      properly (see the XXXROUNDINGFIXME call sites): always claims
      round-to-nearest, ignoring guest_FPROUND. */
   return mkU32(Irrm_NEAREST);
}
   5000 
   5001 
   5002 /* --------- Get/set FP register tag bytes. --------- */
   5003 
   5004 /* Given i, and some expression e, generate 'ST_TAG(i) = e'. */
   5005 
static void put_ST_TAG ( Int i, IRExpr* value )
{
   IRRegArray* descr;
   vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_I8);
   /* The 8 tag bytes form a cyclic array indexed relative to FTOP,
      so 'i' is a stack-relative register number. */
   descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
}
   5013 
   5014 /* Given i, generate an expression yielding 'ST_TAG(i)'.  This will be
   5015    zero to indicate "Empty" and nonzero to indicate "NonEmpty".  */
   5016 
static IRExpr* get_ST_TAG ( Int i )
{
   /* Tags form a cyclic array of 8 bytes, indexed relative to FTOP. */
   IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   return IRExpr_GetI( descr, get_ftop(), i );
}
   5022 
   5023 
   5024 /* --------- Get/set FP registers. --------- */
   5025 
   5026 /* Given i, and some expression e, emit 'ST(i) = e' and set the
   5027    register's tag to indicate the register is full.  The previous
   5028    state of the register is not checked. */
   5029 
static void put_ST_UNCHECKED ( Int i, IRExpr* value )
{
   IRRegArray* descr;
   vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_F64);
   /* The FP registers form a cyclic array of 8 F64s, indexed relative
      to FTOP. */
   descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
   /* Mark the register as in-use. */
   put_ST_TAG(i, mkU8(1));
}
   5039 
   5040 /* Given i, and some expression e, emit
   5041       ST(i) = is_full(i) ? NaN : e
   5042    and set the tag accordingly.
   5043 */
   5044 
   5045 static void put_ST ( Int i, IRExpr* value )
   5046 {
   5047    put_ST_UNCHECKED(
   5048       i,
   5049       IRExpr_ITE( binop(Iop_CmpNE8, get_ST_TAG(i), mkU8(0)),
   5050                   /* non-0 means full */
   5051                   mkQNaN64(),
   5052                   /* 0 means empty */
   5053                   value
   5054       )
   5055    );
   5056 }
   5057 
   5058 
   5059 /* Given i, generate an expression yielding 'ST(i)'. */
   5060 
static IRExpr* get_ST_UNCHECKED ( Int i )
{
   /* Read ST(i) (relative to FTOP) without consulting its tag. */
   IRRegArray* descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   return IRExpr_GetI( descr, get_ftop(), i );
}
   5066 
   5067 
   5068 /* Given i, generate an expression yielding
   5069   is_full(i) ? ST(i) : NaN
   5070 */
   5071 
   5072 static IRExpr* get_ST ( Int i )
   5073 {
   5074    return
   5075       IRExpr_ITE( binop(Iop_CmpNE8, get_ST_TAG(i), mkU8(0)),
   5076                   /* non-0 means full */
   5077                   get_ST_UNCHECKED(i),
   5078                   /* 0 means empty */
   5079                   mkQNaN64());
   5080 }
   5081 
   5082 
   5083 /* Given i, and some expression e, and a condition cond, generate IR
   5084    which has the same effect as put_ST(i,e) when cond is true and has
   5085    no effect when cond is false.  Given the lack of proper
   5086    if-then-else in the IR, this is pretty tricky.
   5087 */
   5088 
static void maybe_put_ST ( IRTemp cond, Int i, IRExpr* value )
{
   // new_tag = if cond then FULL else old_tag
   // new_val = if cond then (if old_tag==FULL then NaN else val)
   //                   else old_val
   //
   // Both new values are computed unconditionally and selected with
   // ITEs, since flat IR has no if-then-else statement.

   IRTemp old_tag = newTemp(Ity_I8);
   assign(old_tag, get_ST_TAG(i));
   IRTemp new_tag = newTemp(Ity_I8);
   assign(new_tag,
          IRExpr_ITE(mkexpr(cond), mkU8(1)/*FULL*/, mkexpr(old_tag)));

   IRTemp old_val = newTemp(Ity_F64);
   assign(old_val, get_ST_UNCHECKED(i));
   IRTemp new_val = newTemp(Ity_F64);
   assign(new_val,
          IRExpr_ITE(mkexpr(cond),
                     IRExpr_ITE(binop(Iop_CmpNE8, mkexpr(old_tag), mkU8(0)),
                                /* non-0 means full */
                                mkQNaN64(),
                                /* 0 means empty */
                                value),
                     mkexpr(old_val)));

   put_ST_UNCHECKED(i, mkexpr(new_val));
   // put_ST_UNCHECKED incorrectly sets tag(i) to always be FULL.  So
   // now set it to new_tag instead.
   put_ST_TAG(i, mkexpr(new_tag));
}
   5118 
   5119 /* Adjust FTOP downwards by one register. */
   5120 
static void fp_push ( void )
{
   /* The x87 stack grows downwards: decrementing FTOP creates a new
      ST(0).  The caller is expected to fill it. */
   put_ftop( binop(Iop_Sub32, get_ftop(), mkU32(1)) );
}
   5125 
   5126 /* Adjust FTOP downwards by one register when COND is 1:I1.  Else
   5127    don't change it. */
   5128 
static void maybe_fp_push ( IRTemp cond )
{
   /* 1Uto32 turns cond into 0 or 1, so FTOP is decremented exactly
      when cond holds, with no branching required. */
   put_ftop( binop(Iop_Sub32, get_ftop(), unop(Iop_1Uto32,mkexpr(cond))) );
}
   5133 
   5134 /* Adjust FTOP upwards by one register, and mark the vacated register
   5135    as empty.  */
   5136 
static void fp_pop ( void )
{
   /* Mark the outgoing ST(0) as empty before moving FTOP up. */
   put_ST_TAG(0, mkU8(0));
   put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
}
   5142 
   5143 /* Set the C2 bit of the FPU status register to e[0].  Assumes that
   5144    e[31:1] == 0.
   5145 */
   5146 static void set_C2 ( IRExpr* e )
   5147 {
   5148    IRExpr* cleared = binop(Iop_And64, get_C3210(), mkU64(~AMD64G_FC_MASK_C2));
   5149    put_C3210( binop(Iop_Or64,
   5150                     cleared,
   5151                     binop(Iop_Shl64, e, mkU8(AMD64G_FC_SHIFT_C2))) );
   5152 }
   5153 
   5154 /* Generate code to check that abs(d64) < 2^63 and is finite.  This is
   5155    used to do the range checks for FSIN, FCOS, FSINCOS and FPTAN.  The
   5156    test is simple, but the derivation of it is not so simple.
   5157 
   5158    The exponent field for an IEEE754 double is 11 bits.  That means it
   5159    can take values 0 through 0x7FF.  If the exponent has value 0x7FF,
   5160    the number is either a NaN or an Infinity and so is not finite.
   5161    Furthermore, a finite value of exactly 2^63 is the smallest value
   5162    that has exponent value 0x43E.  Hence, what we need to do is
   5163    extract the exponent, ignoring the sign bit and mantissa, and check
   5164    it is < 0x43E, or <= 0x43D.
   5165 
   5166    To make this easily applicable to 32- and 64-bit targets, a
   5167    roundabout approach is used.  First the number is converted to I64,
   5168    then the top 32 bits are taken.  Shifting them right by 20 bits
   5169    places the sign bit and exponent in the bottom 12 bits.  Anding
   5170    with 0x7FF gets rid of the sign bit, leaving just the exponent
   5171    available for comparison.
   5172 */
static IRTemp math_IS_TRIG_ARG_FINITE_AND_IN_RANGE ( IRTemp d64 )
{
   /* Reinterpret the F64's bits as an integer so the exponent field
      can be examined directly. */
   IRTemp i64 = newTemp(Ity_I64);
   assign(i64, unop(Iop_ReinterpF64asI64, mkexpr(d64)) );
   /* Take the top 32 bits, shift right 20 to put sign+exponent in
      bits 11..0, and mask with 0x7FF to drop the sign bit. */
   IRTemp exponent = newTemp(Ity_I32);
   assign(exponent,
          binop(Iop_And32,
                binop(Iop_Shr32, unop(Iop_64HIto32, mkexpr(i64)), mkU8(20)),
                mkU32(0x7FF)));
   /* exponent <= 0x43D  <=>  |d64| < 2^63 and finite (see the
      derivation in the comment above this function). */
   IRTemp in_range_and_finite = newTemp(Ity_I1);
   assign(in_range_and_finite,
          binop(Iop_CmpLE32U, mkexpr(exponent), mkU32(0x43D)));
   return in_range_and_finite;
}
   5187 
   5188 /* Invent a plausible-looking FPU status word value:
   5189       ((ftop & 7) << 11) | (c3210 & 0x4700)
   5190  */
static IRExpr* get_FPU_sw ( void )
{
   /* (ftop & 7) << 11 forms the TOP field; c3210 & 0x4700 keeps
      bit 14 (C3) and bits 10..8 (C2..C0).  All other status-word
      bits read as zero. */
   return
      unop(Iop_32to16,
           binop(Iop_Or32,
                 binop(Iop_Shl32,
                       binop(Iop_And32, get_ftop(), mkU32(7)),
                             mkU8(11)),
                       binop(Iop_And32, unop(Iop_64to32, get_C3210()),
                                        mkU32(0x4700))
      ));
}
   5203 
   5204 
   5205 /* Generate a dirty helper call that initialises the x87 state a la
   5206    FINIT.  If |guard| is NULL, it is done unconditionally.  Otherwise
   5207    |guard| is used as a guarding condition.
   5208 */
static void gen_FINIT_SEQUENCE ( IRExpr* guard )
{
   /* Uses dirty helper:
         void amd64g_do_FINIT ( VexGuestAMD64State* ) */
   IRDirty* d  = unsafeIRDirty_0_N (
                    0/*regparms*/,
                    "amd64g_dirtyhelper_FINIT",
                    &amd64g_dirtyhelper_FINIT,
                    mkIRExprVec_1( IRExpr_BBPTR() )
                 );

   /* declare we're writing guest state */
   d->nFxState = 5;
   vex_bzero(&d->fxState, sizeof(d->fxState));

   /* FTOP, the x87 stack pointer */
   d->fxState[0].fx     = Ifx_Write;
   d->fxState[0].offset = OFFB_FTOP;
   d->fxState[0].size   = sizeof(UInt);

   /* the 8 FP registers */
   d->fxState[1].fx     = Ifx_Write;
   d->fxState[1].offset = OFFB_FPREGS;
   d->fxState[1].size   = 8 * sizeof(ULong);

   /* the 8 FP tag bytes */
   d->fxState[2].fx     = Ifx_Write;
   d->fxState[2].offset = OFFB_FPTAGS;
   d->fxState[2].size   = 8 * sizeof(UChar);

   /* the rounding mode */
   d->fxState[3].fx     = Ifx_Write;
   d->fxState[3].offset = OFFB_FPROUND;
   d->fxState[3].size   = sizeof(ULong);

   /* the C3..C0 condition bits */
   d->fxState[4].fx     = Ifx_Write;
   d->fxState[4].offset = OFFB_FC3210;
   d->fxState[4].size   = sizeof(ULong);

   /* Make the call conditional if a guard was supplied. */
   if (guard)
      d->guard = guard;

   stmt( IRStmt_Dirty(d) );
}
   5249 
   5250 
   5251 /* ------------------------------------------------------- */
   5252 /* Given all that stack-mangling junk, we can now go ahead
   5253    and describe FP instructions.
   5254 */
   5255 
   5256 /* ST(0) = ST(0) `op` mem64/32(addr)
   5257    Need to check ST(0)'s tag on read, but not on write.
   5258 */
   5259 static
   5260 void fp_do_op_mem_ST_0 ( IRTemp addr, const HChar* op_txt, HChar* dis_buf,
   5261                          IROp op, Bool dbl )
   5262 {
   5263    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   5264    if (dbl) {
   5265       put_ST_UNCHECKED(0,
   5266          triop( op,
   5267                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5268                 get_ST(0),
   5269                 loadLE(Ity_F64,mkexpr(addr))
   5270          ));
   5271    } else {
   5272       put_ST_UNCHECKED(0,
   5273          triop( op,
   5274                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5275                 get_ST(0),
   5276                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr)))
   5277          ));
   5278    }
   5279 }
   5280 
   5281 
   5282 /* ST(0) = mem64/32(addr) `op` ST(0)
   5283    Need to check ST(0)'s tag on read, but not on write.
   5284 */
   5285 static
   5286 void fp_do_oprev_mem_ST_0 ( IRTemp addr, const HChar* op_txt, HChar* dis_buf,
   5287                             IROp op, Bool dbl )
   5288 {
   5289    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   5290    if (dbl) {
   5291       put_ST_UNCHECKED(0,
   5292          triop( op,
   5293                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5294                 loadLE(Ity_F64,mkexpr(addr)),
   5295                 get_ST(0)
   5296          ));
   5297    } else {
   5298       put_ST_UNCHECKED(0,
   5299          triop( op,
   5300                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5301                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr))),
   5302                 get_ST(0)
   5303          ));
   5304    }
   5305 }
   5306 
   5307 
   5308 /* ST(dst) = ST(dst) `op` ST(src).
   5309    Check dst and src tags when reading but not on write.
   5310 */
   5311 static
   5312 void fp_do_op_ST_ST ( const HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
   5313                       Bool pop_after )
   5314 {
   5315    DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"", st_src, st_dst );
   5316    put_ST_UNCHECKED(
   5317       st_dst,
   5318       triop( op,
   5319              get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5320              get_ST(st_dst),
   5321              get_ST(st_src) )
   5322    );
   5323    if (pop_after)
   5324       fp_pop();
   5325 }
   5326 
   5327 /* ST(dst) = ST(src) `op` ST(dst).
   5328    Check dst and src tags when reading but not on write.
   5329 */
   5330 static
   5331 void fp_do_oprev_ST_ST ( const HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
   5332                          Bool pop_after )
   5333 {
   5334    DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"", st_src, st_dst );
   5335    put_ST_UNCHECKED(
   5336       st_dst,
   5337       triop( op,
   5338              get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5339              get_ST(st_src),
   5340              get_ST(st_dst) )
   5341    );
   5342    if (pop_after)
   5343       fp_pop();
   5344 }
   5345 
   5346 /* %rflags(Z,P,C) = UCOMI( st(0), st(i) ) */
static void fp_do_ucomi_ST0_STi ( UInt i, Bool pop_after )
{
   DIP("fucomi%s %%st(0),%%st(%u)\n", pop_after ? "p" : "", i);
   /* This is a bit of a hack (and isn't really right).  It sets
      Z,P,C,O correctly, but forces A and S to zero, whereas the Intel
      documentation implies A and S are unchanged.
   */
   /* It's also fishy in that it is used both for COMIP and
      UCOMIP, and they aren't the same (although similar). */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   /* Masking the Iop_CmpF64 result with 0x45 keeps only the bits
      that feed Z, P and C in the COPY thunk. */
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop( Iop_And64,
                   unop( Iop_32Uto64,
                         binop(Iop_CmpF64, get_ST(0), get_ST(i))),
                   mkU64(0x45)
        )));
   if (pop_after)
      fp_pop();
}
   5368 
   5369 
   5370 /* returns
   5371    32to16( if e32 <s -32768 || e32 >s 32767 then -32768 else e32 )
   5372 */
static IRExpr* x87ishly_qnarrow_32_to_16 ( IRExpr* e32 )
{
   IRTemp t32 = newTemp(Ity_I32);
   assign( t32, e32 );
   /* Bias trick: adding 32768 (32-bit wraparound) maps the in-range
      window [-32768, 32767] onto [0, 65535]; after zero-extension to
      64 bits, a single unsigned compare against 65536 detects any
      out-of-range value, which then saturates to 0x8000. */
   return
      IRExpr_ITE(
         binop(Iop_CmpLT64U,
               unop(Iop_32Uto64,
                    binop(Iop_Add32, mkexpr(t32), mkU32(32768))),
               mkU64(65536)),
         unop(Iop_32to16, mkexpr(t32)),
         mkU16( 0x8000 ) );
}
   5386 
   5387 
   5388 static
   5389 ULong dis_FPU ( /*OUT*/Bool* decode_ok,
   5390                 const VexAbiInfo* vbi, Prefix pfx, Long delta )
   5391 {
   5392    Int    len;
   5393    UInt   r_src, r_dst;
   5394    HChar  dis_buf[50];
   5395    IRTemp t1, t2;
   5396 
   5397    /* On entry, delta points at the second byte of the insn (the modrm
   5398       byte).*/
   5399    UChar first_opcode = getUChar(delta-1);
   5400    UChar modrm        = getUChar(delta+0);
   5401 
   5402    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD8 opcodes +-+-+-+-+-+-+-+ */
   5403 
   5404    if (first_opcode == 0xD8) {
   5405       if (modrm < 0xC0) {
   5406 
   5407          /* bits 5,4,3 are an opcode extension, and the modRM also
   5408            specifies an address. */
   5409          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   5410          delta += len;
   5411 
   5412          switch (gregLO3ofRM(modrm)) {
   5413 
   5414             case 0: /* FADD single-real */
   5415                fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, False );
   5416                break;
   5417 
   5418             case 1: /* FMUL single-real */
   5419                fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, False );
   5420                break;
   5421 
   5422             case 2: /* FCOM single-real */
   5423                DIP("fcoms %s\n", dis_buf);
   5424                /* This forces C1 to zero, which isn't right. */
   5425                /* The AMD documentation suggests that forcing C1 to
   5426                   zero is correct (Eliot Moss) */
   5427                put_C3210(
   5428                    unop( Iop_32Uto64,
   5429                        binop( Iop_And32,
   5430                               binop(Iop_Shl32,
   5431                                     binop(Iop_CmpF64,
   5432                                           get_ST(0),
   5433                                           unop(Iop_F32toF64,
   5434                                                loadLE(Ity_F32,mkexpr(addr)))),
   5435                                     mkU8(8)),
   5436                               mkU32(0x4500)
   5437                    )));
   5438                break;
   5439 
   5440             case 3: /* FCOMP single-real */
   5441                /* The AMD documentation suggests that forcing C1 to
   5442                   zero is correct (Eliot Moss) */
   5443                DIP("fcomps %s\n", dis_buf);
   5444                /* This forces C1 to zero, which isn't right. */
   5445                put_C3210(
   5446                    unop( Iop_32Uto64,
   5447                        binop( Iop_And32,
   5448                               binop(Iop_Shl32,
   5449                                     binop(Iop_CmpF64,
   5450                                           get_ST(0),
   5451                                           unop(Iop_F32toF64,
   5452                                                loadLE(Ity_F32,mkexpr(addr)))),
   5453                                     mkU8(8)),
   5454                               mkU32(0x4500)
   5455                    )));
   5456                fp_pop();
   5457                break;
   5458 
   5459             case 4: /* FSUB single-real */
   5460                fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, False );
   5461                break;
   5462 
   5463             case 5: /* FSUBR single-real */
   5464                fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, False );
   5465                break;
   5466 
   5467             case 6: /* FDIV single-real */
   5468                fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, False );
   5469                break;
   5470 
   5471             case 7: /* FDIVR single-real */
   5472                fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, False );
   5473                break;
   5474 
   5475             default:
   5476                vex_printf("unhandled opc_aux = 0x%2x\n",
   5477                           (UInt)gregLO3ofRM(modrm));
   5478                vex_printf("first_opcode == 0xD8\n");
   5479                goto decode_fail;
   5480          }
   5481       } else {
   5482          delta++;
   5483          switch (modrm) {
   5484 
   5485             case 0xC0 ... 0xC7: /* FADD %st(?),%st(0) */
   5486                fp_do_op_ST_ST ( "add", Iop_AddF64, modrm - 0xC0, 0, False );
   5487                break;
   5488 
   5489             case 0xC8 ... 0xCF: /* FMUL %st(?),%st(0) */
   5490                fp_do_op_ST_ST ( "mul", Iop_MulF64, modrm - 0xC8, 0, False );
   5491                break;
   5492 
   5493             /* Dunno if this is right */
   5494             case 0xD0 ... 0xD7: /* FCOM %st(?),%st(0) */
   5495                r_dst = (UInt)modrm - 0xD0;
   5496                DIP("fcom %%st(0),%%st(%u)\n", r_dst);
   5497                /* This forces C1 to zero, which isn't right. */
   5498                put_C3210(
   5499                    unop(Iop_32Uto64,
   5500                    binop( Iop_And32,
   5501                           binop(Iop_Shl32,
   5502                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   5503                                 mkU8(8)),
   5504                           mkU32(0x4500)
   5505                    )));
   5506                break;
   5507 
   5508             /* Dunno if this is right */
   5509             case 0xD8 ... 0xDF: /* FCOMP %st(?),%st(0) */
   5510                r_dst = (UInt)modrm - 0xD8;
   5511                DIP("fcomp %%st(0),%%st(%u)\n", r_dst);
   5512                /* This forces C1 to zero, which isn't right. */
   5513                put_C3210(
   5514                    unop(Iop_32Uto64,
   5515                    binop( Iop_And32,
   5516                           binop(Iop_Shl32,
   5517                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   5518                                 mkU8(8)),
   5519                           mkU32(0x4500)
   5520                    )));
   5521                fp_pop();
   5522                break;
   5523 
   5524             case 0xE0 ... 0xE7: /* FSUB %st(?),%st(0) */
   5525                fp_do_op_ST_ST ( "sub", Iop_SubF64, modrm - 0xE0, 0, False );
   5526                break;
   5527 
   5528             case 0xE8 ... 0xEF: /* FSUBR %st(?),%st(0) */
   5529                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, modrm - 0xE8, 0, False );
   5530                break;
   5531 
   5532             case 0xF0 ... 0xF7: /* FDIV %st(?),%st(0) */
   5533                fp_do_op_ST_ST ( "div", Iop_DivF64, modrm - 0xF0, 0, False );
   5534                break;
   5535 
   5536             case 0xF8 ... 0xFF: /* FDIVR %st(?),%st(0) */
   5537                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, modrm - 0xF8, 0, False );
   5538                break;
   5539 
   5540             default:
   5541                goto decode_fail;
   5542          }
   5543       }
   5544    }
   5545 
   5546    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD9 opcodes +-+-+-+-+-+-+-+ */
   5547    else
   5548    if (first_opcode == 0xD9) {
   5549       if (modrm < 0xC0) {
   5550 
   5551          /* bits 5,4,3 are an opcode extension, and the modRM also
   5552             specifies an address. */
   5553          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   5554          delta += len;
   5555 
   5556          switch (gregLO3ofRM(modrm)) {
   5557 
   5558             case 0: /* FLD single-real */
   5559                DIP("flds %s\n", dis_buf);
   5560                fp_push();
   5561                put_ST(0, unop(Iop_F32toF64,
   5562                               loadLE(Ity_F32, mkexpr(addr))));
   5563                break;
   5564 
   5565             case 2: /* FST single-real */
   5566                DIP("fsts %s\n", dis_buf);
   5567                storeLE(mkexpr(addr),
   5568                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
   5569                break;
   5570 
   5571             case 3: /* FSTP single-real */
   5572                DIP("fstps %s\n", dis_buf);
   5573                storeLE(mkexpr(addr),
   5574                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
   5575                fp_pop();
   5576                break;
   5577 
   5578             case 4: { /* FLDENV m28 */
               /* Uses dirty helper:
                     VexEmNote amd64g_dirtyhelper_FLDENV ( VexGuestAMD64State*, HWord ) */
   5581                IRTemp    ew = newTemp(Ity_I32);
   5582                IRTemp   w64 = newTemp(Ity_I64);
   5583                IRDirty*   d = unsafeIRDirty_0_N (
   5584                                  0/*regparms*/,
   5585                                  "amd64g_dirtyhelper_FLDENV",
   5586                                  &amd64g_dirtyhelper_FLDENV,
   5587                                  mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   5588                               );
   5589                d->tmp       = w64;
   5590                /* declare we're reading memory */
   5591                d->mFx   = Ifx_Read;
   5592                d->mAddr = mkexpr(addr);
   5593                d->mSize = 28;
   5594 
   5595                /* declare we're writing guest state */
   5596                d->nFxState = 4;
   5597                vex_bzero(&d->fxState, sizeof(d->fxState));
   5598 
   5599                d->fxState[0].fx     = Ifx_Write;
   5600                d->fxState[0].offset = OFFB_FTOP;
   5601                d->fxState[0].size   = sizeof(UInt);
   5602 
   5603                d->fxState[1].fx     = Ifx_Write;
   5604                d->fxState[1].offset = OFFB_FPTAGS;
   5605                d->fxState[1].size   = 8 * sizeof(UChar);
   5606 
   5607                d->fxState[2].fx     = Ifx_Write;
   5608                d->fxState[2].offset = OFFB_FPROUND;
   5609                d->fxState[2].size   = sizeof(ULong);
   5610 
   5611                d->fxState[3].fx     = Ifx_Write;
   5612                d->fxState[3].offset = OFFB_FC3210;
   5613                d->fxState[3].size   = sizeof(ULong);
   5614 
   5615                stmt( IRStmt_Dirty(d) );
   5616 
   5617                /* ew contains any emulation warning we may need to
   5618                   issue.  If needed, side-exit to the next insn,
   5619                   reporting the warning, so that Valgrind's dispatcher
   5620                   sees the warning. */
   5621                assign(ew, unop(Iop_64to32,mkexpr(w64)) );
   5622                put_emwarn( mkexpr(ew) );
   5623                stmt(
   5624                   IRStmt_Exit(
   5625                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   5626                      Ijk_EmWarn,
   5627                      IRConst_U64( guest_RIP_bbstart+delta ),
   5628                      OFFB_RIP
   5629                   )
   5630                );
   5631 
   5632                DIP("fldenv %s\n", dis_buf);
   5633                break;
   5634             }
   5635 
   5636             case 5: {/* FLDCW */
   5637                /* The only thing we observe in the control word is the
   5638                   rounding mode.  Therefore, pass the 16-bit value
   5639                   (x87 native-format control word) to a clean helper,
   5640                   getting back a 64-bit value, the lower half of which
   5641                   is the FPROUND value to store, and the upper half of
   5642                   which is the emulation-warning token which may be
   5643                   generated.
   5644                */
               /* ULong amd64g_check_fldcw ( ULong ); */
   5646                IRTemp t64 = newTemp(Ity_I64);
   5647                IRTemp ew = newTemp(Ity_I32);
   5648                DIP("fldcw %s\n", dis_buf);
   5649                assign( t64, mkIRExprCCall(
   5650                                Ity_I64, 0/*regparms*/,
   5651                                "amd64g_check_fldcw",
   5652                                &amd64g_check_fldcw,
   5653                                mkIRExprVec_1(
   5654                                   unop( Iop_16Uto64,
   5655                                         loadLE(Ity_I16, mkexpr(addr)))
   5656                                )
   5657                             )
   5658                      );
   5659 
   5660                put_fpround( unop(Iop_64to32, mkexpr(t64)) );
   5661                assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
   5662                put_emwarn( mkexpr(ew) );
   5663                /* Finally, if an emulation warning was reported,
   5664                   side-exit to the next insn, reporting the warning,
   5665                   so that Valgrind's dispatcher sees the warning. */
   5666                stmt(
   5667                   IRStmt_Exit(
   5668                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   5669                      Ijk_EmWarn,
   5670                      IRConst_U64( guest_RIP_bbstart+delta ),
   5671                      OFFB_RIP
   5672                   )
   5673                );
   5674                break;
   5675             }
   5676 
   5677             case 6: { /* FNSTENV m28 */
   5678                /* Uses dirty helper:
   5679                      void amd64g_do_FSTENV ( VexGuestAMD64State*, HWord ) */
   5680                IRDirty* d = unsafeIRDirty_0_N (
   5681                                0/*regparms*/,
   5682                                "amd64g_dirtyhelper_FSTENV",
   5683                                &amd64g_dirtyhelper_FSTENV,
   5684                                mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   5685                             );
   5686                /* declare we're writing memory */
   5687                d->mFx   = Ifx_Write;
   5688                d->mAddr = mkexpr(addr);
   5689                d->mSize = 28;
   5690 
   5691                /* declare we're reading guest state */
   5692                d->nFxState = 4;
   5693                vex_bzero(&d->fxState, sizeof(d->fxState));
   5694 
   5695                d->fxState[0].fx     = Ifx_Read;
   5696                d->fxState[0].offset = OFFB_FTOP;
   5697                d->fxState[0].size   = sizeof(UInt);
   5698 
   5699                d->fxState[1].fx     = Ifx_Read;
   5700                d->fxState[1].offset = OFFB_FPTAGS;
   5701                d->fxState[1].size   = 8 * sizeof(UChar);
   5702 
   5703                d->fxState[2].fx     = Ifx_Read;
   5704                d->fxState[2].offset = OFFB_FPROUND;
   5705                d->fxState[2].size   = sizeof(ULong);
   5706 
   5707                d->fxState[3].fx     = Ifx_Read;
   5708                d->fxState[3].offset = OFFB_FC3210;
   5709                d->fxState[3].size   = sizeof(ULong);
   5710 
   5711                stmt( IRStmt_Dirty(d) );
   5712 
   5713                DIP("fnstenv %s\n", dis_buf);
   5714                break;
   5715             }
   5716 
   5717             case 7: /* FNSTCW */
   5718                /* Fake up a native x87 FPU control word.  The only
   5719                   thing it depends on is FPROUND[1:0], so call a clean
   5720                   helper to cook it up. */
   5721                /* ULong amd64g_create_fpucw ( ULong fpround ) */
   5722                DIP("fnstcw %s\n", dis_buf);
   5723                storeLE(
   5724                   mkexpr(addr),
   5725                   unop( Iop_64to16,
   5726                         mkIRExprCCall(
   5727                            Ity_I64, 0/*regp*/,
   5728                            "amd64g_create_fpucw", &amd64g_create_fpucw,
   5729                            mkIRExprVec_1( unop(Iop_32Uto64, get_fpround()) )
   5730                         )
   5731                   )
   5732                );
   5733                break;
   5734 
   5735             default:
   5736                vex_printf("unhandled opc_aux = 0x%2x\n",
   5737                           (UInt)gregLO3ofRM(modrm));
   5738                vex_printf("first_opcode == 0xD9\n");
   5739                goto decode_fail;
   5740          }
   5741 
   5742       } else {
   5743          delta++;
   5744          switch (modrm) {
   5745 
   5746             case 0xC0 ... 0xC7: /* FLD %st(?) */
   5747                r_src = (UInt)modrm - 0xC0;
   5748                DIP("fld %%st(%u)\n", r_src);
   5749                t1 = newTemp(Ity_F64);
   5750                assign(t1, get_ST(r_src));
   5751                fp_push();
   5752                put_ST(0, mkexpr(t1));
   5753                break;
   5754 
   5755             case 0xC8 ... 0xCF: /* FXCH %st(?) */
   5756                r_src = (UInt)modrm - 0xC8;
   5757                DIP("fxch %%st(%u)\n", r_src);
   5758                t1 = newTemp(Ity_F64);
   5759                t2 = newTemp(Ity_F64);
   5760                assign(t1, get_ST(0));
   5761                assign(t2, get_ST(r_src));
   5762                put_ST_UNCHECKED(0, mkexpr(t2));
   5763                put_ST_UNCHECKED(r_src, mkexpr(t1));
   5764                break;
   5765 
   5766             case 0xE0: /* FCHS */
   5767                DIP("fchs\n");
   5768                put_ST_UNCHECKED(0, unop(Iop_NegF64, get_ST(0)));
   5769                break;
   5770 
   5771             case 0xE1: /* FABS */
   5772                DIP("fabs\n");
   5773                put_ST_UNCHECKED(0, unop(Iop_AbsF64, get_ST(0)));
   5774                break;
   5775 
   5776             case 0xE5: { /* FXAM */
   5777                /* This is an interesting one.  It examines %st(0),
   5778                   regardless of whether the tag says it's empty or not.
   5779                   Here, just pass both the tag (in our format) and the
   5780                   value (as a double, actually a ULong) to a helper
   5781                   function. */
   5782                IRExpr** args
   5783                   = mkIRExprVec_2( unop(Iop_8Uto64, get_ST_TAG(0)),
   5784                                    unop(Iop_ReinterpF64asI64,
   5785                                         get_ST_UNCHECKED(0)) );
   5786                put_C3210(mkIRExprCCall(
   5787                             Ity_I64,
   5788                             0/*regparm*/,
   5789                             "amd64g_calculate_FXAM", &amd64g_calculate_FXAM,
   5790                             args
   5791                         ));
   5792                DIP("fxam\n");
   5793                break;
   5794             }
   5795 
   5796             case 0xE8: /* FLD1 */
   5797                DIP("fld1\n");
   5798                fp_push();
   5799                /* put_ST(0, IRExpr_Const(IRConst_F64(1.0))); */
   5800                put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff0000000000000ULL)));
   5801                break;
   5802 
   5803             case 0xE9: /* FLDL2T */
   5804                DIP("fldl2t\n");
   5805                fp_push();
   5806                /* put_ST(0, IRExpr_Const(IRConst_F64(3.32192809488736234781))); */
   5807                put_ST(0, IRExpr_Const(IRConst_F64i(0x400a934f0979a371ULL)));
   5808                break;
   5809 
   5810             case 0xEA: /* FLDL2E */
   5811                DIP("fldl2e\n");
   5812                fp_push();
   5813                /* put_ST(0, IRExpr_Const(IRConst_F64(1.44269504088896340739))); */
   5814                put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff71547652b82feULL)));
   5815                break;
   5816 
   5817             case 0xEB: /* FLDPI */
   5818                DIP("fldpi\n");
   5819                fp_push();
   5820                /* put_ST(0, IRExpr_Const(IRConst_F64(3.14159265358979323851))); */
   5821                put_ST(0, IRExpr_Const(IRConst_F64i(0x400921fb54442d18ULL)));
   5822                break;
   5823 
   5824             case 0xEC: /* FLDLG2 */
   5825                DIP("fldlg2\n");
   5826                fp_push();
   5827                /* put_ST(0, IRExpr_Const(IRConst_F64(0.301029995663981143))); */
   5828                put_ST(0, IRExpr_Const(IRConst_F64i(0x3fd34413509f79ffULL)));
   5829                break;
   5830 
   5831             case 0xED: /* FLDLN2 */
   5832                DIP("fldln2\n");
   5833                fp_push();
   5834                /* put_ST(0, IRExpr_Const(IRConst_F64(0.69314718055994530942))); */
   5835                put_ST(0, IRExpr_Const(IRConst_F64i(0x3fe62e42fefa39efULL)));
   5836                break;
   5837 
   5838             case 0xEE: /* FLDZ */
   5839                DIP("fldz\n");
   5840                fp_push();
   5841                /* put_ST(0, IRExpr_Const(IRConst_F64(0.0))); */
   5842                put_ST(0, IRExpr_Const(IRConst_F64i(0x0000000000000000ULL)));
   5843                break;
   5844 
   5845             case 0xF0: /* F2XM1 */
   5846                DIP("f2xm1\n");
   5847                put_ST_UNCHECKED(0,
   5848                   binop(Iop_2xm1F64,
   5849                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5850                         get_ST(0)));
   5851                break;
   5852 
   5853             case 0xF1: /* FYL2X */
   5854                DIP("fyl2x\n");
   5855                put_ST_UNCHECKED(1,
   5856                   triop(Iop_Yl2xF64,
   5857                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5858                         get_ST(1),
   5859                         get_ST(0)));
   5860                fp_pop();
   5861                break;
   5862 
   5863             case 0xF2: { /* FPTAN */
   5864                DIP("fptan\n");
   5865                IRTemp argD = newTemp(Ity_F64);
   5866                assign(argD, get_ST(0));
   5867                IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
   5868                IRTemp resD = newTemp(Ity_F64);
   5869                assign(resD,
   5870                   IRExpr_ITE(
   5871                      mkexpr(argOK),
   5872                      binop(Iop_TanF64,
   5873                            get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5874                            mkexpr(argD)),
   5875                      mkexpr(argD))
   5876                );
   5877                put_ST_UNCHECKED(0, mkexpr(resD));
   5878                /* Conditionally push 1.0 on the stack, if the arg is
   5879                   in range */
   5880                maybe_fp_push(argOK);
   5881                maybe_put_ST(argOK, 0,
   5882                             IRExpr_Const(IRConst_F64(1.0)));
   5883                set_C2( binop(Iop_Xor64,
   5884                              unop(Iop_1Uto64, mkexpr(argOK)),
   5885                              mkU64(1)) );
   5886                break;
   5887             }
   5888 
   5889             case 0xF3: /* FPATAN */
   5890                DIP("fpatan\n");
   5891                put_ST_UNCHECKED(1,
   5892                   triop(Iop_AtanF64,
   5893                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5894                         get_ST(1),
   5895                         get_ST(0)));
   5896                fp_pop();
   5897                break;
   5898 
   5899             case 0xF4: { /* FXTRACT */
   5900                IRTemp argF = newTemp(Ity_F64);
   5901                IRTemp sigF = newTemp(Ity_F64);
   5902                IRTemp expF = newTemp(Ity_F64);
   5903                IRTemp argI = newTemp(Ity_I64);
   5904                IRTemp sigI = newTemp(Ity_I64);
   5905                IRTemp expI = newTemp(Ity_I64);
   5906                DIP("fxtract\n");
   5907                assign( argF, get_ST(0) );
   5908                assign( argI, unop(Iop_ReinterpF64asI64, mkexpr(argF)));
   5909                assign( sigI,
   5910                        mkIRExprCCall(
   5911                           Ity_I64, 0/*regparms*/,
   5912                           "x86amd64g_calculate_FXTRACT",
   5913                           &x86amd64g_calculate_FXTRACT,
   5914                           mkIRExprVec_2( mkexpr(argI),
   5915                                          mkIRExpr_HWord(0)/*sig*/ ))
   5916                );
   5917                assign( expI,
   5918                        mkIRExprCCall(
   5919                           Ity_I64, 0/*regparms*/,
   5920                           "x86amd64g_calculate_FXTRACT",
   5921                           &x86amd64g_calculate_FXTRACT,
   5922                           mkIRExprVec_2( mkexpr(argI),
   5923                                          mkIRExpr_HWord(1)/*exp*/ ))
   5924                );
   5925                assign( sigF, unop(Iop_ReinterpI64asF64, mkexpr(sigI)) );
   5926                assign( expF, unop(Iop_ReinterpI64asF64, mkexpr(expI)) );
   5927                /* exponent */
   5928                put_ST_UNCHECKED(0, mkexpr(expF) );
   5929                fp_push();
   5930                /* significand */
   5931                put_ST(0, mkexpr(sigF) );
   5932                break;
   5933             }
   5934 
   5935             case 0xF5: { /* FPREM1 -- IEEE compliant */
   5936                IRTemp a1 = newTemp(Ity_F64);
   5937                IRTemp a2 = newTemp(Ity_F64);
   5938                DIP("fprem1\n");
   5939                /* Do FPREM1 twice, once to get the remainder, and once
   5940                   to get the C3210 flag values. */
   5941                assign( a1, get_ST(0) );
   5942                assign( a2, get_ST(1) );
   5943                put_ST_UNCHECKED(0,
   5944                   triop(Iop_PRem1F64,
   5945                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5946                         mkexpr(a1),
   5947                         mkexpr(a2)));
   5948                put_C3210(
   5949                   unop(Iop_32Uto64,
   5950                   triop(Iop_PRem1C3210F64,
   5951                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5952                         mkexpr(a1),
   5953                         mkexpr(a2)) ));
   5954                break;
   5955             }
   5956 
   5957             case 0xF7: /* FINCSTP */
   5958                DIP("fincstp\n");
   5959                put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
   5960                break;
   5961 
   5962             case 0xF8: { /* FPREM -- not IEEE compliant */
   5963                IRTemp a1 = newTemp(Ity_F64);
   5964                IRTemp a2 = newTemp(Ity_F64);
   5965                DIP("fprem\n");
   5966                /* Do FPREM twice, once to get the remainder, and once
   5967                   to get the C3210 flag values. */
   5968                assign( a1, get_ST(0) );
   5969                assign( a2, get_ST(1) );
   5970                put_ST_UNCHECKED(0,
   5971                   triop(Iop_PRemF64,
   5972                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5973                         mkexpr(a1),
   5974                         mkexpr(a2)));
   5975                put_C3210(
   5976                   unop(Iop_32Uto64,
   5977                   triop(Iop_PRemC3210F64,
   5978                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5979                         mkexpr(a1),
   5980                         mkexpr(a2)) ));
   5981                break;
   5982             }
   5983 
   5984             case 0xF9: /* FYL2XP1 */
   5985                DIP("fyl2xp1\n");
   5986                put_ST_UNCHECKED(1,
   5987                   triop(Iop_Yl2xp1F64,
   5988                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5989                         get_ST(1),
   5990                         get_ST(0)));
   5991                fp_pop();
   5992                break;
   5993 
   5994             case 0xFA: /* FSQRT */
   5995                DIP("fsqrt\n");
   5996                put_ST_UNCHECKED(0,
   5997                   binop(Iop_SqrtF64,
   5998                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5999                         get_ST(0)));
   6000                break;
   6001 
   6002             case 0xFB: { /* FSINCOS */
   6003                DIP("fsincos\n");
   6004                IRTemp argD = newTemp(Ity_F64);
   6005                assign(argD, get_ST(0));
   6006                IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
   6007                IRTemp resD = newTemp(Ity_F64);
   6008                assign(resD,
   6009                   IRExpr_ITE(
   6010                      mkexpr(argOK),
   6011                      binop(Iop_SinF64,
   6012                            get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6013                            mkexpr(argD)),
   6014                      mkexpr(argD))
   6015                );
   6016                put_ST_UNCHECKED(0, mkexpr(resD));
   6017                /* Conditionally push the cos value on the stack, if
   6018                   the arg is in range */
   6019                maybe_fp_push(argOK);
   6020                maybe_put_ST(argOK, 0,
   6021                   binop(Iop_CosF64,
   6022                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6023                         mkexpr(argD)));
   6024                set_C2( binop(Iop_Xor64,
   6025                              unop(Iop_1Uto64, mkexpr(argOK)),
   6026                              mkU64(1)) );
   6027                break;
   6028             }
   6029 
   6030             case 0xFC: /* FRNDINT */
   6031                DIP("frndint\n");
   6032                put_ST_UNCHECKED(0,
   6033                   binop(Iop_RoundF64toInt, get_roundingmode(), get_ST(0)) );
   6034                break;
   6035 
   6036             case 0xFD: /* FSCALE */
   6037                DIP("fscale\n");
   6038                put_ST_UNCHECKED(0,
   6039                   triop(Iop_ScaleF64,
   6040                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6041                         get_ST(0),
   6042                         get_ST(1)));
   6043                break;
   6044 
   6045             case 0xFE:   /* FSIN */
   6046             case 0xFF: { /* FCOS */
   6047                Bool isSIN = modrm == 0xFE;
   6048                DIP("%s\n", isSIN ? "fsin" : "fcos");
   6049                IRTemp argD = newTemp(Ity_F64);
   6050                assign(argD, get_ST(0));
   6051                IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
   6052                IRTemp resD = newTemp(Ity_F64);
   6053                assign(resD,
   6054                   IRExpr_ITE(
   6055                      mkexpr(argOK),
   6056                      binop(isSIN ? Iop_SinF64 : Iop_CosF64,
   6057                            get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6058                            mkexpr(argD)),
   6059                      mkexpr(argD))
   6060                );
   6061                put_ST_UNCHECKED(0, mkexpr(resD));
   6062                set_C2( binop(Iop_Xor64,
   6063                              unop(Iop_1Uto64, mkexpr(argOK)),
   6064                              mkU64(1)) );
   6065                break;
   6066             }
   6067 
   6068             default:
   6069                goto decode_fail;
   6070          }
   6071       }
   6072    }
   6073 
   6074    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDA opcodes +-+-+-+-+-+-+-+ */
   6075    else
   6076    if (first_opcode == 0xDA) {
   6077 
   6078       if (modrm < 0xC0) {
   6079 
   6080          /* bits 5,4,3 are an opcode extension, and the modRM also
   6081             specifies an address. */
   6082          IROp   fop;
   6083          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6084          delta += len;
   6085          switch (gregLO3ofRM(modrm)) {
   6086 
   6087             case 0: /* FIADD m32int */ /* ST(0) += m32int */
   6088                DIP("fiaddl %s\n", dis_buf);
   6089                fop = Iop_AddF64;
   6090                goto do_fop_m32;
   6091 
   6092             case 1: /* FIMUL m32int */ /* ST(0) *= m32int */
   6093                DIP("fimull %s\n", dis_buf);
   6094                fop = Iop_MulF64;
   6095                goto do_fop_m32;
   6096 
   6097             case 4: /* FISUB m32int */ /* ST(0) -= m32int */
   6098                DIP("fisubl %s\n", dis_buf);
   6099                fop = Iop_SubF64;
   6100                goto do_fop_m32;
   6101 
   6102             case 5: /* FISUBR m32int */ /* ST(0) = m32int - ST(0) */
   6103                DIP("fisubrl %s\n", dis_buf);
   6104                fop = Iop_SubF64;
   6105                goto do_foprev_m32;
   6106 
   6107             case 6: /* FIDIV m32int */ /* ST(0) /= m32int */
   6108                DIP("fisubl %s\n", dis_buf);
   6109                fop = Iop_DivF64;
   6110                goto do_fop_m32;
   6111 
   6112             case 7: /* FIDIVR m32int */ /* ST(0) = m32int / ST(0) */
   6113                DIP("fidivrl %s\n", dis_buf);
   6114                fop = Iop_DivF64;
   6115                goto do_foprev_m32;
   6116 
   6117             do_fop_m32:
   6118                put_ST_UNCHECKED(0,
   6119                   triop(fop,
   6120                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6121                         get_ST(0),
   6122                         unop(Iop_I32StoF64,
   6123                              loadLE(Ity_I32, mkexpr(addr)))));
   6124                break;
   6125 
   6126             do_foprev_m32:
   6127                put_ST_UNCHECKED(0,
   6128                   triop(fop,
   6129                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6130                         unop(Iop_I32StoF64,
   6131                              loadLE(Ity_I32, mkexpr(addr))),
   6132                         get_ST(0)));
   6133                break;
   6134 
   6135             default:
   6136                vex_printf("unhandled opc_aux = 0x%2x\n",
   6137                           (UInt)gregLO3ofRM(modrm));
   6138                vex_printf("first_opcode == 0xDA\n");
   6139                goto decode_fail;
   6140          }
   6141 
   6142       } else {
   6143 
   6144          delta++;
   6145          switch (modrm) {
   6146 
   6147             case 0xC0 ... 0xC7: /* FCMOVB ST(i), ST(0) */
   6148                r_src = (UInt)modrm - 0xC0;
   6149                DIP("fcmovb %%st(%u), %%st(0)\n", r_src);
   6150                put_ST_UNCHECKED(0,
   6151                                 IRExpr_ITE(
   6152                                     mk_amd64g_calculate_condition(AMD64CondB),
   6153                                     get_ST(r_src), get_ST(0)) );
   6154                break;
   6155 
   6156             case 0xC8 ... 0xCF: /* FCMOVE(Z) ST(i), ST(0) */
   6157                r_src = (UInt)modrm - 0xC8;
   6158                DIP("fcmovz %%st(%u), %%st(0)\n", r_src);
   6159                put_ST_UNCHECKED(0,
   6160                                 IRExpr_ITE(
   6161                                     mk_amd64g_calculate_condition(AMD64CondZ),
   6162                                     get_ST(r_src), get_ST(0)) );
   6163                break;
   6164 
   6165             case 0xD0 ... 0xD7: /* FCMOVBE ST(i), ST(0) */
   6166                r_src = (UInt)modrm - 0xD0;
   6167                DIP("fcmovbe %%st(%u), %%st(0)\n", r_src);
   6168                put_ST_UNCHECKED(0,
   6169                                 IRExpr_ITE(
   6170                                     mk_amd64g_calculate_condition(AMD64CondBE),
   6171                                     get_ST(r_src), get_ST(0)) );
   6172                break;
   6173 
   6174             case 0xD8 ... 0xDF: /* FCMOVU ST(i), ST(0) */
   6175                r_src = (UInt)modrm - 0xD8;
   6176                DIP("fcmovu %%st(%u), %%st(0)\n", r_src);
   6177                put_ST_UNCHECKED(0,
   6178                                 IRExpr_ITE(
   6179                                     mk_amd64g_calculate_condition(AMD64CondP),
   6180                                     get_ST(r_src), get_ST(0)) );
   6181                break;
   6182 
   6183             case 0xE9: /* FUCOMPP %st(0),%st(1) */
   6184                DIP("fucompp %%st(0),%%st(1)\n");
   6185                /* This forces C1 to zero, which isn't right. */
   6186                put_C3210(
   6187                    unop(Iop_32Uto64,
   6188                    binop( Iop_And32,
   6189                           binop(Iop_Shl32,
   6190                                 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
   6191                                 mkU8(8)),
   6192                           mkU32(0x4500)
   6193                    )));
   6194                fp_pop();
   6195                fp_pop();
   6196                break;
   6197 
   6198             default:
   6199                goto decode_fail;
   6200          }
   6201 
   6202       }
   6203    }
   6204 
   6205    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDB opcodes +-+-+-+-+-+-+-+ */
   6206    else
   6207    if (first_opcode == 0xDB) {
   6208       if (modrm < 0xC0) {
   6209 
   6210          /* bits 5,4,3 are an opcode extension, and the modRM also
   6211             specifies an address. */
   6212          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6213          delta += len;
   6214 
   6215          switch (gregLO3ofRM(modrm)) {
   6216 
   6217             case 0: /* FILD m32int */
   6218                DIP("fildl %s\n", dis_buf);
   6219                fp_push();
   6220                put_ST(0, unop(Iop_I32StoF64,
   6221                               loadLE(Ity_I32, mkexpr(addr))));
   6222                break;
   6223 
   6224             case 1: /* FISTTPL m32 (SSE3) */
   6225                DIP("fisttpl %s\n", dis_buf);
   6226                storeLE( mkexpr(addr),
   6227                         binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) );
   6228                fp_pop();
   6229                break;
   6230 
   6231             case 2: /* FIST m32 */
   6232                DIP("fistl %s\n", dis_buf);
   6233                storeLE( mkexpr(addr),
   6234                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
   6235                break;
   6236 
   6237             case 3: /* FISTP m32 */
   6238                DIP("fistpl %s\n", dis_buf);
   6239                storeLE( mkexpr(addr),
   6240                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
   6241                fp_pop();
   6242                break;
   6243 
   6244             case 5: { /* FLD extended-real */
   6245                /* Uses dirty helper:
   6246                      ULong amd64g_loadF80le ( ULong )
   6247                   addr holds the address.  First, do a dirty call to
   6248                   get hold of the data. */
   6249                IRTemp   val  = newTemp(Ity_I64);
   6250                IRExpr** args = mkIRExprVec_1 ( mkexpr(addr) );
   6251 
   6252                IRDirty* d = unsafeIRDirty_1_N (
   6253                                val,
   6254                                0/*regparms*/,
   6255                                "amd64g_dirtyhelper_loadF80le",
   6256                                &amd64g_dirtyhelper_loadF80le,
   6257                                args
   6258                             );
   6259                /* declare that we're reading memory */
   6260                d->mFx   = Ifx_Read;
   6261                d->mAddr = mkexpr(addr);
   6262                d->mSize = 10;
   6263 
   6264                /* execute the dirty call, dumping the result in val. */
   6265                stmt( IRStmt_Dirty(d) );
   6266                fp_push();
   6267                put_ST(0, unop(Iop_ReinterpI64asF64, mkexpr(val)));
   6268 
   6269                DIP("fldt %s\n", dis_buf);
   6270                break;
   6271             }
   6272 
   6273             case 7: { /* FSTP extended-real */
   6274                /* Uses dirty helper:
   6275                      void amd64g_storeF80le ( ULong addr, ULong data )
   6276                */
   6277                IRExpr** args
   6278                   = mkIRExprVec_2( mkexpr(addr),
   6279                                    unop(Iop_ReinterpF64asI64, get_ST(0)) );
   6280 
   6281                IRDirty* d = unsafeIRDirty_0_N (
   6282                                0/*regparms*/,
   6283                                "amd64g_dirtyhelper_storeF80le",
   6284                                &amd64g_dirtyhelper_storeF80le,
   6285                                args
   6286                             );
   6287                /* declare we're writing memory */
   6288                d->mFx   = Ifx_Write;
   6289                d->mAddr = mkexpr(addr);
   6290                d->mSize = 10;
   6291 
   6292                /* execute the dirty call. */
   6293                stmt( IRStmt_Dirty(d) );
   6294                fp_pop();
   6295 
   6296                DIP("fstpt\n %s", dis_buf);
   6297                break;
   6298             }
   6299 
   6300             default:
   6301                vex_printf("unhandled opc_aux = 0x%2x\n",
   6302                           (UInt)gregLO3ofRM(modrm));
   6303                vex_printf("first_opcode == 0xDB\n");
   6304                goto decode_fail;
   6305          }
   6306 
   6307       } else {
   6308 
   6309          delta++;
   6310          switch (modrm) {
   6311 
   6312             case 0xC0 ... 0xC7: /* FCMOVNB ST(i), ST(0) */
   6313                r_src = (UInt)modrm - 0xC0;
   6314                DIP("fcmovnb %%st(%u), %%st(0)\n", r_src);
   6315                put_ST_UNCHECKED(0,
   6316                                 IRExpr_ITE(
   6317                                     mk_amd64g_calculate_condition(AMD64CondNB),
   6318                                     get_ST(r_src), get_ST(0)) );
   6319                break;
   6320 
   6321             case 0xC8 ... 0xCF: /* FCMOVNE(NZ) ST(i), ST(0) */
   6322                r_src = (UInt)modrm - 0xC8;
   6323                DIP("fcmovnz %%st(%u), %%st(0)\n", r_src);
   6324                put_ST_UNCHECKED(
   6325                   0,
   6326                   IRExpr_ITE(
   6327                      mk_amd64g_calculate_condition(AMD64CondNZ),
   6328                      get_ST(r_src),
   6329                      get_ST(0)
   6330                   )
   6331                );
   6332                break;
   6333 
   6334             case 0xD0 ... 0xD7: /* FCMOVNBE ST(i), ST(0) */
   6335                r_src = (UInt)modrm - 0xD0;
   6336                DIP("fcmovnbe %%st(%u), %%st(0)\n", r_src);
   6337                put_ST_UNCHECKED(
   6338                   0,
   6339                   IRExpr_ITE(
   6340                      mk_amd64g_calculate_condition(AMD64CondNBE),
   6341                      get_ST(r_src),
   6342                      get_ST(0)
   6343                   )
   6344                );
   6345                break;
   6346 
   6347             case 0xD8 ... 0xDF: /* FCMOVNU ST(i), ST(0) */
   6348                r_src = (UInt)modrm - 0xD8;
   6349                DIP("fcmovnu %%st(%u), %%st(0)\n", r_src);
   6350                put_ST_UNCHECKED(
   6351                   0,
   6352                   IRExpr_ITE(
   6353                      mk_amd64g_calculate_condition(AMD64CondNP),
   6354                      get_ST(r_src),
   6355                      get_ST(0)
   6356                   )
   6357                );
   6358                break;
   6359 
            case 0xE2: /* FNCLEX */
               /* Clears pending x87 exception flags on real hardware.
                  No guest state is updated here, so this is modelled
                  as a no-op apart from the disassembly trace. */
               DIP("fnclex\n");
               break;
   6363 
   6364             case 0xE3: {
   6365                gen_FINIT_SEQUENCE(NULL/*no guarding condition*/);
   6366                DIP("fninit\n");
   6367                break;
   6368             }
   6369 
   6370             case 0xE8 ... 0xEF: /* FUCOMI %st(0),%st(?) */
   6371                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, False );
   6372                break;
   6373 
   6374             case 0xF0 ... 0xF7: /* FCOMI %st(0),%st(?) */
   6375                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, False );
   6376                break;
   6377 
   6378             default:
   6379                goto decode_fail;
   6380          }
   6381       }
   6382    }
   6383 
   6384    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDC opcodes +-+-+-+-+-+-+-+ */
   6385    else
   6386    if (first_opcode == 0xDC) {
   6387       if (modrm < 0xC0) {
   6388 
   6389          /* bits 5,4,3 are an opcode extension, and the modRM also
   6390             specifies an address. */
   6391          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6392          delta += len;
   6393 
   6394          switch (gregLO3ofRM(modrm)) {
   6395 
   6396             case 0: /* FADD double-real */
   6397                fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, True );
   6398                break;
   6399 
   6400             case 1: /* FMUL double-real */
   6401                fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, True );
   6402                break;
   6403 
   6404 //..             case 2: /* FCOM double-real */
   6405 //..                DIP("fcoml %s\n", dis_buf);
   6406 //..                /* This forces C1 to zero, which isn't right. */
   6407 //..                put_C3210(
   6408 //..                    binop( Iop_And32,
   6409 //..                           binop(Iop_Shl32,
   6410 //..                                 binop(Iop_CmpF64,
   6411 //..                                       get_ST(0),
   6412 //..                                       loadLE(Ity_F64,mkexpr(addr))),
   6413 //..                                 mkU8(8)),
   6414 //..                           mkU32(0x4500)
   6415 //..                    ));
   6416 //..                break;
   6417 
   6418             case 3: /* FCOMP double-real */
   6419                DIP("fcompl %s\n", dis_buf);
   6420                /* This forces C1 to zero, which isn't right. */
   6421                put_C3210(
   6422                    unop(Iop_32Uto64,
   6423                    binop( Iop_And32,
   6424                           binop(Iop_Shl32,
   6425                                 binop(Iop_CmpF64,
   6426                                       get_ST(0),
   6427                                       loadLE(Ity_F64,mkexpr(addr))),
   6428                                 mkU8(8)),
   6429                           mkU32(0x4500)
   6430                    )));
   6431                fp_pop();
   6432                break;
   6433 
   6434             case 4: /* FSUB double-real */
   6435                fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, True );
   6436                break;
   6437 
   6438             case 5: /* FSUBR double-real */
   6439                fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, True );
   6440                break;
   6441 
   6442             case 6: /* FDIV double-real */
   6443                fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, True );
   6444                break;
   6445 
   6446             case 7: /* FDIVR double-real */
   6447                fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, True );
   6448                break;
   6449 
   6450             default:
   6451                vex_printf("unhandled opc_aux = 0x%2x\n",
   6452                           (UInt)gregLO3ofRM(modrm));
   6453                vex_printf("first_opcode == 0xDC\n");
   6454                goto decode_fail;
   6455          }
   6456 
   6457       } else {
   6458 
   6459          delta++;
   6460          switch (modrm) {
   6461 
   6462             case 0xC0 ... 0xC7: /* FADD %st(0),%st(?) */
   6463                fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, False );
   6464                break;
   6465 
   6466             case 0xC8 ... 0xCF: /* FMUL %st(0),%st(?) */
   6467                fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, False );
   6468                break;
   6469 
   6470             case 0xE0 ... 0xE7: /* FSUBR %st(0),%st(?) */
   6471                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0, modrm - 0xE0, False );
   6472                break;
   6473 
   6474             case 0xE8 ... 0xEF: /* FSUB %st(0),%st(?) */
   6475                fp_do_op_ST_ST ( "sub", Iop_SubF64, 0, modrm - 0xE8, False );
   6476                break;
   6477 
   6478             case 0xF0 ... 0xF7: /* FDIVR %st(0),%st(?) */
   6479                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, False );
   6480                break;
   6481 
   6482             case 0xF8 ... 0xFF: /* FDIV %st(0),%st(?) */
   6483                fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, False );
   6484                break;
   6485 
   6486             default:
   6487                goto decode_fail;
   6488          }
   6489 
   6490       }
   6491    }
   6492 
   6493    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDD opcodes +-+-+-+-+-+-+-+ */
   6494    else
   6495    if (first_opcode == 0xDD) {
   6496 
   6497       if (modrm < 0xC0) {
   6498 
   6499          /* bits 5,4,3 are an opcode extension, and the modRM also
   6500             specifies an address. */
   6501          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6502          delta += len;
   6503 
   6504          switch (gregLO3ofRM(modrm)) {
   6505 
   6506             case 0: /* FLD double-real */
   6507                DIP("fldl %s\n", dis_buf);
   6508                fp_push();
   6509                put_ST(0, loadLE(Ity_F64, mkexpr(addr)));
   6510                break;
   6511 
   6512             case 1: /* FISTTPQ m64 (SSE3) */
   6513                DIP("fistppll %s\n", dis_buf);
   6514                storeLE( mkexpr(addr),
   6515                         binop(Iop_F64toI64S, mkU32(Irrm_ZERO), get_ST(0)) );
   6516                fp_pop();
   6517                break;
   6518 
   6519             case 2: /* FST double-real */
   6520                DIP("fstl %s\n", dis_buf);
   6521                storeLE(mkexpr(addr), get_ST(0));
   6522                break;
   6523 
   6524             case 3: /* FSTP double-real */
   6525                DIP("fstpl %s\n", dis_buf);
   6526                storeLE(mkexpr(addr), get_ST(0));
   6527                fp_pop();
   6528                break;
   6529 
   6530             case 4: { /* FRSTOR m94/m108 */
   6531                IRTemp   ew = newTemp(Ity_I32);
   6532                IRTemp  w64 = newTemp(Ity_I64);
   6533                IRDirty*  d;
   6534                if ( have66(pfx) ) {
   6535                   /* Uses dirty helper:
   6536                      VexEmNote amd64g_dirtyhelper_FRSTORS
   6537                                   ( VexGuestAMD64State*, HWord ) */
   6538                   d = unsafeIRDirty_0_N (
   6539                          0/*regparms*/,
   6540                          "amd64g_dirtyhelper_FRSTORS",
   6541                          &amd64g_dirtyhelper_FRSTORS,
   6542                          mkIRExprVec_1( mkexpr(addr) )
   6543                       );
   6544                   d->mSize = 94;
   6545                } else {
   6546                   /* Uses dirty helper:
   6547                      VexEmNote amd64g_dirtyhelper_FRSTOR
   6548                                   ( VexGuestAMD64State*, HWord ) */
   6549                   d = unsafeIRDirty_0_N (
   6550                          0/*regparms*/,
   6551                          "amd64g_dirtyhelper_FRSTOR",
   6552                          &amd64g_dirtyhelper_FRSTOR,
   6553                          mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   6554                       );
   6555                   d->mSize = 108;
   6556                }
   6557 
   6558                d->tmp    = w64;
   6559                /* declare we're reading memory */
   6560                d->mFx   = Ifx_Read;
   6561                d->mAddr = mkexpr(addr);
   6562                /* d->mSize set above */
   6563 
   6564                /* declare we're writing guest state */
   6565                d->nFxState = 5;
   6566                vex_bzero(&d->fxState, sizeof(d->fxState));
   6567 
   6568                d->fxState[0].fx     = Ifx_Write;
   6569                d->fxState[0].offset = OFFB_FTOP;
   6570                d->fxState[0].size   = sizeof(UInt);
   6571 
   6572                d->fxState[1].fx     = Ifx_Write;
   6573                d->fxState[1].offset = OFFB_FPREGS;
   6574                d->fxState[1].size   = 8 * sizeof(ULong);
   6575 
   6576                d->fxState[2].fx     = Ifx_Write;
   6577                d->fxState[2].offset = OFFB_FPTAGS;
   6578                d->fxState[2].size   = 8 * sizeof(UChar);
   6579 
   6580                d->fxState[3].fx     = Ifx_Write;
   6581                d->fxState[3].offset = OFFB_FPROUND;
   6582                d->fxState[3].size   = sizeof(ULong);
   6583 
   6584                d->fxState[4].fx     = Ifx_Write;
   6585                d->fxState[4].offset = OFFB_FC3210;
   6586                d->fxState[4].size   = sizeof(ULong);
   6587 
   6588                stmt( IRStmt_Dirty(d) );
   6589 
   6590                /* ew contains any emulation warning we may need to
   6591                   issue.  If needed, side-exit to the next insn,
   6592                   reporting the warning, so that Valgrind's dispatcher
   6593                   sees the warning. */
   6594                assign(ew, unop(Iop_64to32,mkexpr(w64)) );
   6595                put_emwarn( mkexpr(ew) );
   6596                stmt(
   6597                   IRStmt_Exit(
   6598                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   6599                      Ijk_EmWarn,
   6600                      IRConst_U64( guest_RIP_bbstart+delta ),
   6601                      OFFB_RIP
   6602                   )
   6603                );
   6604 
   6605                if ( have66(pfx) ) {
   6606                   DIP("frstors %s\n", dis_buf);
   6607                } else {
   6608                   DIP("frstor %s\n", dis_buf);
   6609                }
   6610                break;
   6611             }
   6612 
   6613             case 6: { /* FNSAVE m94/m108 */
   6614                IRDirty *d;
   6615                if ( have66(pfx) ) {
   6616                  /* Uses dirty helper:
   6617                     void amd64g_dirtyhelper_FNSAVES ( VexGuestAMD64State*,
   6618                                                       HWord ) */
   6619                   d = unsafeIRDirty_0_N (
   6620                          0/*regparms*/,
   6621                          "amd64g_dirtyhelper_FNSAVES",
   6622                          &amd64g_dirtyhelper_FNSAVES,
   6623                          mkIRExprVec_1( mkexpr(addr) )
   6624                          );
   6625                   d->mSize = 94;
   6626                } else {
   6627                  /* Uses dirty helper:
   6628                     void amd64g_dirtyhelper_FNSAVE ( VexGuestAMD64State*,
   6629                                                      HWord ) */
   6630                   d = unsafeIRDirty_0_N (
   6631                          0/*regparms*/,
   6632                          "amd64g_dirtyhelper_FNSAVE",
   6633                          &amd64g_dirtyhelper_FNSAVE,
   6634                          mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   6635                       );
   6636                   d->mSize = 108;
   6637                }
   6638 
   6639                /* declare we're writing memory */
   6640                d->mFx   = Ifx_Write;
   6641                d->mAddr = mkexpr(addr);
   6642                /* d->mSize set above */
   6643 
   6644                /* declare we're reading guest state */
   6645                d->nFxState = 5;
   6646                vex_bzero(&d->fxState, sizeof(d->fxState));
   6647 
   6648                d->fxState[0].fx     = Ifx_Read;
   6649                d->fxState[0].offset = OFFB_FTOP;
   6650                d->fxState[0].size   = sizeof(UInt);
   6651 
   6652                d->fxState[1].fx     = Ifx_Read;
   6653                d->fxState[1].offset = OFFB_FPREGS;
   6654                d->fxState[1].size   = 8 * sizeof(ULong);
   6655 
   6656                d->fxState[2].fx     = Ifx_Read;
   6657                d->fxState[2].offset = OFFB_FPTAGS;
   6658                d->fxState[2].size   = 8 * sizeof(UChar);
   6659 
   6660                d->fxState[3].fx     = Ifx_Read;
   6661                d->fxState[3].offset = OFFB_FPROUND;
   6662                d->fxState[3].size   = sizeof(ULong);
   6663 
   6664                d->fxState[4].fx     = Ifx_Read;
   6665                d->fxState[4].offset = OFFB_FC3210;
   6666                d->fxState[4].size   = sizeof(ULong);
   6667 
   6668                stmt( IRStmt_Dirty(d) );
   6669 
   6670                if ( have66(pfx) ) {
   6671                  DIP("fnsaves %s\n", dis_buf);
   6672                } else {
   6673                  DIP("fnsave %s\n", dis_buf);
   6674                }
   6675                break;
   6676             }
   6677 
   6678             case 7: { /* FNSTSW m16 */
   6679                IRExpr* sw = get_FPU_sw();
   6680                vassert(typeOfIRExpr(irsb->tyenv, sw) == Ity_I16);
   6681                storeLE( mkexpr(addr), sw );
   6682                DIP("fnstsw %s\n", dis_buf);
   6683                break;
   6684             }
   6685 
   6686             default:
   6687                vex_printf("unhandled opc_aux = 0x%2x\n",
   6688                           (UInt)gregLO3ofRM(modrm));
   6689                vex_printf("first_opcode == 0xDD\n");
   6690                goto decode_fail;
   6691          }
   6692       } else {
   6693          delta++;
   6694          switch (modrm) {
   6695 
   6696             case 0xC0 ... 0xC7: /* FFREE %st(?) */
   6697                r_dst = (UInt)modrm - 0xC0;
   6698                DIP("ffree %%st(%u)\n", r_dst);
   6699                put_ST_TAG ( r_dst, mkU8(0) );
   6700                break;
   6701 
            case 0xD0 ... 0xD7: /* FST %st(0),%st(?) */
               r_dst = (UInt)modrm - 0xD0;
               DIP("fst %%st(0),%%st(%u)\n", r_dst);
               /* P4 manual says: "If the destination operand is a
                  non-empty register, the invalid-operation exception
                  is not generated."  Hence put_ST_UNCHECKED. */
               put_ST_UNCHECKED(r_dst, get_ST(0));
               break;
   6710 
            case 0xD8 ... 0xDF: /* FSTP %st(0),%st(?) */
               r_dst = (UInt)modrm - 0xD8;
               DIP("fstp %%st(0),%%st(%u)\n", r_dst);
               /* P4 manual says: "If the destination operand is a
                  non-empty register, the invalid-operation exception
                  is not generated."  Hence put_ST_UNCHECKED. */
               put_ST_UNCHECKED(r_dst, get_ST(0));
               fp_pop();
               break;
   6720 
   6721             case 0xE0 ... 0xE7: /* FUCOM %st(0),%st(?) */
   6722                r_dst = (UInt)modrm - 0xE0;
   6723                DIP("fucom %%st(0),%%st(%u)\n", r_dst);
   6724                /* This forces C1 to zero, which isn't right. */
   6725                put_C3210(
   6726                    unop(Iop_32Uto64,
   6727                    binop( Iop_And32,
   6728                           binop(Iop_Shl32,
   6729                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   6730                                 mkU8(8)),
   6731                           mkU32(0x4500)
   6732                    )));
   6733                break;
   6734 
   6735             case 0xE8 ... 0xEF: /* FUCOMP %st(0),%st(?) */
   6736                r_dst = (UInt)modrm - 0xE8;
   6737                DIP("fucomp %%st(0),%%st(%u)\n", r_dst);
   6738                /* This forces C1 to zero, which isn't right. */
   6739                put_C3210(
   6740                    unop(Iop_32Uto64,
   6741                    binop( Iop_And32,
   6742                           binop(Iop_Shl32,
   6743                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   6744                                 mkU8(8)),
   6745                           mkU32(0x4500)
   6746                    )));
   6747                fp_pop();
   6748                break;
   6749 
   6750             default:
   6751                goto decode_fail;
   6752          }
   6753       }
   6754    }
   6755 
   6756    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDE opcodes +-+-+-+-+-+-+-+ */
   6757    else
   6758    if (first_opcode == 0xDE) {
   6759 
   6760       if (modrm < 0xC0) {
   6761 
   6762          /* bits 5,4,3 are an opcode extension, and the modRM also
   6763             specifies an address. */
   6764          IROp   fop;
   6765          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6766          delta += len;
   6767 
   6768          switch (gregLO3ofRM(modrm)) {
   6769 
   6770             case 0: /* FIADD m16int */ /* ST(0) += m16int */
   6771                DIP("fiaddw %s\n", dis_buf);
   6772                fop = Iop_AddF64;
   6773                goto do_fop_m16;
   6774 
   6775             case 1: /* FIMUL m16int */ /* ST(0) *= m16int */
   6776                DIP("fimulw %s\n", dis_buf);
   6777                fop = Iop_MulF64;
   6778                goto do_fop_m16;
   6779 
   6780             case 4: /* FISUB m16int */ /* ST(0) -= m16int */
   6781                DIP("fisubw %s\n", dis_buf);
   6782                fop = Iop_SubF64;
   6783                goto do_fop_m16;
   6784 
   6785             case 5: /* FISUBR m16int */ /* ST(0) = m16int - ST(0) */
   6786                DIP("fisubrw %s\n", dis_buf);
   6787                fop = Iop_SubF64;
   6788                goto do_foprev_m16;
   6789 
   6790             case 6: /* FIDIV m16int */ /* ST(0) /= m16int */
   6791                DIP("fisubw %s\n", dis_buf);
   6792                fop = Iop_DivF64;
   6793                goto do_fop_m16;
   6794 
   6795             case 7: /* FIDIVR m16int */ /* ST(0) = m16int / ST(0) */
   6796                DIP("fidivrw %s\n", dis_buf);
   6797                fop = Iop_DivF64;
   6798                goto do_foprev_m16;
   6799 
   6800             do_fop_m16:
   6801                put_ST_UNCHECKED(0,
   6802                   triop(fop,
   6803                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6804                         get_ST(0),
   6805                         unop(Iop_I32StoF64,
   6806                              unop(Iop_16Sto32,
   6807                                   loadLE(Ity_I16, mkexpr(addr))))));
   6808                break;
   6809 
   6810             do_foprev_m16:
   6811                put_ST_UNCHECKED(0,
   6812                   triop(fop,
   6813                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6814                         unop(Iop_I32StoF64,
   6815                              unop(Iop_16Sto32,
   6816                                   loadLE(Ity_I16, mkexpr(addr)))),
   6817                         get_ST(0)));
   6818                break;
   6819 
   6820             default:
   6821                vex_printf("unhandled opc_aux = 0x%2x\n",
   6822                           (UInt)gregLO3ofRM(modrm));
   6823                vex_printf("first_opcode == 0xDE\n");
   6824                goto decode_fail;
   6825          }
   6826 
   6827       } else {
   6828 
   6829          delta++;
   6830          switch (modrm) {
   6831 
   6832             case 0xC0 ... 0xC7: /* FADDP %st(0),%st(?) */
   6833                fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, True );
   6834                break;
   6835 
   6836             case 0xC8 ... 0xCF: /* FMULP %st(0),%st(?) */
   6837                fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, True );
   6838                break;
   6839 
   6840             case 0xD9: /* FCOMPP %st(0),%st(1) */
   6841                DIP("fcompp %%st(0),%%st(1)\n");
   6842                /* This forces C1 to zero, which isn't right. */
   6843                put_C3210(
   6844                    unop(Iop_32Uto64,
   6845                    binop( Iop_And32,
   6846                           binop(Iop_Shl32,
   6847                                 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
   6848                                 mkU8(8)),
   6849                           mkU32(0x4500)
   6850                    )));
   6851                fp_pop();
   6852                fp_pop();
   6853                break;
   6854 
   6855             case 0xE0 ... 0xE7: /* FSUBRP %st(0),%st(?) */
   6856                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0,  modrm - 0xE0, True );
   6857                break;
   6858 
   6859             case 0xE8 ... 0xEF: /* FSUBP %st(0),%st(?) */
   6860                fp_do_op_ST_ST ( "sub", Iop_SubF64, 0,  modrm - 0xE8, True );
   6861                break;
   6862 
   6863             case 0xF0 ... 0xF7: /* FDIVRP %st(0),%st(?) */
   6864                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, True );
   6865                break;
   6866 
   6867             case 0xF8 ... 0xFF: /* FDIVP %st(0),%st(?) */
   6868                fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, True );
   6869                break;
   6870 
   6871             default:
   6872                goto decode_fail;
   6873          }
   6874 
   6875       }
   6876    }
   6877 
   6878    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDF opcodes +-+-+-+-+-+-+-+ */
   6879    else
   6880    if (first_opcode == 0xDF) {
   6881 
   6882       if (modrm < 0xC0) {
   6883 
   6884          /* bits 5,4,3 are an opcode extension, and the modRM also
   6885             specifies an address. */
   6886          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6887          delta += len;
   6888 
   6889          switch (gregLO3ofRM(modrm)) {
   6890 
   6891             case 0: /* FILD m16int */
   6892                DIP("fildw %s\n", dis_buf);
   6893                fp_push();
   6894                put_ST(0, unop(Iop_I32StoF64,
   6895                               unop(Iop_16Sto32,
   6896                                    loadLE(Ity_I16, mkexpr(addr)))));
   6897                break;
   6898 
   6899             case 1: /* FISTTPS m16 (SSE3) */
   6900                DIP("fisttps %s\n", dis_buf);
   6901                storeLE( mkexpr(addr),
   6902                         x87ishly_qnarrow_32_to_16(
   6903                         binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) ));
   6904                fp_pop();
   6905                break;
   6906 
   6907             case 2: /* FIST m16 */
   6908                DIP("fists %s\n", dis_buf);
   6909                storeLE( mkexpr(addr),
   6910                         x87ishly_qnarrow_32_to_16(
   6911                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) ));
   6912                break;
   6913 
   6914             case 3: /* FISTP m16 */
   6915                DIP("fistps %s\n", dis_buf);
   6916                storeLE( mkexpr(addr),
   6917                         x87ishly_qnarrow_32_to_16(
   6918                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) ));
   6919                fp_pop();
   6920                break;
   6921 
   6922             case 5: /* FILD m64 */
   6923                DIP("fildll %s\n", dis_buf);
   6924                fp_push();
   6925                put_ST(0, binop(Iop_I64StoF64,
   6926                                get_roundingmode(),
   6927                                loadLE(Ity_I64, mkexpr(addr))));
   6928                break;
   6929 
   6930             case 7: /* FISTP m64 */
   6931                DIP("fistpll %s\n", dis_buf);
   6932                storeLE( mkexpr(addr),
   6933                         binop(Iop_F64toI64S, get_roundingmode(), get_ST(0)) );
   6934                fp_pop();
   6935                break;
   6936 
   6937             default:
   6938                vex_printf("unhandled opc_aux = 0x%2x\n",
   6939                           (UInt)gregLO3ofRM(modrm));
   6940                vex_printf("first_opcode == 0xDF\n");
   6941                goto decode_fail;
   6942          }
   6943 
   6944       } else {
   6945 
   6946          delta++;
   6947          switch (modrm) {
   6948 
   6949             case 0xC0: /* FFREEP %st(0) */
   6950                DIP("ffreep %%st(%d)\n", 0);
   6951                put_ST_TAG ( 0, mkU8(0) );
   6952                fp_pop();
   6953                break;
   6954 
   6955             case 0xE0: /* FNSTSW %ax */
   6956                DIP("fnstsw %%ax\n");
   6957                /* Invent a plausible-looking FPU status word value and
   6958                   dump it in %AX:
   6959                      ((ftop & 7) << 11) | (c3210 & 0x4700)
   6960                */
   6961                putIRegRAX(
   6962                   2,
   6963                   unop(Iop_32to16,
   6964                        binop(Iop_Or32,
   6965                              binop(Iop_Shl32,
   6966                                    binop(Iop_And32, get_ftop(), mkU32(7)),
   6967                                    mkU8(11)),
   6968                              binop(Iop_And32,
   6969                                    unop(Iop_64to32, get_C3210()),
   6970                                    mkU32(0x4700))
   6971                )));
   6972                break;
   6973 
   6974             case 0xE8 ... 0xEF: /* FUCOMIP %st(0),%st(?) */
   6975                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, True );
   6976                break;
   6977 
   6978             case 0xF0 ... 0xF7: /* FCOMIP %st(0),%st(?) */
   6979                /* not really right since COMIP != UCOMIP */
   6980                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, True );
   6981                break;
   6982 
   6983             default:
   6984                goto decode_fail;
   6985          }
   6986       }
   6987 
   6988    }
   6989 
   6990    else
   6991       goto decode_fail;
   6992 
   6993    *decode_ok = True;
   6994    return delta;
   6995 
   6996   decode_fail:
   6997    *decode_ok = False;
   6998    return delta;
   6999 }
   7000 
   7001 
   7002 /*------------------------------------------------------------*/
   7003 /*---                                                      ---*/
   7004 /*--- MMX INSTRUCTIONS                                     ---*/
   7005 /*---                                                      ---*/
   7006 /*------------------------------------------------------------*/
   7007 
   7008 /* Effect of MMX insns on x87 FPU state (table 11-2 of
   7009    IA32 arch manual, volume 3):
   7010 
   7011    Read from, or write to MMX register (viz, any insn except EMMS):
   7012    * All tags set to Valid (non-empty) -- FPTAGS[i] := nonzero
   7013    * FP stack pointer set to zero
   7014 
   7015    EMMS:
   7016    * All tags set to Invalid (empty) -- FPTAGS[i] := zero
   7017    * FP stack pointer set to zero
   7018 */
   7019 
   7020 static void do_MMX_preamble ( void )
   7021 {
   7022    Int         i;
   7023    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   7024    IRExpr*     zero  = mkU32(0);
   7025    IRExpr*     tag1  = mkU8(1);
   7026    put_ftop(zero);
   7027    for (i = 0; i < 8; i++)
   7028       stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag1) ) );
   7029 }
   7030 
   7031 static void do_EMMS_preamble ( void )
   7032 {
   7033    Int         i;
   7034    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   7035    IRExpr*     zero  = mkU32(0);
   7036    IRExpr*     tag0  = mkU8(0);
   7037    put_ftop(zero);
   7038    for (i = 0; i < 8; i++)
   7039       stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag0) ) );
   7040 }
   7041 
   7042 
   7043 static IRExpr* getMMXReg ( UInt archreg )
   7044 {
   7045    vassert(archreg < 8);
   7046    return IRExpr_Get( OFFB_FPREGS + 8 * archreg, Ity_I64 );
   7047 }
   7048 
   7049 
   7050 static void putMMXReg ( UInt archreg, IRExpr* e )
   7051 {
   7052    vassert(archreg < 8);
   7053    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   7054    stmt( IRStmt_Put( OFFB_FPREGS + 8 * archreg, e ) );
   7055 }
   7056 
   7057 
   7058 /* Helper for non-shift MMX insns.  Note this is incomplete in the
   7059    sense that it does not first call do_MMX_preamble() -- that is the
   7060    responsibility of its caller. */
   7061 
static
ULong dis_MMXop_regmem_to_reg ( const VexAbiInfo* vbi,
                                Prefix      pfx,
                                Long        delta,
                                UChar       opc,
                                const HChar* name,
                                Bool        show_granularity )
{
   HChar   dis_buf[50];
   UChar   modrm = getUChar(delta);
   Bool    isReg = epartIsReg(modrm);
   IRExpr* argL  = NULL;
   IRExpr* argR  = NULL;
   IRExpr* argG  = NULL;
   IRExpr* argE  = NULL;
   IRTemp  res   = newTemp(Ity_I64);

   /* Exactly one of |op| (an IROp) or |hAddr|/|hName| (a C helper)
      gets filled in by the switch below; the vasserts further down
      enforce this.  |invG| means complement the G operand first
      (used for PANDN); |eLeft| means the E operand is the left arg. */
   Bool    invG  = False;
   IROp    op    = Iop_INVALID;
   void*   hAddr = NULL;
   const HChar*  hName = NULL;
   Bool    eLeft = False;

   /* Route this opcode to a C helper function instead of an IROp. */
#  define XXX(_name) do { hAddr = &_name; hName = #_name; } while (0)

   /* Map the second opcode byte to either an IROp or a helper. */
   switch (opc) {
      /* Original MMX ones */
      case 0xFC: op = Iop_Add8x8; break;
      case 0xFD: op = Iop_Add16x4; break;
      case 0xFE: op = Iop_Add32x2; break;

      case 0xEC: op = Iop_QAdd8Sx8; break;
      case 0xED: op = Iop_QAdd16Sx4; break;

      case 0xDC: op = Iop_QAdd8Ux8; break;
      case 0xDD: op = Iop_QAdd16Ux4; break;

      case 0xF8: op = Iop_Sub8x8;  break;
      case 0xF9: op = Iop_Sub16x4; break;
      case 0xFA: op = Iop_Sub32x2; break;

      case 0xE8: op = Iop_QSub8Sx8; break;
      case 0xE9: op = Iop_QSub16Sx4; break;

      case 0xD8: op = Iop_QSub8Ux8; break;
      case 0xD9: op = Iop_QSub16Ux4; break;

      case 0xE5: op = Iop_MulHi16Sx4; break;
      case 0xD5: op = Iop_Mul16x4; break;
      case 0xF5: XXX(amd64g_calculate_mmx_pmaddwd); break;

      case 0x74: op = Iop_CmpEQ8x8; break;
      case 0x75: op = Iop_CmpEQ16x4; break;
      case 0x76: op = Iop_CmpEQ32x2; break;

      case 0x64: op = Iop_CmpGT8Sx8; break;
      case 0x65: op = Iop_CmpGT16Sx4; break;
      case 0x66: op = Iop_CmpGT32Sx2; break;

      /* Pack/unpack insns take E as the left operand. */
      case 0x6B: op = Iop_QNarrowBin32Sto16Sx4; eLeft = True; break;
      case 0x63: op = Iop_QNarrowBin16Sto8Sx8;  eLeft = True; break;
      case 0x67: op = Iop_QNarrowBin16Sto8Ux8;  eLeft = True; break;

      case 0x68: op = Iop_InterleaveHI8x8;  eLeft = True; break;
      case 0x69: op = Iop_InterleaveHI16x4; eLeft = True; break;
      case 0x6A: op = Iop_InterleaveHI32x2; eLeft = True; break;

      case 0x60: op = Iop_InterleaveLO8x8;  eLeft = True; break;
      case 0x61: op = Iop_InterleaveLO16x4; eLeft = True; break;
      case 0x62: op = Iop_InterleaveLO32x2; eLeft = True; break;

      case 0xDB: op = Iop_And64; break;
      case 0xDF: op = Iop_And64; invG = True; break;
      case 0xEB: op = Iop_Or64; break;
      case 0xEF: /* Possibly do better here if argL and argR are the
                    same reg */
                 op = Iop_Xor64; break;

      /* Introduced in SSE1 */
      case 0xE0: op = Iop_Avg8Ux8;    break;
      case 0xE3: op = Iop_Avg16Ux4;   break;
      case 0xEE: op = Iop_Max16Sx4;   break;
      case 0xDE: op = Iop_Max8Ux8;    break;
      case 0xEA: op = Iop_Min16Sx4;   break;
      case 0xDA: op = Iop_Min8Ux8;    break;
      case 0xE4: op = Iop_MulHi16Ux4; break;
      case 0xF6: XXX(amd64g_calculate_mmx_psadbw); break;

      /* Introduced in SSE2 */
      case 0xD4: op = Iop_Add64; break;
      case 0xFB: op = Iop_Sub64; break;

      default:
         vex_printf("\n0x%x\n", (UInt)opc);
         vpanic("dis_MMXop_regmem_to_reg");
   }

#  undef XXX

   /* Fetch the G operand, complementing it for PANDN. */
   argG = getMMXReg(gregLO3ofRM(modrm));
   if (invG)
      argG = unop(Iop_Not64, argG);

   /* Fetch the E operand: either a register or a 64-bit load. */
   if (isReg) {
      delta++;
      argE = getMMXReg(eregLO3ofRM(modrm));
   } else {
      Int    len;
      IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
      delta += len;
      argE = loadLE(Ity_I64, mkexpr(addr));
   }

   /* Order the operands as the chosen operation requires. */
   if (eLeft) {
      argL = argE;
      argR = argG;
   } else {
      argL = argG;
      argR = argE;
   }

   /* Compute the result either directly or via the C helper. */
   if (op != Iop_INVALID) {
      vassert(hName == NULL);
      vassert(hAddr == NULL);
      assign(res, binop(op, argL, argR));
   } else {
      vassert(hName != NULL);
      vassert(hAddr != NULL);
      assign( res,
              mkIRExprCCall(
                 Ity_I64,
                 0/*regparms*/, hName, hAddr,
                 mkIRExprVec_2( argL, argR )
              )
            );
   }

   putMMXReg( gregLO3ofRM(modrm), mkexpr(res) );

   DIP("%s%s %s, %s\n",
       name, show_granularity ? nameMMXGran(opc & 3) : "",
       ( isReg ? nameMMXReg(eregLO3ofRM(modrm)) : dis_buf ),
       nameMMXReg(gregLO3ofRM(modrm)) );

   return delta;
}
   7208 
   7209 
   7210 /* Vector by scalar shift of G by the amount specified at the bottom
   7211    of E.  This is a straight copy of dis_SSE_shiftG_byE. */
   7212 
   7213 static ULong dis_MMX_shiftG_byE ( const VexAbiInfo* vbi,
   7214                                   Prefix pfx, Long delta,
   7215                                   const HChar* opname, IROp op )
   7216 {
   7217    HChar   dis_buf[50];
   7218    Int     alen, size;
   7219    IRTemp  addr;
   7220    Bool    shl, shr, sar;
   7221    UChar   rm   = getUChar(delta);
   7222    IRTemp  g0   = newTemp(Ity_I64);
   7223    IRTemp  g1   = newTemp(Ity_I64);
   7224    IRTemp  amt  = newTemp(Ity_I64);
   7225    IRTemp  amt8 = newTemp(Ity_I8);
   7226 
   7227    if (epartIsReg(rm)) {
   7228       assign( amt, getMMXReg(eregLO3ofRM(rm)) );
   7229       DIP("%s %s,%s\n", opname,
   7230                         nameMMXReg(eregLO3ofRM(rm)),
   7231                         nameMMXReg(gregLO3ofRM(rm)) );
   7232       delta++;
   7233    } else {
   7234       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   7235       assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
   7236       DIP("%s %s,%s\n", opname,
   7237                         dis_buf,
   7238                         nameMMXReg(gregLO3ofRM(rm)) );
   7239       delta += alen;
   7240    }
   7241    assign( g0,   getMMXReg(gregLO3ofRM(rm)) );
   7242    assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
   7243 
   7244    shl = shr = sar = False;
   7245    size = 0;
   7246    switch (op) {
   7247       case Iop_ShlN16x4: shl = True; size = 32; break;
   7248       case Iop_ShlN32x2: shl = True; size = 32; break;
   7249       case Iop_Shl64:    shl = True; size = 64; break;
   7250       case Iop_ShrN16x4: shr = True; size = 16; break;
   7251       case Iop_ShrN32x2: shr = True; size = 32; break;
   7252       case Iop_Shr64:    shr = True; size = 64; break;
   7253       case Iop_SarN16x4: sar = True; size = 16; break;
   7254       case Iop_SarN32x2: sar = True; size = 32; break;
   7255       default: vassert(0);
   7256    }
   7257 
   7258    if (shl || shr) {
   7259      assign(
   7260         g1,
   7261         IRExpr_ITE(
   7262            binop(Iop_CmpLT64U,mkexpr(amt),mkU64(size)),
   7263            binop(op, mkexpr(g0), mkexpr(amt8)),
   7264            mkU64(0)
   7265         )
   7266      );
   7267    } else
   7268    if (sar) {
   7269      assign(
   7270         g1,
   7271         IRExpr_ITE(
   7272            binop(Iop_CmpLT64U,mkexpr(amt),mkU64(size)),
   7273            binop(op, mkexpr(g0), mkexpr(amt8)),
   7274            binop(op, mkexpr(g0), mkU8(size-1))
   7275         )
   7276      );
   7277    } else {
   7278       vassert(0);
   7279    }
   7280 
   7281    putMMXReg( gregLO3ofRM(rm), mkexpr(g1) );
   7282    return delta;
   7283 }
   7284 
   7285 
   7286 /* Vector by scalar shift of E by an immediate byte.  This is a
   7287    straight copy of dis_SSE_shiftE_imm. */
   7288 
   7289 static
   7290 ULong dis_MMX_shiftE_imm ( Long delta, const HChar* opname, IROp op )
   7291 {
   7292    Bool    shl, shr, sar;
   7293    UChar   rm   = getUChar(delta);
   7294    IRTemp  e0   = newTemp(Ity_I64);
   7295    IRTemp  e1   = newTemp(Ity_I64);
   7296    UChar   amt, size;
   7297    vassert(epartIsReg(rm));
   7298    vassert(gregLO3ofRM(rm) == 2
   7299            || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
   7300    amt = getUChar(delta+1);
   7301    delta += 2;
   7302    DIP("%s $%d,%s\n", opname,
   7303                       (Int)amt,
   7304                       nameMMXReg(eregLO3ofRM(rm)) );
   7305 
   7306    assign( e0, getMMXReg(eregLO3ofRM(rm)) );
   7307 
   7308    shl = shr = sar = False;
   7309    size = 0;
   7310    switch (op) {
   7311       case Iop_ShlN16x4: shl = True; size = 16; break;
   7312       case Iop_ShlN32x2: shl = True; size = 32; break;
   7313       case Iop_Shl64:    shl = True; size = 64; break;
   7314       case Iop_SarN16x4: sar = True; size = 16; break;
   7315       case Iop_SarN32x2: sar = True; size = 32; break;
   7316       case Iop_ShrN16x4: shr = True; size = 16; break;
   7317       case Iop_ShrN32x2: shr = True; size = 32; break;
   7318       case Iop_Shr64:    shr = True; size = 64; break;
   7319       default: vassert(0);
   7320    }
   7321 
   7322    if (shl || shr) {
   7323      assign( e1, amt >= size
   7324                     ? mkU64(0)
   7325                     : binop(op, mkexpr(e0), mkU8(amt))
   7326      );
   7327    } else
   7328    if (sar) {
   7329      assign( e1, amt >= size
   7330                     ? binop(op, mkexpr(e0), mkU8(size-1))
   7331                     : binop(op, mkexpr(e0), mkU8(amt))
   7332      );
   7333    } else {
   7334       vassert(0);
   7335    }
   7336 
   7337    putMMXReg( eregLO3ofRM(rm), mkexpr(e1) );
   7338    return delta;
   7339 }
   7340 
   7341 
   7342 /* Completely handle all MMX instructions except emms. */
   7343 
   7344 static
   7345 ULong dis_MMX ( Bool* decode_ok,
   7346                 const VexAbiInfo* vbi, Prefix pfx, Int sz, Long delta )
   7347 {
   7348    Int   len;
   7349    UChar modrm;
   7350    HChar dis_buf[50];
   7351    UChar opc = getUChar(delta);
   7352    delta++;
   7353 
   7354    /* dis_MMX handles all insns except emms. */
   7355    do_MMX_preamble();
   7356 
   7357    switch (opc) {
   7358 
   7359       case 0x6E:
   7360          if (sz == 4) {
   7361             /* MOVD (src)ireg32-or-mem32 (E), (dst)mmxreg (G)*/
   7362             modrm = getUChar(delta);
   7363             if (epartIsReg(modrm)) {
   7364                delta++;
   7365                putMMXReg(
   7366                   gregLO3ofRM(modrm),
   7367                   binop( Iop_32HLto64,
   7368                          mkU32(0),
   7369                          getIReg32(eregOfRexRM(pfx,modrm)) ) );
   7370                DIP("movd %s, %s\n",
   7371                    nameIReg32(eregOfRexRM(pfx,modrm)),
   7372                    nameMMXReg(gregLO3ofRM(modrm)));
   7373             } else {
   7374                IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   7375                delta += len;
   7376                putMMXReg(
   7377                   gregLO3ofRM(modrm),
   7378                   binop( Iop_32HLto64,
   7379                          mkU32(0),
   7380                          loadLE(Ity_I32, mkexpr(addr)) ) );
   7381                DIP("movd %s, %s\n", dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
   7382             }
   7383          }
   7384          else
   7385          if (sz == 8) {
   7386             /* MOVD (src)ireg64-or-mem64 (E), (dst)mmxreg (G)*/
   7387             modrm = getUChar(delta);
   7388             if (epartIsReg(modrm)) {
   7389                delta++;
   7390                putMMXReg( gregLO3ofRM(modrm),
   7391                           getIReg64(eregOfRexRM(pfx,modrm)) );
   7392                DIP("movd %s, %s\n",
   7393                    nameIReg64(eregOfRexRM(pfx,modrm)),
   7394                    nameMMXReg(gregLO3ofRM(modrm)));
   7395             } else {
   7396                IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   7397                delta += len;
   7398                putMMXReg( gregLO3ofRM(modrm),
   7399                           loadLE(Ity_I64, mkexpr(addr)) );
   7400                DIP("movd{64} %s, %s\n", dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
   7401             }
   7402          }
   7403          else {
   7404             goto mmx_decode_failure;
   7405          }
   7406          break;
   7407 
   7408       case 0x7E:
   7409          if (sz == 4) {
   7410             /* MOVD (src)mmxreg (G), (dst)ireg32-or-mem32 (E) */
   7411             modrm = getUChar(delta);
   7412             if (epartIsReg(modrm)) {
   7413                delta++;
   7414                putIReg32( eregOfRexRM(pfx,modrm),
   7415                           unop(Iop_64to32, getMMXReg(gregLO3ofRM(modrm)) ) );
   7416                DIP("movd %s, %s\n",
   7417                    nameMMXReg(gregLO3ofRM(modrm)),
   7418                    nameIReg32(eregOfRexRM(pfx,modrm)));
   7419             } else {
   7420                IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   7421                delta += len;
   7422                storeLE( mkexpr(addr),
   7423                         unop(Iop_64to32, getMMXReg(gregLO3ofRM(modrm)) ) );
   7424                DIP("movd %s, %s\n", nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
   7425             }
   7426          }
   7427          else
   7428          if (sz == 8) {
   7429             /* MOVD (src)mmxreg (G), (dst)ireg64-or-mem64 (E) */
   7430             modrm = getUChar(delta);
   7431             if (epartIsReg(modrm)) {
   7432                delta++;
   7433                putIReg64( eregOfRexRM(pfx,modrm),
   7434                           getMMXReg(gregLO3ofRM(modrm)) );
   7435                DIP("movd %s, %s\n",
   7436                    nameMMXReg(gregLO3ofRM(modrm)),
   7437                    nameIReg64(eregOfRexRM(pfx,modrm)));
   7438             } else {
   7439                IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   7440                delta += len;
   7441                storeLE( mkexpr(addr),
   7442                        getMMXReg(gregLO3ofRM(modrm)) );
   7443                DIP("movd{64} %s, %s\n", nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
   7444             }
   7445          } else {
   7446             goto mmx_decode_failure;
   7447          }
   7448          break;
   7449 
   7450       case 0x6F:
   7451          /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
   7452          if (sz != 4
   7453              && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
   7454             goto mmx_decode_failure;
   7455          modrm = getUChar(delta);
   7456          if (epartIsReg(modrm)) {
   7457             delta++;
   7458             putMMXReg( gregLO3ofRM(modrm), getMMXReg(eregLO3ofRM(modrm)) );
   7459             DIP("movq %s, %s\n",
   7460                 nameMMXReg(eregLO3ofRM(modrm)),
   7461                 nameMMXReg(gregLO3ofRM(modrm)));
   7462          } else {
   7463             IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   7464             delta += len;
   7465             putMMXReg( gregLO3ofRM(modrm), loadLE(Ity_I64, mkexpr(addr)) );
   7466             DIP("movq %s, %s\n",
   7467                 dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
   7468          }
   7469          break;
   7470 
   7471       case 0x7F:
   7472          /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
   7473          if (sz != 4
   7474              && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
   7475             goto mmx_decode_failure;
   7476          modrm = getUChar(delta);
   7477          if (epartIsReg(modrm)) {
   7478             delta++;
   7479             putMMXReg( eregLO3ofRM(modrm), getMMXReg(gregLO3ofRM(modrm)) );
   7480             DIP("movq %s, %s\n",
   7481                 nameMMXReg(gregLO3ofRM(modrm)),
   7482                 nameMMXReg(eregLO3ofRM(modrm)));
   7483          } else {
   7484             IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   7485             delta += len;
   7486             storeLE( mkexpr(addr), getMMXReg(gregLO3ofRM(modrm)) );
   7487             DIP("mov(nt)q %s, %s\n",
   7488                 nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
   7489          }
   7490          break;
   7491 
   7492       case 0xFC:
   7493       case 0xFD:
   7494       case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
   7495          if (sz != 4)
   7496             goto mmx_decode_failure;
   7497          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "padd", True );
   7498          break;
   7499 
   7500       case 0xEC:
   7501       case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
   7502          if (sz != 4
   7503              && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
   7504             goto mmx_decode_failure;
   7505          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "padds", True );
   7506          break;
   7507 
   7508       case 0xDC:
   7509       case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   7510          if (sz != 4)
   7511             goto mmx_decode_failure;
   7512          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "paddus", True );
   7513          break;
   7514 
   7515       case 0xF8:
   7516       case 0xF9:
   7517       case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
   7518          if (sz != 4)
   7519             goto mmx_decode_failure;
   7520          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psub", True );
   7521          break;
   7522 
   7523       case 0xE8:
   7524       case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
   7525          if (sz != 4)
   7526             goto mmx_decode_failure;
   7527          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psubs", True );
   7528          break;
   7529 
   7530       case 0xD8:
   7531       case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   7532          if (sz != 4)
   7533             goto mmx_decode_failure;
   7534          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psubus", True );
   7535          break;
   7536 
   7537       case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
   7538          if (sz != 4)
   7539             goto mmx_decode_failure;
   7540          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmulhw", False );
   7541          break;
   7542 
   7543       case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
   7544          if (sz != 4)
   7545             goto mmx_decode_failure;
   7546          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmullw", False );
   7547          break;
   7548 
   7549       case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
   7550          vassert(sz == 4);
   7551          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmaddwd", False );
   7552          break;
   7553 
   7554       case 0x74:
   7555       case 0x75:
   7556       case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
   7557          if (sz != 4)
   7558             goto mmx_decode_failure;
   7559          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pcmpeq", True );
   7560          break;
   7561 
   7562       case 0x64:
   7563       case 0x65:
   7564       case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
   7565          if (sz != 4)
   7566             goto mmx_decode_failure;
   7567          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pcmpgt", True );
   7568          break;
   7569 
   7570       case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
   7571          if (sz != 4)
   7572             goto mmx_decode_failure;
   7573          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packssdw", False );
   7574          break;
   7575 
   7576       case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
   7577          if (sz != 4)
   7578             goto mmx_decode_failure;
   7579          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packsswb", False );
   7580          break;
   7581 
   7582       case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
   7583          if (sz != 4)
   7584             goto mmx_decode_failure;
   7585          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packuswb", False );
   7586          break;
   7587 
   7588       case 0x68:
   7589       case 0x69:
   7590       case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
   7591          if (sz != 4
   7592              && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
   7593             goto mmx_decode_failure;
   7594          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "punpckh", True );
   7595          break;
   7596 
   7597       case 0x60:
   7598       case 0x61:
   7599       case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
   7600          if (sz != 4
   7601              && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
   7602             goto mmx_decode_failure;
   7603          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "punpckl", True );
   7604          break;
   7605 
   7606       case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
   7607          if (sz != 4)
   7608             goto mmx_decode_failure;
   7609          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pand", False );
   7610          break;
   7611 
   7612       case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
   7613          if (sz != 4)
   7614             goto mmx_decode_failure;
   7615          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pandn", False );
   7616          break;
   7617 
   7618       case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
   7619          if (sz != 4)
   7620             goto mmx_decode_failure;
   7621          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "por", False );
   7622          break;
   7623 
   7624       case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
   7625          if (sz != 4)
   7626             goto mmx_decode_failure;
   7627          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pxor", False );
   7628          break;
   7629 
   7630 #     define SHIFT_BY_REG(_name,_op)                                     \
   7631                 delta = dis_MMX_shiftG_byE(vbi, pfx, delta, _name, _op); \
   7632                 break;
   7633 
   7634       /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
   7635       case 0xF1: SHIFT_BY_REG("psllw", Iop_ShlN16x4);
   7636       case 0xF2: SHIFT_BY_REG("pslld", Iop_ShlN32x2);
   7637       case 0xF3: SHIFT_BY_REG("psllq", Iop_Shl64);
   7638 
   7639       /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
   7640       case 0xD1: SHIFT_BY_REG("psrlw", Iop_ShrN16x4);
   7641       case 0xD2: SHIFT_BY_REG("psrld", Iop_ShrN32x2);
   7642       case 0xD3: SHIFT_BY_REG("psrlq", Iop_Shr64);
   7643 
   7644       /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
   7645       case 0xE1: SHIFT_BY_REG("psraw", Iop_SarN16x4);
   7646       case 0xE2: SHIFT_BY_REG("psrad", Iop_SarN32x2);
   7647 
   7648 #     undef SHIFT_BY_REG
   7649 
   7650       case 0x71:
   7651       case 0x72:
   7652       case 0x73: {
   7653          /* (sz==4): PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
   7654          UChar byte2, subopc;
   7655          if (sz != 4)
   7656             goto mmx_decode_failure;
   7657          byte2  = getUChar(delta);      /* amode / sub-opcode */
   7658          subopc = toUChar( (byte2 >> 3) & 7 );
   7659 
   7660 #        define SHIFT_BY_IMM(_name,_op)                        \
   7661             do { delta = dis_MMX_shiftE_imm(delta,_name,_op);  \
   7662             } while (0)
   7663 
   7664               if (subopc == 2 /*SRL*/ && opc == 0x71)
   7665                   SHIFT_BY_IMM("psrlw", Iop_ShrN16x4);
   7666          else if (subopc == 2 /*SRL*/ && opc == 0x72)
   7667                  SHIFT_BY_IMM("psrld", Iop_ShrN32x2);
   7668          else if (subopc == 2 /*SRL*/ && opc == 0x73)
   7669                  SHIFT_BY_IMM("psrlq", Iop_Shr64);
   7670 
   7671          else if (subopc == 4 /*SAR*/ && opc == 0x71)
   7672                  SHIFT_BY_IMM("psraw", Iop_SarN16x4);
   7673          else if (subopc == 4 /*SAR*/ && opc == 0x72)
   7674                  SHIFT_BY_IMM("psrad", Iop_SarN32x2);
   7675 
   7676          else if (subopc == 6 /*SHL*/ && opc == 0x71)
   7677                  SHIFT_BY_IMM("psllw", Iop_ShlN16x4);
   7678          else if (subopc == 6 /*SHL*/ && opc == 0x72)
   7679                   SHIFT_BY_IMM("pslld", Iop_ShlN32x2);
   7680          else if (subopc == 6 /*SHL*/ && opc == 0x73)
   7681                  SHIFT_BY_IMM("psllq", Iop_Shl64);
   7682 
   7683          else goto mmx_decode_failure;
   7684 
   7685 #        undef SHIFT_BY_IMM
   7686          break;
   7687       }
   7688 
   7689       case 0xF7: {
   7690          IRTemp addr    = newTemp(Ity_I64);
   7691          IRTemp regD    = newTemp(Ity_I64);
   7692          IRTemp regM    = newTemp(Ity_I64);
   7693          IRTemp mask    = newTemp(Ity_I64);
   7694          IRTemp olddata = newTemp(Ity_I64);
   7695          IRTemp newdata = newTemp(Ity_I64);
   7696 
   7697          modrm = getUChar(delta);
   7698          if (sz != 4 || (!epartIsReg(modrm)))
   7699             goto mmx_decode_failure;
   7700          delta++;
   7701 
   7702          assign( addr, handleAddrOverrides( vbi, pfx, getIReg64(R_RDI) ));
   7703          assign( regM, getMMXReg( eregLO3ofRM(modrm) ));
   7704          assign( regD, getMMXReg( gregLO3ofRM(modrm) ));
   7705          assign( mask, binop(Iop_SarN8x8, mkexpr(regM), mkU8(7)) );
   7706          assign( olddata, loadLE( Ity_I64, mkexpr(addr) ));
   7707          assign( newdata,
   7708                  binop(Iop_Or64,
   7709                        binop(Iop_And64,
   7710                              mkexpr(regD),
   7711                              mkexpr(mask) ),
   7712                        binop(Iop_And64,
   7713                              mkexpr(olddata),
   7714                              unop(Iop_Not64, mkexpr(mask)))) );
   7715          storeLE( mkexpr(addr), mkexpr(newdata) );
   7716          DIP("maskmovq %s,%s\n", nameMMXReg( eregLO3ofRM(modrm) ),
   7717                                  nameMMXReg( gregLO3ofRM(modrm) ) );
   7718          break;
   7719       }
   7720 
   7721       /* --- MMX decode failure --- */
   7722       default:
   7723       mmx_decode_failure:
   7724          *decode_ok = False;
   7725          return delta; /* ignored */
   7726 
   7727    }
   7728 
   7729    *decode_ok = True;
   7730    return delta;
   7731 }
   7732 
   7733 
   7734 /*------------------------------------------------------------*/
   7735 /*--- More misc arithmetic and other obscure insns.        ---*/
   7736 /*------------------------------------------------------------*/
   7737 
   7738 /* Generate base << amt with vacated places filled with stuff
   7739    from xtra.  amt guaranteed in 0 .. 63. */
   7740 static
   7741 IRExpr* shiftL64_with_extras ( IRTemp base, IRTemp xtra, IRTemp amt )
   7742 {
   7743    /* if   amt == 0
   7744       then base
   7745       else (base << amt) | (xtra >>u (64-amt))
   7746    */
   7747    return
   7748       IRExpr_ITE(
   7749          binop(Iop_CmpNE8, mkexpr(amt), mkU8(0)),
   7750          binop(Iop_Or64,
   7751                binop(Iop_Shl64, mkexpr(base), mkexpr(amt)),
   7752                binop(Iop_Shr64, mkexpr(xtra),
   7753                                 binop(Iop_Sub8, mkU8(64), mkexpr(amt)))
   7754                ),
   7755          mkexpr(base)
   7756       );
   7757 }
   7758 
   7759 /* Generate base >>u amt with vacated places filled with stuff
   7760    from xtra.  amt guaranteed in 0 .. 63. */
   7761 static
   7762 IRExpr* shiftR64_with_extras ( IRTemp xtra, IRTemp base, IRTemp amt )
   7763 {
   7764    /* if   amt == 0
   7765       then base
   7766       else (base >>u amt) | (xtra << (64-amt))
   7767    */
   7768    return
   7769       IRExpr_ITE(
   7770          binop(Iop_CmpNE8, mkexpr(amt), mkU8(0)),
   7771          binop(Iop_Or64,
   7772                binop(Iop_Shr64, mkexpr(base), mkexpr(amt)),
   7773                binop(Iop_Shl64, mkexpr(xtra),
   7774                                 binop(Iop_Sub8, mkU8(64), mkexpr(amt)))
   7775                ),
   7776          mkexpr(base)
   7777       );
   7778 }
   7779 
   7780 /* Double length left and right shifts.  Apparently only required in
   7781    v-size (no b- variant). */
/* Worker for SHLD/SHRD Gv,Ev.  The E operand (reg or mem) is shifted,
   with vacated bit positions filled from the G register.  Returns the
   updated instruction offset (delta). */
static
ULong dis_SHLRD_Gv_Ev ( const VexAbiInfo* vbi,
                        Prefix pfx,
                        Long delta, UChar modrm,
                        Int sz,
                        IRExpr* shift_amt,
                        Bool amt_is_literal,
                        const HChar* shift_amt_txt,
                        Bool left_shift )
{
   /* shift_amt :: Ity_I8 is the amount to shift.  shift_amt_txt is used
      for printing it.   And eip on entry points at the modrm byte. */
   Int len;
   HChar dis_buf[50];

   IRType ty     = szToITy(sz);
   IRTemp gsrc   = newTemp(ty);
   IRTemp esrc   = newTemp(ty);
   IRTemp addr   = IRTemp_INVALID;
   IRTemp tmpSH  = newTemp(Ity_I8);
   IRTemp tmpSS  = newTemp(Ity_I8);
   IRTemp tmp64  = IRTemp_INVALID;
   IRTemp res64  = IRTemp_INVALID;
   IRTemp rss64  = IRTemp_INVALID;
   IRTemp resTy  = IRTemp_INVALID;
   IRTemp rssTy  = IRTemp_INVALID;
   /* The shift count is masked to 0..63 for 8-byte operands and to
      0..31 otherwise (the 16-bit case also uses the 5-bit mask). */
   Int    mask   = sz==8 ? 63 : 31;

   vassert(sz == 2 || sz == 4 || sz == 8);

   /* The E-part is the destination; this is shifted.  The G-part
      supplies bits to be shifted into the E-part, but is not
      changed.

      If shifting left, form a double-length word with E at the top
      and G at the bottom, and shift this left.  The result is then in
      the high part.

      If shifting right, form a double-length word with G at the top
      and E at the bottom, and shift this right.  The result is then
      at the bottom.  */

   /* Fetch the operands. */

   assign( gsrc, getIRegG(sz, pfx, modrm) );

   if (epartIsReg(modrm)) {
      delta++;
      assign( esrc, getIRegE(sz, pfx, modrm) );
      DIP("sh%cd%c %s, %s, %s\n",
          ( left_shift ? 'l' : 'r' ), nameISize(sz),
          shift_amt_txt,
          nameIRegG(sz, pfx, modrm), nameIRegE(sz, pfx, modrm));
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf,
                        /* # bytes following amode */
                        amt_is_literal ? 1 : 0 );
      delta += len;
      assign( esrc, loadLE(ty, mkexpr(addr)) );
      DIP("sh%cd%c %s, %s, %s\n",
          ( left_shift ? 'l' : 'r' ), nameISize(sz),
          shift_amt_txt,
          nameIRegG(sz, pfx, modrm), dis_buf);
   }

   /* Calculate the masked shift amount (tmpSH), the masked subshift
      amount (tmpSS), the shifted value (res64) and the subshifted
      value (rss64). */

   /* tmpSS = (tmpSH - 1) & mask: the "subshift" (one-less) amount.
      The value shifted by tmpSS (rss64) is handed to the flags thunk,
      presumably so the helper can recover CF — confirm against
      setFlags_DEP1_DEP2_shift. */
   assign( tmpSH, binop(Iop_And8, shift_amt, mkU8(mask)) );
   assign( tmpSS, binop(Iop_And8,
                        binop(Iop_Sub8, mkexpr(tmpSH), mkU8(1) ),
                        mkU8(mask)));

   tmp64 = newTemp(Ity_I64);
   res64 = newTemp(Ity_I64);
   rss64 = newTemp(Ity_I64);

   if (sz == 2 || sz == 4) {

      /* G is xtra; E is data */
      /* what a freaking nightmare: */
      if (sz == 4 && left_shift) {
         assign( tmp64, binop(Iop_32HLto64, mkexpr(esrc), mkexpr(gsrc)) );
         assign( res64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)),
                       mkU8(32)) );
         assign( rss64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSS)),
                       mkU8(32)) );
      }
      else
      if (sz == 4 && !left_shift) {
         assign( tmp64, binop(Iop_32HLto64, mkexpr(gsrc), mkexpr(esrc)) );
         assign( res64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSH)) );
         assign( rss64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSS)) );
      }
      else
      if (sz == 2 && left_shift) {
         /* For 16-bit shifts the count can exceed 16, so the low part
            is padded with further copies of gsrc. */
         assign( tmp64,
                 binop(Iop_32HLto64,
                       binop(Iop_16HLto32, mkexpr(esrc), mkexpr(gsrc)),
                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc))
         ));
         /* result formed by shifting [esrc'gsrc'gsrc'gsrc] */
         assign( res64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)),
                       mkU8(48)) );
         /* subshift formed by shifting [esrc'0000'0000'0000] */
         assign( rss64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64,
                             binop(Iop_Shl64, unop(Iop_16Uto64, mkexpr(esrc)),
                                              mkU8(48)),
                             mkexpr(tmpSS)),
                       mkU8(48)) );
      }
      else
      if (sz == 2 && !left_shift) {
         assign( tmp64,
                 binop(Iop_32HLto64,
                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc)),
                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(esrc))
         ));
         /* result formed by shifting [gsrc'gsrc'gsrc'esrc] */
         assign( res64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSH)) );
         /* subshift formed by shifting [0000'0000'0000'esrc] */
         assign( rss64, binop(Iop_Shr64,
                              unop(Iop_16Uto64, mkexpr(esrc)),
                              mkexpr(tmpSS)) );
      }

   } else {

      /* 64-bit case: no double-length value fits in an I64, so use
         the dedicated helpers instead. */
      vassert(sz == 8);
      if (left_shift) {
         assign( res64, shiftL64_with_extras( esrc, gsrc, tmpSH ));
         assign( rss64, shiftL64_with_extras( esrc, gsrc, tmpSS ));
      } else {
         assign( res64, shiftR64_with_extras( gsrc, esrc, tmpSH ));
         assign( rss64, shiftR64_with_extras( gsrc, esrc, tmpSS ));
      }

   }

   resTy = newTemp(ty);
   rssTy = newTemp(ty);
   assign( resTy, narrowTo(ty, mkexpr(res64)) );
   assign( rssTy, narrowTo(ty, mkexpr(rss64)) );

   /* Put result back and write the flags thunk. */
   /* NOTE(review): the right-shift case records Iop_Sar64 in the
      thunk rather than Iop_Shr64 — presumably the flags helper treats
      both right-shift tags identically; confirm. */
   setFlags_DEP1_DEP2_shift ( left_shift ? Iop_Shl64 : Iop_Sar64,
                              resTy, rssTy, ty, tmpSH );

   if (epartIsReg(modrm)) {
      putIRegE(sz, pfx, modrm, mkexpr(resTy));
   } else {
      storeLE( mkexpr(addr), mkexpr(resTy) );
   }

   /* Skip over the imm8 shift-amount byte, if there was one. */
   if (amt_is_literal) delta++;
   return delta;
}
   7948 
   7949 
   7950 /* Handle BT/BTS/BTR/BTC Gv, Ev.  Apparently b-size is not
   7951    required. */
   7952 
   7953 typedef enum { BtOpNone, BtOpSet, BtOpReset, BtOpComp } BtOp;
   7954 
   7955 static const HChar* nameBtOp ( BtOp op )
   7956 {
   7957    switch (op) {
   7958       case BtOpNone:  return "";
   7959       case BtOpSet:   return "s";
   7960       case BtOpReset: return "r";
   7961       case BtOpComp:  return "c";
   7962       default: vpanic("nameBtOp(amd64)");
   7963    }
   7964 }
   7965 
   7966 
/* Worker for BT/BTS/BTR/BTC Gv,Ev.  Reduces both the register and
   memory forms to a byte-level memory access; for the register form
   the value is first spilled onto the client stack (see the kludge
   comment below).  Sets *decode_OK False and bails out on invalid
   prefix combinations.  Returns the updated delta. */
static
ULong dis_bt_G_E ( const VexAbiInfo* vbi,
                   Prefix pfx, Int sz, Long delta, BtOp op,
                   /*OUT*/Bool* decode_OK )
{
   HChar  dis_buf[50];
   UChar  modrm;
   Int    len;
   IRTemp t_fetched, t_bitno0, t_bitno1, t_bitno2, t_addr0,
          t_addr1, t_rsp, t_mask, t_new;

   vassert(sz == 2 || sz == 4 || sz == 8);

   t_fetched = t_bitno0 = t_bitno1 = t_bitno2
             = t_addr0 = t_addr1 = t_rsp
             = t_mask = t_new = IRTemp_INVALID;

   t_fetched = newTemp(Ity_I8);
   t_new     = newTemp(Ity_I8);
   t_bitno0  = newTemp(Ity_I64);
   t_bitno1  = newTemp(Ity_I64);
   t_bitno2  = newTemp(Ity_I8);
   t_addr1   = newTemp(Ity_I64);
   modrm     = getUChar(delta);

   *decode_OK = True;
   if (epartIsReg(modrm)) {
      /* F2 and F3 are never acceptable. */
      if (haveF2orF3(pfx)) {
         *decode_OK = False;
         return delta;
      }
   } else {
      /* F2 or F3 (but not both) are allowed, provided LOCK is also
         present, and only for the BTC/BTS/BTR cases (not BT). */
      if (haveF2orF3(pfx)) {
         if (haveF2andF3(pfx) || !haveLOCK(pfx) || op == BtOpNone) {
            *decode_OK = False;
            return delta;
         }
      }
   }

   /* Signed widening: for the memory form the bit index may be
      negative, addressing bytes below the base address. */
   assign( t_bitno0, widenSto64(getIRegG(sz, pfx, modrm)) );

   if (epartIsReg(modrm)) {
      delta++;
      /* Get it onto the client's stack.  Oh, this is a horrible
         kludge.  See https://bugs.kde.org/show_bug.cgi?id=245925.
         Because of the ELF ABI stack redzone, there may be live data
         up to 128 bytes below %RSP.  So we can't just push it on the
         stack, else we may wind up trashing live data, and causing
         impossible-to-find simulation errors.  (Yes, this did
         happen.)  So we need to drop RSP before at least 128 before
         pushing it.  That unfortunately means hitting Memcheck's
         fast-case painting code.  Ideally we should drop more than
         128, to reduce the chances of breaking buggy programs that
         have live data below -128(%RSP).  Memcheck fast-cases moves
         of 288 bytes due to the need to handle ppc64-linux quickly,
         so let's use 288.  Of course the real fix is to get rid of
         this kludge entirely.  */
      t_rsp = newTemp(Ity_I64);
      t_addr0 = newTemp(Ity_I64);

      vassert(vbi->guest_stack_redzone_size == 128);
      assign( t_rsp, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(288)) );
      putIReg64(R_RSP, mkexpr(t_rsp));

      storeLE( mkexpr(t_rsp), getIRegE(sz, pfx, modrm) );

      /* Make t_addr0 point at it. */
      assign( t_addr0, mkexpr(t_rsp) );

      /* Mask out upper bits of the shift amount, since we're doing a
         reg. */
      assign( t_bitno1, binop(Iop_And64,
                              mkexpr(t_bitno0),
                              mkU64(sz == 8 ? 63 : sz == 4 ? 31 : 15)) );

   } else {
      t_addr0 = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
      delta += len;
      assign( t_bitno1, mkexpr(t_bitno0) );
   }

   /* At this point: t_addr0 is the address being operated on.  If it
      was a reg, we will have pushed it onto the client's stack.
      t_bitno1 is the bit number, suitably masked in the case of a
      reg.  */

   /* Now the main sequence. */
   /* Arithmetic shift, so a negative bit number addresses bytes
      below t_addr0. */
   assign( t_addr1,
           binop(Iop_Add64,
                 mkexpr(t_addr0),
                 binop(Iop_Sar64, mkexpr(t_bitno1), mkU8(3))) );

   /* t_addr1 now holds effective address */

   assign( t_bitno2,
           unop(Iop_64to8,
                binop(Iop_And64, mkexpr(t_bitno1), mkU64(7))) );

   /* t_bitno2 contains offset of bit within byte */

   if (op != BtOpNone) {
      t_mask = newTemp(Ity_I8);
      assign( t_mask, binop(Iop_Shl8, mkU8(1), mkexpr(t_bitno2)) );
   }

   /* t_mask is now a suitable byte mask */

   assign( t_fetched, loadLE(Ity_I8, mkexpr(t_addr1)) );

   if (op != BtOpNone) {
      switch (op) {
         case BtOpSet:
            assign( t_new,
                    binop(Iop_Or8, mkexpr(t_fetched), mkexpr(t_mask)) );
            break;
         case BtOpComp:
            assign( t_new,
                    binop(Iop_Xor8, mkexpr(t_fetched), mkexpr(t_mask)) );
            break;
         case BtOpReset:
            assign( t_new,
                    binop(Iop_And8, mkexpr(t_fetched),
                                    unop(Iop_Not8, mkexpr(t_mask))) );
            break;
         default:
            vpanic("dis_bt_G_E(amd64)");
      }
      /* Only the locked memory form needs a CAS; the register form
         is writing to the private on-stack copy, and the unlocked
         memory form needs no atomicity. */
      if ((haveLOCK(pfx)) && !epartIsReg(modrm)) {
         casLE( mkexpr(t_addr1), mkexpr(t_fetched)/*expd*/,
                                 mkexpr(t_new)/*new*/,
                                 guest_RIP_curr_instr );
      } else {
         storeLE( mkexpr(t_addr1), mkexpr(t_new) );
      }
   }

   /* Side effect done; now get selected bit into Carry flag */
   /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop(Iop_And64,
                  binop(Iop_Shr64,
                        unop(Iop_8Uto64, mkexpr(t_fetched)),
                        mkexpr(t_bitno2)),
                  mkU64(1)))
       );
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

   /* Move reg operand from stack back to reg */
   if (epartIsReg(modrm)) {
      /* t_rsp still points at it. */
      /* only write the reg if actually modifying it; doing otherwise
         zeroes the top half erroneously when doing btl due to
         standard zero-extend rule */
      if (op != BtOpNone)
         putIRegE(sz, pfx, modrm, loadLE(szToITy(sz), mkexpr(t_rsp)) );
      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t_rsp), mkU64(288)) );
   }

   DIP("bt%s%c %s, %s\n",
       nameBtOp(op), nameISize(sz), nameIRegG(sz, pfx, modrm),
       ( epartIsReg(modrm) ? nameIRegE(sz, pfx, modrm) : dis_buf ) );

   return delta;
}
   8140 
   8141 
   8142 
   8143 /* Handle BSF/BSR.  Only v-size seems necessary. */
static
ULong dis_bs_E_G ( const VexAbiInfo* vbi,
                   Prefix pfx, Int sz, Long delta, Bool fwds )
{
   /* Worker for BSF/BSR.  fwds==True => BSF (scan from the lsb end,
      Ctz); fwds==False => BSR (scan from the msb end, Clz).  Returns
      the updated delta. */
   Bool   isReg;
   UChar  modrm;
   HChar  dis_buf[50];

   IRType ty    = szToITy(sz);
   IRTemp src   = newTemp(ty);
   IRTemp dst   = newTemp(ty);
   IRTemp src64 = newTemp(Ity_I64);
   IRTemp dst64 = newTemp(Ity_I64);
   IRTemp srcB  = newTemp(Ity_I1);

   vassert(sz == 8 || sz == 4 || sz == 2);

   modrm = getUChar(delta);
   isReg = epartIsReg(modrm);
   if (isReg) {
      delta++;
      assign( src, getIRegE(sz, pfx, modrm) );
   } else {
      Int    len;
      IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
      delta += len;
      assign( src, loadLE(ty, mkexpr(addr)) );
   }

   DIP("bs%c%c %s, %s\n",
       fwds ? 'f' : 'r', nameISize(sz),
       ( isReg ? nameIRegE(sz, pfx, modrm) : dis_buf ),
       nameIRegG(sz, pfx, modrm));

   /* First, widen src to 64 bits if it is not already. */
   assign( src64, widenUto64(mkexpr(src)) );

   /* Generate a bool expression which is zero iff the original is
      zero, and nonzero otherwise.  Ask for a CmpNE version which, if
      instrumented by Memcheck, is instrumented expensively, since
      this may be used on the output of a preceding movmskb insn,
      which has been known to be partially defined, and in need of
      careful handling. */
   assign( srcB, binop(Iop_ExpCmpNE64, mkexpr(src64), mkU64(0)) );

   /* Flags: Z is 1 iff source value is zero.  All others
      are undefined -- we force them to zero. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            IRExpr_ITE( mkexpr(srcB),
                        /* src!=0 */
                        mkU64(0),
                        /* src==0 */
                        mkU64(AMD64G_CC_MASK_Z)
                        )
       ));
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

   /* Result: iff source value is zero, we can't use
      Iop_Clz64/Iop_Ctz64 as they have no defined result in that case.
      But anyway, amd64 semantics say the result is undefined in
      such situations.  Hence handle the zero case specially. */

   /* Bleh.  What we compute:

          bsf64:  if src == 0 then {dst is unchanged}
                              else Ctz64(src)

          bsr64:  if src == 0 then {dst is unchanged}
                              else 63 - Clz64(src)

          bsf32:  if src == 0 then {dst is unchanged}
                              else Ctz64(32Uto64(src))

          bsr32:  if src == 0 then {dst is unchanged}
                              else 63 - Clz64(32Uto64(src))

          bsf16:  if src == 0 then {dst is unchanged}
                              else Ctz64(32Uto64(16Uto32(src)))

          bsr16:  if src == 0 then {dst is unchanged}
                              else 63 - Clz64(32Uto64(16Uto32(src)))
   */

   /* The main computation, guarding against zero. */
   /* Note: the zero-input widths above work out because src64 was
      zero-extended, so Clz64/Ctz64 of the widened value give the
      same answer as the narrower operation would. */
   assign( dst64,
           IRExpr_ITE(
              mkexpr(srcB),
              /* src != 0 */
              fwds ? unop(Iop_Ctz64, mkexpr(src64))
                   : binop(Iop_Sub64,
                           mkU64(63),
                           unop(Iop_Clz64, mkexpr(src64))),
              /* src == 0 -- leave dst unchanged */
              widenUto64( getIRegG( sz, pfx, modrm ) )
           )
         );

   if (sz == 2)
      assign( dst, unop(Iop_64to16, mkexpr(dst64)) );
   else
   if (sz == 4)
      assign( dst, unop(Iop_64to32, mkexpr(dst64)) );
   else
      assign( dst, mkexpr(dst64) );

   /* dump result back */
   putIRegG( sz, pfx, modrm, mkexpr(dst) );

   return delta;
}
   8259 
   8260 
   8261 /* swap rAX with the reg specified by reg and REX.B */
   8262 static
   8263 void codegen_xchg_rAX_Reg ( Prefix pfx, Int sz, UInt regLo3 )
   8264 {
   8265    IRType ty = szToITy(sz);
   8266    IRTemp t1 = newTemp(ty);
   8267    IRTemp t2 = newTemp(ty);
   8268    vassert(sz == 2 || sz == 4 || sz == 8);
   8269    vassert(regLo3 < 8);
   8270    if (sz == 8) {
   8271       assign( t1, getIReg64(R_RAX) );
   8272       assign( t2, getIRegRexB(8, pfx, regLo3) );
   8273       putIReg64( R_RAX, mkexpr(t2) );
   8274       putIRegRexB(8, pfx, regLo3, mkexpr(t1) );
   8275    } else if (sz == 4) {
   8276       assign( t1, getIReg32(R_RAX) );
   8277       assign( t2, getIRegRexB(4, pfx, regLo3) );
   8278       putIReg32( R_RAX, mkexpr(t2) );
   8279       putIRegRexB(4, pfx, regLo3, mkexpr(t1) );
   8280    } else {
   8281       assign( t1, getIReg16(R_RAX) );
   8282       assign( t2, getIRegRexB(2, pfx, regLo3) );
   8283       putIReg16( R_RAX, mkexpr(t2) );
   8284       putIRegRexB(2, pfx, regLo3, mkexpr(t1) );
   8285    }
   8286    DIP("xchg%c %s, %s\n",
   8287        nameISize(sz), nameIRegRAX(sz),
   8288                       nameIRegRexB(sz,pfx, regLo3));
   8289 }
   8290 
   8291 
   8292 static
   8293 void codegen_SAHF ( void )
   8294 {
   8295    /* Set the flags to:
   8296       (amd64g_calculate_flags_all() & AMD64G_CC_MASK_O)
   8297                                     -- retain the old O flag
   8298       | (%AH & (AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
   8299                 |AMD64G_CC_MASK_P|AMD64G_CC_MASK_C)
   8300    */
   8301    ULong  mask_SZACP = AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
   8302                        |AMD64G_CC_MASK_C|AMD64G_CC_MASK_P;
   8303    IRTemp oldflags   = newTemp(Ity_I64);
   8304    assign( oldflags, mk_amd64g_calculate_rflags_all() );
   8305    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   8306    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   8307    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   8308    stmt( IRStmt_Put( OFFB_CC_DEP1,
   8309          binop(Iop_Or64,
   8310                binop(Iop_And64, mkexpr(oldflags), mkU64(AMD64G_CC_MASK_O)),
   8311                binop(Iop_And64,
   8312                      binop(Iop_Shr64, getIReg64(R_RAX), mkU8(8)),
   8313                      mkU64(mask_SZACP))
   8314               )
   8315    ));
   8316 }
   8317 
   8318 
   8319 static
   8320 void codegen_LAHF ( void  )
   8321 {
   8322    /* AH <- EFLAGS(SF:ZF:0:AF:0:PF:1:CF) */
   8323    IRExpr* rax_with_hole;
   8324    IRExpr* new_byte;
   8325    IRExpr* new_rax;
   8326    ULong   mask_SZACP = AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
   8327                         |AMD64G_CC_MASK_C|AMD64G_CC_MASK_P;
   8328 
   8329    IRTemp  flags = newTemp(Ity_I64);
   8330    assign( flags, mk_amd64g_calculate_rflags_all() );
   8331 
   8332    rax_with_hole
   8333       = binop(Iop_And64, getIReg64(R_RAX), mkU64(~0xFF00ULL));
   8334    new_byte
   8335       = binop(Iop_Or64, binop(Iop_And64, mkexpr(flags), mkU64(mask_SZACP)),
   8336                         mkU64(1<<1));
   8337    new_rax
   8338       = binop(Iop_Or64, rax_with_hole,
   8339                         binop(Iop_Shl64, new_byte, mkU8(8)));
   8340    putIReg64(R_RAX, new_rax);
   8341 }
   8342 
   8343 
/* Worker for CMPXCHG G,E: compare rAX with E; if equal, set ZF and
   E := G, else clear ZF and rAX := E.  Sets *ok to False (and emits
   nothing further) on unacceptable prefix combinations.  Returns the
   updated delta0. */
static
ULong dis_cmpxchg_G_E ( /*OUT*/Bool* ok,
                        const VexAbiInfo*  vbi,
                        Prefix       pfx,
                        Int          size,
                        Long         delta0 )
{
   HChar dis_buf[50];
   Int   len;

   IRType ty    = szToITy(size);
   IRTemp acc   = newTemp(ty);
   IRTemp src   = newTemp(ty);
   IRTemp dest  = newTemp(ty);
   IRTemp dest2 = newTemp(ty);
   IRTemp acc2  = newTemp(ty);
   IRTemp cond  = newTemp(Ity_I1);
   IRTemp addr  = IRTemp_INVALID;
   UChar  rm    = getUChar(delta0);

   /* There are 3 cases to consider:

      reg-reg: ignore any lock prefix, generate sequence based
               on ITE

      reg-mem, not locked: ignore any lock prefix, generate sequence
                           based on ITE

      reg-mem, locked: use IRCAS
   */

   /* Decide whether F2 or F3 are acceptable.  Never for register
      case, but for the memory case, one or the other is OK provided
      LOCK is also present. */
   if (epartIsReg(rm)) {
      if (haveF2orF3(pfx)) {
         *ok = False;
         return delta0;
      }
   } else {
      if (haveF2orF3(pfx)) {
         if (haveF2andF3(pfx) || !haveLOCK(pfx)) {
            *ok = False;
            return delta0;
         }
      }
   }

   if (epartIsReg(rm)) {
      /* case 1 */
      /* src is the new value (G), acc the expected value (rAX), dest
         the current value of E. */
      assign( dest, getIRegE(size, pfx, rm) );
      delta0++;
      assign( src, getIRegG(size, pfx, rm) );
      assign( acc, getIRegRAX(size) );
      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
      assign( cond, mk_amd64g_calculate_condition(AMD64CondZ) );
      assign( dest2, IRExpr_ITE(mkexpr(cond), mkexpr(src), mkexpr(dest)) );
      /* On success rAX is rewritten with its own value, so there is
         no visible change. */
      assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
      putIRegRAX(size, mkexpr(acc2));
      putIRegE(size, pfx, rm, mkexpr(dest2));
      DIP("cmpxchg%c %s,%s\n", nameISize(size),
                               nameIRegG(size,pfx,rm),
                               nameIRegE(size,pfx,rm) );
   }
   else if (!epartIsReg(rm) && !haveLOCK(pfx)) {
      /* case 2 */
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign( dest, loadLE(ty, mkexpr(addr)) );
      delta0 += len;
      assign( src, getIRegG(size, pfx, rm) );
      assign( acc, getIRegRAX(size) );
      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
      assign( cond, mk_amd64g_calculate_condition(AMD64CondZ) );
      assign( dest2, IRExpr_ITE(mkexpr(cond), mkexpr(src), mkexpr(dest)) );
      assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
      putIRegRAX(size, mkexpr(acc2));
      storeLE( mkexpr(addr), mkexpr(dest2) );
      DIP("cmpxchg%c %s,%s\n", nameISize(size),
                               nameIRegG(size,pfx,rm), dis_buf);
   }
   else if (!epartIsReg(rm) && haveLOCK(pfx)) {
      /* case 3 */
      /* src is new value.  acc is expected value.  dest is old value.
         Compute success from the output of the IRCAS, and steer the
         new value for RAX accordingly: in case of success, RAX is
         unchanged. */
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      delta0 += len;
      assign( src, getIRegG(size, pfx, rm) );
      assign( acc, getIRegRAX(size) );
      /* The CAS atomically writes src only if memory still holds acc;
         dest receives the old memory value either way. */
      stmt( IRStmt_CAS(
         mkIRCAS( IRTemp_INVALID, dest, Iend_LE, mkexpr(addr),
                  NULL, mkexpr(acc), NULL, mkexpr(src) )
      ));
      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
      assign( cond, mk_amd64g_calculate_condition(AMD64CondZ) );
      assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
      putIRegRAX(size, mkexpr(acc2));
      DIP("cmpxchg%c %s,%s\n", nameISize(size),
                               nameIRegG(size,pfx,rm), dis_buf);
   }
   else vassert(0);

   *ok = True;
   return delta0;
}
   8450 
   8451 
   8452 /* Handle conditional move instructions of the form
   8453       cmovcc E(reg-or-mem), G(reg)
   8454 
   8455    E(src) is reg-or-mem
   8456    G(dst) is reg.
   8457 
   8458    If E is reg, -->    GET %E, tmps
   8459                        GET %G, tmpd
   8460                        CMOVcc tmps, tmpd
   8461                        PUT tmpd, %G
   8462 
   8463    If E is mem  -->    (getAddr E) -> tmpa
   8464                        LD (tmpa), tmps
   8465                        GET %G, tmpd
   8466                        CMOVcc tmps, tmpd
   8467                        PUT tmpd, %G
   8468 */
   8469 static
   8470 ULong dis_cmov_E_G ( const VexAbiInfo* vbi,
   8471                      Prefix        pfx,
   8472                      Int           sz,
   8473                      AMD64Condcode cond,
   8474                      Long          delta0 )
   8475 {
   8476    UChar rm  = getUChar(delta0);
   8477    HChar dis_buf[50];
   8478    Int   len;
   8479 
   8480    IRType ty   = szToITy(sz);
   8481    IRTemp tmps = newTemp(ty);
   8482    IRTemp tmpd = newTemp(ty);
   8483 
   8484    if (epartIsReg(rm)) {
   8485       assign( tmps, getIRegE(sz, pfx, rm) );
   8486       assign( tmpd, getIRegG(sz, pfx, rm) );
   8487 
   8488       putIRegG( sz, pfx, rm,
   8489                 IRExpr_ITE( mk_amd64g_calculate_condition(cond),
   8490                             mkexpr(tmps),
   8491                             mkexpr(tmpd) )
   8492               );
   8493       DIP("cmov%s %s,%s\n", name_AMD64Condcode(cond),
   8494                             nameIRegE(sz,pfx,rm),
   8495                             nameIRegG(sz,pfx,rm));
   8496       return 1+delta0;
   8497    }
   8498 
   8499    /* E refers to memory */
   8500    {
   8501       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   8502       assign( tmps, loadLE(ty, mkexpr(addr)) );
   8503       assign( tmpd, getIRegG(sz, pfx, rm) );
   8504 
   8505       putIRegG( sz, pfx, rm,
   8506                 IRExpr_ITE( mk_amd64g_calculate_condition(cond),
   8507                             mkexpr(tmps),
   8508                             mkexpr(tmpd) )
   8509               );
   8510 
   8511       DIP("cmov%s %s,%s\n", name_AMD64Condcode(cond),
   8512                             dis_buf,
   8513                             nameIRegG(sz,pfx,rm));
   8514       return len+delta0;
   8515    }
   8516 }
   8517 
   8518 
/* Handle xadd G,E: the old value of E is copied to G, and the sum of
   the two original values is written back to E (register or memory).
   Sets *decode_ok to True on every reached path and returns the delta
   advanced past the ModRM/amode bytes. */
static
ULong dis_xadd_G_E ( /*OUT*/Bool* decode_ok,
                     const VexAbiInfo* vbi,
                     Prefix pfx, Int sz, Long delta0 )
{
   Int   len;
   UChar rm = getUChar(delta0);
   HChar dis_buf[50];

   IRType ty    = szToITy(sz);
   IRTemp tmpd  = newTemp(ty);   /* old value of E (the destination) */
   IRTemp tmpt0 = newTemp(ty);   /* old value of G */
   IRTemp tmpt1 = newTemp(ty);   /* the sum, to be written back to E */

   /* There are 3 cases to consider:

      reg-reg: ignore any lock prefix,
               generate 'naive' (non-atomic) sequence

      reg-mem, not locked: ignore any lock prefix, generate 'naive'
                           (non-atomic) sequence

      reg-mem, locked: use IRCAS
   */

   if (epartIsReg(rm)) {
      /* case 1 */
      assign( tmpd, getIRegE(sz, pfx, rm) );
      assign( tmpt0, getIRegG(sz, pfx, rm) );
      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
                           mkexpr(tmpd), mkexpr(tmpt0)) );
      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
      /* G gets E's old value; E gets the sum. */
      putIRegG(sz, pfx, rm, mkexpr(tmpd));
      putIRegE(sz, pfx, rm, mkexpr(tmpt1));
      DIP("xadd%c %s, %s\n",
          nameISize(sz), nameIRegG(sz,pfx,rm), nameIRegE(sz,pfx,rm));
      *decode_ok = True;
      return 1+delta0;
   }
   else if (!epartIsReg(rm) && !haveLOCK(pfx)) {
      /* case 2 */
      IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign( tmpd,  loadLE(ty, mkexpr(addr)) );
      assign( tmpt0, getIRegG(sz, pfx, rm) );
      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
                           mkexpr(tmpd), mkexpr(tmpt0)) );
      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
      /* Plain (non-atomic) store of the sum, then update G. */
      storeLE( mkexpr(addr), mkexpr(tmpt1) );
      putIRegG(sz, pfx, rm, mkexpr(tmpd));
      DIP("xadd%c %s, %s\n",
          nameISize(sz), nameIRegG(sz,pfx,rm), dis_buf);
      *decode_ok = True;
      return len+delta0;
   }
   else if (!epartIsReg(rm) && haveLOCK(pfx)) {
      /* case 3 */
      IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign( tmpd,  loadLE(ty, mkexpr(addr)) );
      assign( tmpt0, getIRegG(sz, pfx, rm) );
      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
                           mkexpr(tmpd), mkexpr(tmpt0)) );
      /* Compare-and-swap: expected old value tmpd, new value tmpt1. */
      casLE( mkexpr(addr), mkexpr(tmpd)/*expVal*/,
                           mkexpr(tmpt1)/*newVal*/, guest_RIP_curr_instr );
      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
      putIRegG(sz, pfx, rm, mkexpr(tmpd));
      DIP("xadd%c %s, %s\n",
          nameISize(sz), nameIRegG(sz,pfx,rm), dis_buf);
      *decode_ok = True;
      return len+delta0;
   }
   /*UNREACHED*/
   vassert(0);
}
   8592 
   8593 //.. /* Move 16 bits from Ew (ireg or mem) to G (a segment register). */
   8594 //..
   8595 //.. static
   8596 //.. UInt dis_mov_Ew_Sw ( UChar sorb, Long delta0 )
   8597 //.. {
   8598 //..    Int    len;
   8599 //..    IRTemp addr;
   8600 //..    UChar  rm  = getUChar(delta0);
   8601 //..    HChar  dis_buf[50];
   8602 //..
   8603 //..    if (epartIsReg(rm)) {
   8604 //..       putSReg( gregOfRM(rm), getIReg(2, eregOfRM(rm)) );
   8605 //..       DIP("movw %s,%s\n", nameIReg(2,eregOfRM(rm)), nameSReg(gregOfRM(rm)));
   8606 //..       return 1+delta0;
   8607 //..    } else {
   8608 //..       addr = disAMode ( &len, sorb, delta0, dis_buf );
   8609 //..       putSReg( gregOfRM(rm), loadLE(Ity_I16, mkexpr(addr)) );
   8610 //..       DIP("movw %s,%s\n", dis_buf, nameSReg(gregOfRM(rm)));
   8611 //..       return len+delta0;
   8612 //..    }
   8613 //.. }
   8614 //..
   8615 //.. /* Move 16 bits from G (a segment register) to Ew (ireg or mem).  If
   8616 //..    dst is ireg and sz==4, zero out top half of it.  */
   8617 //..
   8618 //.. static
   8619 //.. UInt dis_mov_Sw_Ew ( UChar sorb,
   8620 //..                      Int   sz,
   8621 //..                      UInt  delta0 )
   8622 //.. {
   8623 //..    Int    len;
   8624 //..    IRTemp addr;
   8625 //..    UChar  rm  = getUChar(delta0);
   8626 //..    HChar  dis_buf[50];
   8627 //..
   8628 //..    vassert(sz == 2 || sz == 4);
   8629 //..
   8630 //..    if (epartIsReg(rm)) {
   8631 //..       if (sz == 4)
   8632 //..          putIReg(4, eregOfRM(rm), unop(Iop_16Uto32, getSReg(gregOfRM(rm))));
   8633 //..       else
   8634 //..          putIReg(2, eregOfRM(rm), getSReg(gregOfRM(rm)));
   8635 //..
   8636 //..       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), nameIReg(sz,eregOfRM(rm)));
   8637 //..       return 1+delta0;
   8638 //..    } else {
   8639 //..       addr = disAMode ( &len, sorb, delta0, dis_buf );
   8640 //..       storeLE( mkexpr(addr), getSReg(gregOfRM(rm)) );
   8641 //..       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), dis_buf);
   8642 //..       return len+delta0;
   8643 //..    }
   8644 //.. }
   8645 
   8646 /* Handle move instructions of the form
   8647       mov S, E  meaning
   8648       mov sreg, reg-or-mem
   8649    Is passed the a ptr to the modRM byte, and the data size.  Returns
   8650    the address advanced completely over this instruction.
   8651 
   8652    VEX does not currently simulate segment registers on AMD64 which means that
   8653    instead of moving a value of a segment register, zero is moved to the
   8654    destination.  The zero value represents a null (unused) selector.  This is
   8655    not correct (especially for the %cs, %fs and %gs registers) but it seems to
   8656    provide a sufficient simulation for currently seen programs that use this
   8657    instruction.  If some program actually decides to use the obtained segment
   8658    selector for something meaningful then the zero value should be a clear
   8659    indicator that there is some problem.
   8660 
   8661    S(src) is sreg.
   8662    E(dst) is reg-or-mem
   8663 
   8664    If E is reg, -->    PUT $0, %E
   8665 
   8666    If E is mem, -->    (getAddr E) -> tmpa
   8667                        ST $0, (tmpa)
   8668 */
   8669 static
   8670 ULong dis_mov_S_E ( const VexAbiInfo* vbi,
   8671                     Prefix      pfx,
   8672                     Int         size,
   8673                     Long        delta0 )
   8674 {
   8675    Int   len;
   8676    UChar rm = getUChar(delta0);
   8677    HChar dis_buf[50];
   8678 
   8679    if (epartIsReg(rm)) {
   8680       putIRegE(size, pfx, rm, mkU(szToITy(size), 0));
   8681       DIP("mov %s,%s\n", nameSReg(gregOfRexRM(pfx, rm)),
   8682                          nameIRegE(size, pfx, rm));
   8683       return 1+delta0;
   8684    }
   8685 
   8686    /* E refers to memory */
   8687    {
   8688       IRTemp addr = disAMode(&len, vbi, pfx, delta0, dis_buf, 0);
   8689       storeLE(mkexpr(addr), mkU16(0));
   8690       DIP("mov %s,%s\n", nameSReg(gregOfRexRM(pfx, rm)),
   8691                          dis_buf);
   8692       return len+delta0;
   8693    }
   8694 }
   8695 
   8696 //.. static
   8697 //.. void dis_push_segreg ( UInt sreg, Int sz )
   8698 //.. {
   8699 //..     IRTemp t1 = newTemp(Ity_I16);
   8700 //..     IRTemp ta = newTemp(Ity_I32);
   8701 //..     vassert(sz == 2 || sz == 4);
   8702 //..
   8703 //..     assign( t1, getSReg(sreg) );
   8704 //..     assign( ta, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)) );
   8705 //..     putIReg(4, R_ESP, mkexpr(ta));
   8706 //..     storeLE( mkexpr(ta), mkexpr(t1) );
   8707 //..
   8708 //..     DIP("pushw %s\n", nameSReg(sreg));
   8709 //.. }
   8710 //..
   8711 //.. static
   8712 //.. void dis_pop_segreg ( UInt sreg, Int sz )
   8713 //.. {
   8714 //..     IRTemp t1 = newTemp(Ity_I16);
   8715 //..     IRTemp ta = newTemp(Ity_I32);
   8716 //..     vassert(sz == 2 || sz == 4);
   8717 //..
   8718 //..     assign( ta, getIReg(4, R_ESP) );
   8719 //..     assign( t1, loadLE(Ity_I16, mkexpr(ta)) );
   8720 //..
   8721 //..     putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(ta), mkU32(sz)) );
   8722 //..     putSReg( sreg, mkexpr(t1) );
   8723 //..     DIP("pop %s\n", nameSReg(sreg));
   8724 //.. }
   8725 
   8726 static
   8727 void dis_ret ( /*MOD*/DisResult* dres, const VexAbiInfo* vbi, ULong d64 )
   8728 {
   8729    IRTemp t1 = newTemp(Ity_I64);
   8730    IRTemp t2 = newTemp(Ity_I64);
   8731    IRTemp t3 = newTemp(Ity_I64);
   8732    assign(t1, getIReg64(R_RSP));
   8733    assign(t2, loadLE(Ity_I64,mkexpr(t1)));
   8734    assign(t3, binop(Iop_Add64, mkexpr(t1), mkU64(8+d64)));
   8735    putIReg64(R_RSP, mkexpr(t3));
   8736    make_redzone_AbiHint(vbi, t3, t2/*nia*/, "ret");
   8737    jmp_treg(dres, Ijk_Ret, t2);
   8738    vassert(dres->whatNext == Dis_StopHere);
   8739 }
   8740 
   8741 
   8742 /*------------------------------------------------------------*/
   8743 /*--- SSE/SSE2/SSE3 helpers                                ---*/
   8744 /*------------------------------------------------------------*/
   8745 
   8746 /* Indicates whether the op requires a rounding-mode argument.  Note
   8747    that this covers only vector floating point arithmetic ops, and
   8748    omits the scalar ones that need rounding modes.  Note also that
   8749    inconsistencies here will get picked up later by the IR sanity
   8750    checker, so this isn't correctness-critical. */
   8751 static Bool requiresRMode ( IROp op )
   8752 {
   8753    switch (op) {
   8754       /* 128 bit ops */
   8755       case Iop_Add32Fx4: case Iop_Sub32Fx4:
   8756       case Iop_Mul32Fx4: case Iop_Div32Fx4:
   8757       case Iop_Add64Fx2: case Iop_Sub64Fx2:
   8758       case Iop_Mul64Fx2: case Iop_Div64Fx2:
   8759       /* 256 bit ops */
   8760       case Iop_Add32Fx8: case Iop_Sub32Fx8:
   8761       case Iop_Mul32Fx8: case Iop_Div32Fx8:
   8762       case Iop_Add64Fx4: case Iop_Sub64Fx4:
   8763       case Iop_Mul64Fx4: case Iop_Div64Fx4:
   8764          return True;
   8765       default:
   8766          break;
   8767    }
   8768    return False;
   8769 }
   8770 
   8771 
   8772 /* Worker function; do not call directly.
   8773    Handles full width G = G `op` E   and   G = (not G) `op` E.
   8774 */
   8775 
static ULong dis_SSE_E_to_G_all_wrk (
                const VexAbiInfo* vbi,
                Prefix pfx, Long delta,
                const HChar* opname, IROp op,
                Bool   invertG
             )
{
   HChar   dis_buf[50];
   Int     alen;
   IRTemp  addr;
   UChar   rm = getUChar(delta);
   /* Vector FP arith ops take a rounding mode, faked up below. */
   Bool    needsRMode = requiresRMode(op);
   /* The G operand, optionally bitwise-inverted first. */
   IRExpr* gpart
      = invertG ? unop(Iop_NotV128, getXMMReg(gregOfRexRM(pfx,rm)))
                : getXMMReg(gregOfRexRM(pfx,rm));
   if (epartIsReg(rm)) {
      /* E is a register: G = gpart `op` E, result back in G. */
      putXMMReg(
         gregOfRexRM(pfx,rm),
         needsRMode
            ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        gpart,
                        getXMMReg(eregOfRexRM(pfx,rm)))
            : binop(op, gpart,
                        getXMMReg(eregOfRexRM(pfx,rm)))
      );
      DIP("%s %s,%s\n", opname,
                        nameXMMReg(eregOfRexRM(pfx,rm)),
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+1;
   } else {
      /* E is memory: full 128-bit load, then the same operation. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      putXMMReg(
         gregOfRexRM(pfx,rm),
         needsRMode
            ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        gpart,
                        loadLE(Ity_V128, mkexpr(addr)))
            : binop(op, gpart,
                        loadLE(Ity_V128, mkexpr(addr)))
      );
      DIP("%s %s,%s\n", opname,
                        dis_buf,
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+alen;
   }
}
   8822 
   8823 
   8824 /* All lanes SSE binary operation, G = G `op` E. */
   8825 
   8826 static
   8827 ULong dis_SSE_E_to_G_all ( const VexAbiInfo* vbi,
   8828                            Prefix pfx, Long delta,
   8829                            const HChar* opname, IROp op )
   8830 {
   8831    return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, False );
   8832 }
   8833 
   8834 /* All lanes SSE binary operation, G = (not G) `op` E. */
   8835 
   8836 static
   8837 ULong dis_SSE_E_to_G_all_invG ( const VexAbiInfo* vbi,
   8838                                 Prefix pfx, Long delta,
   8839                                 const HChar* opname, IROp op )
   8840 {
   8841    return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, True );
   8842 }
   8843 
   8844 
   8845 /* Lowest 32-bit lane only SSE binary operation, G = G `op` E. */
   8846 
   8847 static ULong dis_SSE_E_to_G_lo32 ( const VexAbiInfo* vbi,
   8848                                    Prefix pfx, Long delta,
   8849                                    const HChar* opname, IROp op )
   8850 {
   8851    HChar   dis_buf[50];
   8852    Int     alen;
   8853    IRTemp  addr;
   8854    UChar   rm = getUChar(delta);
   8855    IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
   8856    if (epartIsReg(rm)) {
   8857       putXMMReg( gregOfRexRM(pfx,rm),
   8858                  binop(op, gpart,
   8859                            getXMMReg(eregOfRexRM(pfx,rm))) );
   8860       DIP("%s %s,%s\n", opname,
   8861                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8862                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8863       return delta+1;
   8864    } else {
   8865       /* We can only do a 32-bit memory read, so the upper 3/4 of the
   8866          E operand needs to be made simply of zeroes. */
   8867       IRTemp epart = newTemp(Ity_V128);
   8868       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8869       assign( epart, unop( Iop_32UtoV128,
   8870                            loadLE(Ity_I32, mkexpr(addr))) );
   8871       putXMMReg( gregOfRexRM(pfx,rm),
   8872                  binop(op, gpart, mkexpr(epart)) );
   8873       DIP("%s %s,%s\n", opname,
   8874                         dis_buf,
   8875                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8876       return delta+alen;
   8877    }
   8878 }
   8879 
   8880 
   8881 /* Lower 64-bit lane only SSE binary operation, G = G `op` E. */
   8882 
   8883 static ULong dis_SSE_E_to_G_lo64 ( const VexAbiInfo* vbi,
   8884                                    Prefix pfx, Long delta,
   8885                                    const HChar* opname, IROp op )
   8886 {
   8887    HChar   dis_buf[50];
   8888    Int     alen;
   8889    IRTemp  addr;
   8890    UChar   rm = getUChar(delta);
   8891    IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
   8892    if (epartIsReg(rm)) {
   8893       putXMMReg( gregOfRexRM(pfx,rm),
   8894                  binop(op, gpart,
   8895                            getXMMReg(eregOfRexRM(pfx,rm))) );
   8896       DIP("%s %s,%s\n", opname,
   8897                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8898                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8899       return delta+1;
   8900    } else {
   8901       /* We can only do a 64-bit memory read, so the upper half of the
   8902          E operand needs to be made simply of zeroes. */
   8903       IRTemp epart = newTemp(Ity_V128);
   8904       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8905       assign( epart, unop( Iop_64UtoV128,
   8906                            loadLE(Ity_I64, mkexpr(addr))) );
   8907       putXMMReg( gregOfRexRM(pfx,rm),
   8908                  binop(op, gpart, mkexpr(epart)) );
   8909       DIP("%s %s,%s\n", opname,
   8910                         dis_buf,
   8911                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8912       return delta+alen;
   8913    }
   8914 }
   8915 
   8916 
   8917 /* All lanes unary SSE operation, G = op(E). */
   8918 
   8919 static ULong dis_SSE_E_to_G_unary_all (
   8920                 const VexAbiInfo* vbi,
   8921                 Prefix pfx, Long delta,
   8922                 const HChar* opname, IROp op
   8923              )
   8924 {
   8925    HChar   dis_buf[50];
   8926    Int     alen;
   8927    IRTemp  addr;
   8928    UChar   rm = getUChar(delta);
   8929    // Sqrt32Fx4 and Sqrt64Fx2 take a rounding mode, which is faked
   8930    // up in the usual way.
   8931    Bool needsIRRM = op == Iop_Sqrt32Fx4 || op == Iop_Sqrt64Fx2;
   8932    if (epartIsReg(rm)) {
   8933       IRExpr* src = getXMMReg(eregOfRexRM(pfx,rm));
   8934       /* XXXROUNDINGFIXME */
   8935       IRExpr* res = needsIRRM ? binop(op, get_FAKE_roundingmode(), src)
   8936                               : unop(op, src);
   8937       putXMMReg( gregOfRexRM(pfx,rm), res );
   8938       DIP("%s %s,%s\n", opname,
   8939                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8940                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8941       return delta+1;
   8942    } else {
   8943       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8944       IRExpr* src = loadLE(Ity_V128, mkexpr(addr));
   8945       /* XXXROUNDINGFIXME */
   8946       IRExpr* res = needsIRRM ? binop(op, get_FAKE_roundingmode(), src)
   8947                               : unop(op, src);
   8948       putXMMReg( gregOfRexRM(pfx,rm), res );
   8949       DIP("%s %s,%s\n", opname,
   8950                         dis_buf,
   8951                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8952       return delta+alen;
   8953    }
   8954 }
   8955 
   8956 
   8957 /* Lowest 32-bit lane only unary SSE operation, G = op(E). */
   8958 
   8959 static ULong dis_SSE_E_to_G_unary_lo32 (
   8960                 const VexAbiInfo* vbi,
   8961                 Prefix pfx, Long delta,
   8962                 const HChar* opname, IROp op
   8963              )
   8964 {
   8965    /* First we need to get the old G value and patch the low 32 bits
   8966       of the E operand into it.  Then apply op and write back to G. */
   8967    HChar   dis_buf[50];
   8968    Int     alen;
   8969    IRTemp  addr;
   8970    UChar   rm = getUChar(delta);
   8971    IRTemp  oldG0 = newTemp(Ity_V128);
   8972    IRTemp  oldG1 = newTemp(Ity_V128);
   8973 
   8974    assign( oldG0, getXMMReg(gregOfRexRM(pfx,rm)) );
   8975 
   8976    if (epartIsReg(rm)) {
   8977       assign( oldG1,
   8978               binop( Iop_SetV128lo32,
   8979                      mkexpr(oldG0),
   8980                      getXMMRegLane32(eregOfRexRM(pfx,rm), 0)) );
   8981       putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
   8982       DIP("%s %s,%s\n", opname,
   8983                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8984                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8985       return delta+1;
   8986    } else {
   8987       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8988       assign( oldG1,
   8989               binop( Iop_SetV128lo32,
   8990                      mkexpr(oldG0),
   8991                      loadLE(Ity_I32, mkexpr(addr)) ));
   8992       putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
   8993       DIP("%s %s,%s\n", opname,
   8994                         dis_buf,
   8995                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8996       return delta+alen;
   8997    }
   8998 }
   8999 
   9000 
   9001 /* Lowest 64-bit lane only unary SSE operation, G = op(E). */
   9002 
   9003 static ULong dis_SSE_E_to_G_unary_lo64 (
   9004                 const VexAbiInfo* vbi,
   9005                 Prefix pfx, Long delta,
   9006                 const HChar* opname, IROp op
   9007              )
   9008 {
   9009    /* First we need to get the old G value and patch the low 64 bits
   9010       of the E operand into it.  Then apply op and write back to G. */
   9011    HChar   dis_buf[50];
   9012    Int     alen;
   9013    IRTemp  addr;
   9014    UChar   rm = getUChar(delta);
   9015    IRTemp  oldG0 = newTemp(Ity_V128);
   9016    IRTemp  oldG1 = newTemp(Ity_V128);
   9017 
   9018    assign( oldG0, getXMMReg(gregOfRexRM(pfx,rm)) );
   9019 
   9020    if (epartIsReg(rm)) {
   9021       assign( oldG1,
   9022               binop( Iop_SetV128lo64,
   9023                      mkexpr(oldG0),
   9024                      getXMMRegLane64(eregOfRexRM(pfx,rm), 0)) );
   9025       putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
   9026       DIP("%s %s,%s\n", opname,
   9027                         nameXMMReg(eregOfRexRM(pfx,rm)),
   9028                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   9029       return delta+1;
   9030    } else {
   9031       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   9032       assign( oldG1,
   9033               binop( Iop_SetV128lo64,
   9034                      mkexpr(oldG0),
   9035                      loadLE(Ity_I64, mkexpr(addr)) ));
   9036       putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
   9037       DIP("%s %s,%s\n", opname,
   9038                         dis_buf,
   9039                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   9040       return delta+alen;
   9041    }
   9042 }
   9043 
   9044 
   9045 /* SSE integer binary operation:
   9046       G = G `op` E   (eLeft == False)
   9047       G = E `op` G   (eLeft == True)
   9048 */
   9049 static ULong dis_SSEint_E_to_G(
   9050                 const VexAbiInfo* vbi,
   9051                 Prefix pfx, Long delta,
   9052                 const HChar* opname, IROp op,
   9053                 Bool   eLeft
   9054              )
   9055 {
   9056    HChar   dis_buf[50];
   9057    Int     alen;
   9058    IRTemp  addr;
   9059    UChar   rm = getUChar(delta);
   9060    IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
   9061    IRExpr* epart = NULL;
   9062    if (epartIsReg(rm)) {
   9063       epart = getXMMReg(eregOfRexRM(pfx,rm));
   9064       DIP("%s %s,%s\n", opname,
   9065                         nameXMMReg(eregOfRexRM(pfx,rm)),
   9066                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   9067       delta += 1;
   9068    } else {
   9069       addr  = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   9070       epart = loadLE(Ity_V128, mkexpr(addr));
   9071       DIP("%s %s,%s\n", opname,
   9072                         dis_buf,
   9073                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   9074       delta += alen;
   9075    }
   9076    putXMMReg( gregOfRexRM(pfx,rm),
   9077               eLeft ? binop(op, epart, gpart)
   9078                     : binop(op, gpart, epart) );
   9079    return delta;
   9080 }
   9081 
   9082 
   9083 /* Helper for doing SSE FP comparisons.  False return ==> unhandled.
   9084    This is all a bit of a kludge in that it ignores the subtleties of
   9085    ordered-vs-unordered and signalling-vs-nonsignalling in the Intel
   9086    spec. */
static Bool findSSECmpOp ( /*OUT*/Bool* preSwapP,
                           /*OUT*/IROp* opP,
                           /*OUT*/Bool* postNotP,
                           UInt imm8, Bool all_lanes, Int sz )
{
   /* Predicates occupy at most 5 bits (0 .. 0x1F). */
   if (imm8 >= 32) return False;

   /* First, compute a (preSwap, op, postNot) triple from
      the supplied imm8. */
   Bool pre = False;
   IROp op  = Iop_INVALID;
   Bool not = False;

#  define XXX(_pre, _op, _not) { pre = _pre; op = _op; not = _not; }
   // If you add a case here, add a corresponding test for both VCMPSD_128
   // and VCMPSS_128 in avx-1.c.
   // Cases 0xA and above are
   //    "Enhanced Comparison Predicate[s] for VEX-Encoded [insns]"
   switch (imm8) {
      // "O" = ordered, "U" = unordered
      // "Q" = non-signalling (quiet), "S" = signalling
      //
      //             swap operands?
      //             |
      //             |      cmp op          invert after?
      //             |      |               |
      //             v      v               v
      case 0x0:  XXX(False, Iop_CmpEQ32Fx4, False); break; // EQ_OQ
      case 0x8:  XXX(False, Iop_CmpEQ32Fx4, False); break; // EQ_UQ
      case 0x10: XXX(False, Iop_CmpEQ32Fx4, False); break; // EQ_OS
      case 0x18: XXX(False, Iop_CmpEQ32Fx4, False); break; // EQ_US
      //
      case 0x1:  XXX(False, Iop_CmpLT32Fx4, False); break; // LT_OS
      case 0x11: XXX(False, Iop_CmpLT32Fx4, False); break; // LT_OQ
      //
      case 0x2:  XXX(False, Iop_CmpLE32Fx4, False); break; // LE_OS
      case 0x12: XXX(False, Iop_CmpLE32Fx4, False); break; // LE_OQ
      //
      case 0x3:  XXX(False, Iop_CmpUN32Fx4, False); break; // UNORD_Q
      case 0x13: XXX(False, Iop_CmpUN32Fx4, False); break; // UNORD_S
      //
      // 0xC: this isn't really right because it returns all-1s when
      // either operand is a NaN, and it should return all-0s.
      case 0x4:  XXX(False, Iop_CmpEQ32Fx4, True);  break; // NEQ_UQ
      case 0xC:  XXX(False, Iop_CmpEQ32Fx4, True);  break; // NEQ_OQ
      case 0x14: XXX(False, Iop_CmpEQ32Fx4, True);  break; // NEQ_US
      case 0x1C: XXX(False, Iop_CmpEQ32Fx4, True);  break; // NEQ_OS
      //
      case 0x5:  XXX(False, Iop_CmpLT32Fx4, True);  break; // NLT_US
      case 0x15: XXX(False, Iop_CmpLT32Fx4, True);  break; // NLT_UQ
      //
      case 0x6:  XXX(False, Iop_CmpLE32Fx4, True);  break; // NLE_US
      case 0x16: XXX(False, Iop_CmpLE32Fx4, True);  break; // NLE_UQ
      //
      case 0x7:  XXX(False, Iop_CmpUN32Fx4, True);  break; // ORD_Q
      case 0x17: XXX(False, Iop_CmpUN32Fx4, True);  break; // ORD_S
      //
      case 0x9:  XXX(True,  Iop_CmpLE32Fx4, True);  break; // NGE_US
      case 0x19: XXX(True,  Iop_CmpLE32Fx4, True);  break; // NGE_UQ
      //
      case 0xA:  XXX(True,  Iop_CmpLT32Fx4, True);  break; // NGT_US
      case 0x1A: XXX(True,  Iop_CmpLT32Fx4, True);  break; // NGT_UQ
      //
      case 0xD:  XXX(True,  Iop_CmpLE32Fx4, False); break; // GE_OS
      case 0x1D: XXX(True,  Iop_CmpLE32Fx4, False); break; // GE_OQ
      //
      case 0xE:  XXX(True,  Iop_CmpLT32Fx4, False); break; // GT_OS
      case 0x1E: XXX(True,  Iop_CmpLT32Fx4, False); break; // GT_OQ
      // Unhandled:
      // 0xB  FALSE_OQ
      // 0xF  TRUE_UQ
      // 0x1B  FALSE_OS
      // 0x1F  TRUE_US
      /* Don't forget to add test cases to VCMPSS_128_<imm8> in
         avx-1.c if new cases turn up. */
      default: break;
   }
#  undef XXX
   if (op == Iop_INVALID) return False;

   /* Now convert the op into one with the same arithmetic but that is
      correct for the width and laneage requirements. */

   /**/ if (sz == 4 && all_lanes) {
      switch (op) {
         case Iop_CmpEQ32Fx4: op = Iop_CmpEQ32Fx4; break;
         case Iop_CmpLT32Fx4: op = Iop_CmpLT32Fx4; break;
         case Iop_CmpLE32Fx4: op = Iop_CmpLE32Fx4; break;
         case Iop_CmpUN32Fx4: op = Iop_CmpUN32Fx4; break;
         default: vassert(0);
      }
   }
   else if (sz == 4 && !all_lanes) {
      switch (op) {
         case Iop_CmpEQ32Fx4: op = Iop_CmpEQ32F0x4; break;
         case Iop_CmpLT32Fx4: op = Iop_CmpLT32F0x4; break;
         case Iop_CmpLE32Fx4: op = Iop_CmpLE32F0x4; break;
         case Iop_CmpUN32Fx4: op = Iop_CmpUN32F0x4; break;
         default: vassert(0);
      }
   }
   else if (sz == 8 && all_lanes) {
      switch (op) {
         case Iop_CmpEQ32Fx4: op = Iop_CmpEQ64Fx2; break;
         case Iop_CmpLT32Fx4: op = Iop_CmpLT64Fx2; break;
         case Iop_CmpLE32Fx4: op = Iop_CmpLE64Fx2; break;
         case Iop_CmpUN32Fx4: op = Iop_CmpUN64Fx2; break;
         default: vassert(0);
      }
   }
   else if (sz == 8 && !all_lanes) {
      switch (op) {
         case Iop_CmpEQ32Fx4: op = Iop_CmpEQ64F0x2; break;
         case Iop_CmpLT32Fx4: op = Iop_CmpLT64F0x2; break;
         case Iop_CmpLE32Fx4: op = Iop_CmpLE64F0x2; break;
         case Iop_CmpUN32Fx4: op = Iop_CmpUN64F0x2; break;
         default: vassert(0);
      }
   }
   else {
      vpanic("findSSECmpOp(amd64,guest)");
   }

   /* Hand the computed triple back to the caller. */
   *preSwapP = pre; *opP = op; *postNotP = not;
   return True;
}
   9213 
   9214 
   9215 /* Handles SSE 32F/64F comparisons.  It can fail, in which case it
   9216    returns the original delta to indicate failure. */
   9217 
static Long dis_SSE_cmp_E_to_G ( const VexAbiInfo* vbi,
                                 Prefix pfx, Long delta,
                                 const HChar* opname, Bool all_lanes, Int sz )
{
   /* Decode CMPPS/CMPPD/CMPSS/CMPSD (E -> G forms).  |sz| is 4 for
      32-bit (single) lanes, 8 for 64-bit (double) lanes; |all_lanes|
      distinguishes the packed from the scalar (low-lane-only) forms.
      On any decode failure the original |delta| is returned so the
      caller can try other decodings. */
   Long    delta0 = delta;   /* saved so we can signal FAIL */
   HChar   dis_buf[50];
   Int     alen;
   UInt    imm8;
   IRTemp  addr;
   Bool    preSwap = False;
   IROp    op      = Iop_INVALID;
   Bool    postNot = False;
   IRTemp  plain   = newTemp(Ity_V128);
   UChar   rm      = getUChar(delta);
   UShort  mask    = 0;
   vassert(sz == 4 || sz == 8);
   if (epartIsReg(rm)) {
      /* Register-register form: imm8 follows the modrm byte. */
      imm8 = getUChar(delta+1);
      if (imm8 >= 8) return delta0; /* FAIL */
      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
      if (!ok) return delta0; /* FAIL */
      vassert(!preSwap); /* never needed for imm8 < 8 */
      assign( plain, binop(op, getXMMReg(gregOfRexRM(pfx,rm)),
                               getXMMReg(eregOfRexRM(pfx,rm))) );
      delta += 2;
      DIP("%s $%u,%s,%s\n", opname,
                            imm8,
                            nameXMMReg(eregOfRexRM(pfx,rm)),
                            nameXMMReg(gregOfRexRM(pfx,rm)) );
   } else {
      /* Memory form: imm8 follows the addressing bytes.  For the
         scalar variants only sz bytes are loaded and widened into the
         low lane of a V128. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
      imm8 = getUChar(delta+alen);
      if (imm8 >= 8) return delta0; /* FAIL */
      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
      if (!ok) return delta0; /* FAIL */
      vassert(!preSwap); /* never needed for imm8 < 8 */
      assign( plain,
              binop(
                 op,
                 getXMMReg(gregOfRexRM(pfx,rm)),
                   all_lanes
                      ? loadLE(Ity_V128, mkexpr(addr))
                   : sz == 8
                      ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
                   : /*sz==4*/
                      unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)))
              )
      );
      delta += alen+1;
      DIP("%s $%u,%s,%s\n", opname,
                            imm8,
                            dis_buf,
                            nameXMMReg(gregOfRexRM(pfx,rm)) );
   }

   /* Apply the post-comparison negation, if any, and write back. */
   if (postNot && all_lanes) {
      putXMMReg( gregOfRexRM(pfx,rm),
                 unop(Iop_NotV128, mkexpr(plain)) );
   }
   else
   if (postNot && !all_lanes) {
      /* Scalar form: only invert the low lane.  mkV128 takes a
         per-byte mask, so 0x000F covers the low 4 bytes (sz==4) and
         0x00FF the low 8 bytes (sz==8). */
      mask = toUShort(sz==4 ? 0x000F : 0x00FF);
      putXMMReg( gregOfRexRM(pfx,rm),
                 binop(Iop_XorV128, mkexpr(plain), mkV128(mask)) );
   }
   else {
      putXMMReg( gregOfRexRM(pfx,rm), mkexpr(plain) );
   }

   return delta;
}
   9289 
   9290 
   9291 /* Vector by scalar shift of G by the amount specified at the bottom
   9292    of E. */
   9293 
   9294 static ULong dis_SSE_shiftG_byE ( const VexAbiInfo* vbi,
   9295                                   Prefix pfx, Long delta,
   9296                                   const HChar* opname, IROp op )
   9297 {
   9298    HChar   dis_buf[50];
   9299    Int     alen, size;
   9300    IRTemp  addr;
   9301    Bool    shl, shr, sar;
   9302    UChar   rm   = getUChar(delta);
   9303    IRTemp  g0   = newTemp(Ity_V128);
   9304    IRTemp  g1   = newTemp(Ity_V128);
   9305    IRTemp  amt  = newTemp(Ity_I64);
   9306    IRTemp  amt8 = newTemp(Ity_I8);
   9307    if (epartIsReg(rm)) {
   9308       assign( amt, getXMMRegLane64(eregOfRexRM(pfx,rm), 0) );
   9309       DIP("%s %s,%s\n", opname,
   9310                         nameXMMReg(eregOfRexRM(pfx,rm)),
   9311                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   9312       delta++;
   9313    } else {
   9314       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   9315       assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
   9316       DIP("%s %s,%s\n", opname,
   9317                         dis_buf,
   9318                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   9319       delta += alen;
   9320    }
   9321    assign( g0,   getXMMReg(gregOfRexRM(pfx,rm)) );
   9322    assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
   9323 
   9324    shl = shr = sar = False;
   9325    size = 0;
   9326    switch (op) {
   9327       case Iop_ShlN16x8: shl = True; size = 32; break;
   9328       case Iop_ShlN32x4: shl = True; size = 32; break;
   9329       case Iop_ShlN64x2: shl = True; size = 64; break;
   9330       case Iop_SarN16x8: sar = True; size = 16; break;
   9331       case Iop_SarN32x4: sar = True; size = 32; break;
   9332       case Iop_ShrN16x8: shr = True; size = 16; break;
   9333       case Iop_ShrN32x4: shr = True; size = 32; break;
   9334       case Iop_ShrN64x2: shr = True; size = 64; break;
   9335       default: vassert(0);
   9336    }
   9337 
   9338    if (shl || shr) {
   9339      assign(
   9340         g1,
   9341         IRExpr_ITE(
   9342            binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
   9343            binop(op, mkexpr(g0), mkexpr(amt8)),
   9344            mkV128(0x0000)
   9345         )
   9346      );
   9347    } else
   9348    if (sar) {
   9349      assign(
   9350         g1,
   9351         IRExpr_ITE(
   9352            binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
   9353            binop(op, mkexpr(g0), mkexpr(amt8)),
   9354            binop(op, mkexpr(g0), mkU8(size-1))
   9355         )
   9356      );
   9357    } else {
   9358       vassert(0);
   9359    }
   9360 
   9361    putXMMReg( gregOfRexRM(pfx,rm), mkexpr(g1) );
   9362    return delta;
   9363 }
   9364 
   9365 
   9366 /* Vector by scalar shift of E by an immediate byte. */
   9367 
   9368 static
   9369 ULong dis_SSE_shiftE_imm ( Prefix pfx,
   9370                            Long delta, const HChar* opname, IROp op )
   9371 {
   9372    Bool    shl, shr, sar;
   9373    UChar   rm   = getUChar(delta);
   9374    IRTemp  e0   = newTemp(Ity_V128);
   9375    IRTemp  e1   = newTemp(Ity_V128);
   9376    UChar   amt, size;
   9377    vassert(epartIsReg(rm));
   9378    vassert(gregLO3ofRM(rm) == 2
   9379            || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
   9380    amt = getUChar(delta+1);
   9381    delta += 2;
   9382    DIP("%s $%d,%s\n", opname,
   9383                       (Int)amt,
   9384                       nameXMMReg(eregOfRexRM(pfx,rm)) );
   9385    assign( e0, getXMMReg(eregOfRexRM(pfx,rm)) );
   9386 
   9387    shl = shr = sar = False;
   9388    size = 0;
   9389    switch (op) {
   9390       case Iop_ShlN16x8: shl = True; size = 16; break;
   9391       case Iop_ShlN32x4: shl = True; size = 32; break;
   9392       case Iop_ShlN64x2: shl = True; size = 64; break;
   9393       case Iop_SarN16x8: sar = True; size = 16; break;
   9394       case Iop_SarN32x4: sar = True; size = 32; break;
   9395       case Iop_ShrN16x8: shr = True; size = 16; break;
   9396       case Iop_ShrN32x4: shr = True; size = 32; break;
   9397       case Iop_ShrN64x2: shr = True; size = 64; break;
   9398       default: vassert(0);
   9399    }
   9400 
   9401    if (shl || shr) {
   9402      assign( e1, amt >= size
   9403                     ? mkV128(0x0000)
   9404                     : binop(op, mkexpr(e0), mkU8(amt))
   9405      );
   9406    } else
   9407    if (sar) {
   9408      assign( e1, amt >= size
   9409                     ? binop(op, mkexpr(e0), mkU8(size-1))
   9410                     : binop(op, mkexpr(e0), mkU8(amt))
   9411      );
   9412    } else {
   9413       vassert(0);
   9414    }
   9415 
   9416    putXMMReg( eregOfRexRM(pfx,rm), mkexpr(e1) );
   9417    return delta;
   9418 }
   9419 
   9420 
   9421 /* Get the current SSE rounding mode. */
   9422 
   9423 static IRExpr* /* :: Ity_I32 */ get_sse_roundingmode ( void )
   9424 {
   9425    return
   9426       unop( Iop_64to32,
   9427             binop( Iop_And64,
   9428                    IRExpr_Get( OFFB_SSEROUND, Ity_I64 ),
   9429                    mkU64(3) ));
   9430 }
   9431 
static void put_sse_roundingmode ( IRExpr* sseround )
{
   /* Store a 32-bit SSE rounding-mode value, zero-widened to 64 bits,
      into the SSEROUND guest-state slot. */
   vassert(typeOfIRExpr(irsb->tyenv, sseround) == Ity_I32);
   stmt( IRStmt_Put( OFFB_SSEROUND,
                     unop(Iop_32Uto64,sseround) ) );
}
   9438 
   9439 /* Break a V128-bit value up into four 32-bit ints. */
   9440 
   9441 static void breakupV128to32s ( IRTemp t128,
   9442                                /*OUTs*/
   9443                                IRTemp* t3, IRTemp* t2,
   9444                                IRTemp* t1, IRTemp* t0 )
   9445 {
   9446    IRTemp hi64 = newTemp(Ity_I64);
   9447    IRTemp lo64 = newTemp(Ity_I64);
   9448    assign( hi64, unop(Iop_V128HIto64, mkexpr(t128)) );
   9449    assign( lo64, unop(Iop_V128to64,   mkexpr(t128)) );
   9450 
   9451    vassert(t0 && *t0 == IRTemp_INVALID);
   9452    vassert(t1 && *t1 == IRTemp_INVALID);
   9453    vassert(t2 && *t2 == IRTemp_INVALID);
   9454    vassert(t3 && *t3 == IRTemp_INVALID);
   9455 
   9456    *t0 = newTemp(Ity_I32);
   9457    *t1 = newTemp(Ity_I32);
   9458    *t2 = newTemp(Ity_I32);
   9459    *t3 = newTemp(Ity_I32);
   9460    assign( *t0, unop(Iop_64to32,   mkexpr(lo64)) );
   9461    assign( *t1, unop(Iop_64HIto32, mkexpr(lo64)) );
   9462    assign( *t2, unop(Iop_64to32,   mkexpr(hi64)) );
   9463    assign( *t3, unop(Iop_64HIto32, mkexpr(hi64)) );
   9464 }
   9465 
   9466 /* Construct a V128-bit value from four 32-bit ints. */
   9467 
   9468 static IRExpr* mkV128from32s ( IRTemp t3, IRTemp t2,
   9469                                IRTemp t1, IRTemp t0 )
   9470 {
   9471    return
   9472       binop( Iop_64HLtoV128,
   9473              binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
   9474              binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0))
   9475    );
   9476 }
   9477 
   9478 /* Break a 64-bit value up into four 16-bit ints. */
   9479 
   9480 static void breakup64to16s ( IRTemp t64,
   9481                              /*OUTs*/
   9482                              IRTemp* t3, IRTemp* t2,
   9483                              IRTemp* t1, IRTemp* t0 )
   9484 {
   9485    IRTemp hi32 = newTemp(Ity_I32);
   9486    IRTemp lo32 = newTemp(Ity_I32);
   9487    assign( hi32, unop(Iop_64HIto32, mkexpr(t64)) );
   9488    assign( lo32, unop(Iop_64to32,   mkexpr(t64)) );
   9489 
   9490    vassert(t0 && *t0 == IRTemp_INVALID);
   9491    vassert(t1 && *t1 == IRTemp_INVALID);
   9492    vassert(t2 && *t2 == IRTemp_INVALID);
   9493    vassert(t3 && *t3 == IRTemp_INVALID);
   9494 
   9495    *t0 = newTemp(Ity_I16);
   9496    *t1 = newTemp(Ity_I16);
   9497    *t2 = newTemp(Ity_I16);
   9498    *t3 = newTemp(Ity_I16);
   9499    assign( *t0, unop(Iop_32to16,   mkexpr(lo32)) );
   9500    assign( *t1, unop(Iop_32HIto16, mkexpr(lo32)) );
   9501    assign( *t2, unop(Iop_32to16,   mkexpr(hi32)) );
   9502    assign( *t3, unop(Iop_32HIto16, mkexpr(hi32)) );
   9503 }
   9504 
   9505 /* Construct a 64-bit value from four 16-bit ints. */
   9506 
   9507 static IRExpr* mk64from16s ( IRTemp t3, IRTemp t2,
   9508                              IRTemp t1, IRTemp t0 )
   9509 {
   9510    return
   9511       binop( Iop_32HLto64,
   9512              binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)),
   9513              binop(Iop_16HLto32, mkexpr(t1), mkexpr(t0))
   9514    );
   9515 }
   9516 
   9517 /* Break a V256-bit value up into four 64-bit ints. */
   9518 
   9519 static void breakupV256to64s ( IRTemp t256,
   9520                                /*OUTs*/
   9521                                IRTemp* t3, IRTemp* t2,
   9522                                IRTemp* t1, IRTemp* t0 )
   9523 {
   9524    vassert(t0 && *t0 == IRTemp_INVALID);
   9525    vassert(t1 && *t1 == IRTemp_INVALID);
   9526    vassert(t2 && *t2 == IRTemp_INVALID);
   9527    vassert(t3 && *t3 == IRTemp_INVALID);
   9528    *t0 = newTemp(Ity_I64);
   9529    *t1 = newTemp(Ity_I64);
   9530    *t2 = newTemp(Ity_I64);
   9531    *t3 = newTemp(Ity_I64);
   9532    assign( *t0, unop(Iop_V256to64_0, mkexpr(t256)) );
   9533    assign( *t1, unop(Iop_V256to64_1, mkexpr(t256)) );
   9534    assign( *t2, unop(Iop_V256to64_2, mkexpr(t256)) );
   9535    assign( *t3, unop(Iop_V256to64_3, mkexpr(t256)) );
   9536 }
   9537 
   9538 /* Break a V256-bit value up into two V128s. */
   9539 
   9540 static void breakupV256toV128s ( IRTemp t256,
   9541                                  /*OUTs*/
   9542                                  IRTemp* t1, IRTemp* t0 )
   9543 {
   9544    vassert(t0 && *t0 == IRTemp_INVALID);
   9545    vassert(t1 && *t1 == IRTemp_INVALID);
   9546    *t0 = newTemp(Ity_V128);
   9547    *t1 = newTemp(Ity_V128);
   9548    assign(*t1, unop(Iop_V256toV128_1, mkexpr(t256)));
   9549    assign(*t0, unop(Iop_V256toV128_0, mkexpr(t256)));
   9550 }
   9551 
   9552 /* Break a V256-bit value up into eight 32-bit ints.  */
   9553 
   9554 static void breakupV256to32s ( IRTemp t256,
   9555                                /*OUTs*/
   9556                                IRTemp* t7, IRTemp* t6,
   9557                                IRTemp* t5, IRTemp* t4,
   9558                                IRTemp* t3, IRTemp* t2,
   9559                                IRTemp* t1, IRTemp* t0 )
   9560 {
   9561    IRTemp t128_1 = IRTemp_INVALID;
   9562    IRTemp t128_0 = IRTemp_INVALID;
   9563    breakupV256toV128s( t256, &t128_1, &t128_0 );
   9564    breakupV128to32s( t128_1, t7, t6, t5, t4 );
   9565    breakupV128to32s( t128_0, t3, t2, t1, t0 );
   9566 }
   9567 
   9568 /* Break a V128-bit value up into two 64-bit ints. */
   9569 
   9570 static void breakupV128to64s ( IRTemp t128,
   9571                                /*OUTs*/
   9572                                IRTemp* t1, IRTemp* t0 )
   9573 {
   9574    vassert(t0 && *t0 == IRTemp_INVALID);
   9575    vassert(t1 && *t1 == IRTemp_INVALID);
   9576    *t0 = newTemp(Ity_I64);
   9577    *t1 = newTemp(Ity_I64);
   9578    assign( *t0, unop(Iop_V128to64,   mkexpr(t128)) );
   9579    assign( *t1, unop(Iop_V128HIto64, mkexpr(t128)) );
   9580 }
   9581 
   9582 /* Construct a V256-bit value from eight 32-bit ints. */
   9583 
   9584 static IRExpr* mkV256from32s ( IRTemp t7, IRTemp t6,
   9585                                IRTemp t5, IRTemp t4,
   9586                                IRTemp t3, IRTemp t2,
   9587                                IRTemp t1, IRTemp t0 )
   9588 {
   9589    return
   9590       binop( Iop_V128HLtoV256,
   9591              binop( Iop_64HLtoV128,
   9592                     binop(Iop_32HLto64, mkexpr(t7), mkexpr(t6)),
   9593                     binop(Iop_32HLto64, mkexpr(t5), mkexpr(t4)) ),
   9594              binop( Iop_64HLtoV128,
   9595                     binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
   9596                     binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0)) )
   9597    );
   9598 }
   9599 
   9600 /* Construct a V256-bit value from four 64-bit ints. */
   9601 
   9602 static IRExpr* mkV256from64s ( IRTemp t3, IRTemp t2,
   9603                                IRTemp t1, IRTemp t0 )
   9604 {
   9605    return
   9606       binop( Iop_V128HLtoV256,
   9607              binop(Iop_64HLtoV128, mkexpr(t3), mkexpr(t2)),
   9608              binop(Iop_64HLtoV128, mkexpr(t1), mkexpr(t0))
   9609    );
   9610 }
   9611 
   9612 /* Helper for the SSSE3 (not SSE3) PMULHRSW insns.  Given two 64-bit
   9613    values (aa,bb), computes, for each of the 4 16-bit lanes:
   9614 
   9615    (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1
   9616 */
static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx )
{
   /* Per the formula above: for each of the four 16-bit lanes,
      compute (((aa *s32 bb) >>u 14) + 1) >>u 1, i.e. a signed 16x16
      multiply rounded back down to 16 bits. */
   IRTemp aa      = newTemp(Ity_I64);
   IRTemp bb      = newTemp(Ity_I64);
   IRTemp aahi32s = newTemp(Ity_I64);   /* upper two aa lanes, sign-extended to 32 bits */
   IRTemp aalo32s = newTemp(Ity_I64);   /* lower two aa lanes, ditto */
   IRTemp bbhi32s = newTemp(Ity_I64);
   IRTemp bblo32s = newTemp(Ity_I64);
   IRTemp rHi     = newTemp(Ity_I64);
   IRTemp rLo     = newTemp(Ity_I64);
   IRTemp one32x2 = newTemp(Ity_I64);   /* the constant 1 in each 32-bit lane */
   assign(aa, aax);
   assign(bb, bbx);
   /* Widen each 16-bit lane to a signed 32-bit lane: interleaving a
      value with itself duplicates each lane into a 32-bit slot, and
      the arithmetic shift right by 16 sign-extends it. */
   assign( aahi32s,
           binop(Iop_SarN32x2,
                 binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)),
                 mkU8(16) ));
   assign( aalo32s,
           binop(Iop_SarN32x2,
                 binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)),
                 mkU8(16) ));
   assign( bbhi32s,
           binop(Iop_SarN32x2,
                 binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)),
                 mkU8(16) ));
   assign( bblo32s,
           binop(Iop_SarN32x2,
                 binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)),
                 mkU8(16) ));
   assign(one32x2, mkU64( (1ULL << 32) + 1 ));
   /* rHi/rLo each hold two 32-bit lanes of
      ((product >>u 14) + 1) >>u 1. */
   assign(
      rHi,
      binop(
         Iop_ShrN32x2,
         binop(
            Iop_Add32x2,
            binop(
               Iop_ShrN32x2,
               binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)),
               mkU8(14)
            ),
            mkexpr(one32x2)
         ),
         mkU8(1)
      )
   );
   assign(
      rLo,
      binop(
         Iop_ShrN32x2,
         binop(
            Iop_Add32x2,
            binop(
               Iop_ShrN32x2,
               binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)),
               mkU8(14)
            ),
            mkexpr(one32x2)
         ),
         mkU8(1)
      )
   );
   /* Keep the low 16 bits of each 32-bit lane and repack into 4x16. */
   return
      binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo));
}
   9682 
   9683 /* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns.  Given two 64-bit
   9684    values (aa,bb), computes, for each lane:
   9685 
   9686           if aa_lane < 0 then - bb_lane
   9687      else if aa_lane > 0 then bb_lane
   9688      else 0
   9689 */
   9690 static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB )
   9691 {
   9692    IRTemp aa       = newTemp(Ity_I64);
   9693    IRTemp bb       = newTemp(Ity_I64);
   9694    IRTemp zero     = newTemp(Ity_I64);
   9695    IRTemp bbNeg    = newTemp(Ity_I64);
   9696    IRTemp negMask  = newTemp(Ity_I64);
   9697    IRTemp posMask  = newTemp(Ity_I64);
   9698    IROp   opSub    = Iop_INVALID;
   9699    IROp   opCmpGTS = Iop_INVALID;
   9700 
   9701    switch (laneszB) {
   9702       case 1: opSub = Iop_Sub8x8;  opCmpGTS = Iop_CmpGT8Sx8;  break;
   9703       case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break;
   9704       case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break;
   9705       default: vassert(0);
   9706    }
   9707 
   9708    assign( aa,      aax );
   9709    assign( bb,      bbx );
   9710    assign( zero,    mkU64(0) );
   9711    assign( bbNeg,   binop(opSub,    mkexpr(zero), mkexpr(bb)) );
   9712    assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) );
   9713    assign( posMask, binop(opCmpGTS, mkexpr(aa),   mkexpr(zero)) );
   9714 
   9715    return
   9716       binop(Iop_Or64,
   9717             binop(Iop_And64, mkexpr(bb),    mkexpr(posMask)),
   9718             binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) );
   9719 
   9720 }
   9721 
   9722 
   9723 /* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns.  Given a 64-bit
   9724    value aa, computes, for each lane
   9725 
   9726    if aa < 0 then -aa else aa
   9727 
   9728    Note that the result is interpreted as unsigned, so that the
   9729    absolute value of the most negative signed input can be
   9730    represented.
   9731 */
   9732 static IRTemp math_PABS_MMX ( IRTemp aa, Int laneszB )
   9733 {
   9734    IRTemp res     = newTemp(Ity_I64);
   9735    IRTemp zero    = newTemp(Ity_I64);
   9736    IRTemp aaNeg   = newTemp(Ity_I64);
   9737    IRTemp negMask = newTemp(Ity_I64);
   9738    IRTemp posMask = newTemp(Ity_I64);
   9739    IROp   opSub   = Iop_INVALID;
   9740    IROp   opSarN  = Iop_INVALID;
   9741 
   9742    switch (laneszB) {
   9743       case 1: opSub = Iop_Sub8x8;  opSarN = Iop_SarN8x8;  break;
   9744       case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break;
   9745       case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break;
   9746       default: vassert(0);
   9747    }
   9748 
   9749    assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) );
   9750    assign( posMask, unop(Iop_Not64, mkexpr(negMask)) );
   9751    assign( zero,    mkU64(0) );
   9752    assign( aaNeg,   binop(opSub, mkexpr(zero), mkexpr(aa)) );
   9753    assign( res,
   9754            binop(Iop_Or64,
   9755                  binop(Iop_And64, mkexpr(aa),    mkexpr(posMask)),
   9756                  binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) ));
   9757    return res;
   9758 }
   9759 
   9760 /* XMM version of math_PABS_MMX. */
   9761 static IRTemp math_PABS_XMM ( IRTemp aa, Int laneszB )
   9762 {
   9763    IRTemp res  = newTemp(Ity_V128);
   9764    IRTemp aaHi = newTemp(Ity_I64);
   9765    IRTemp aaLo = newTemp(Ity_I64);
   9766    assign(aaHi, unop(Iop_V128HIto64, mkexpr(aa)));
   9767    assign(aaLo, unop(Iop_V128to64, mkexpr(aa)));
   9768    assign(res, binop(Iop_64HLtoV128,
   9769                      mkexpr(math_PABS_MMX(aaHi, laneszB)),
   9770                      mkexpr(math_PABS_MMX(aaLo, laneszB))));
   9771    return res;
   9772 }
   9773 
   9774 /* Specialisations of math_PABS_XMM, since there's no easy way to do
   9775    partial applications in C :-( */
/* 32-bit lanes (PABSD). */
static IRTemp math_PABS_XMM_pap4 ( IRTemp aa ) {
   return math_PABS_XMM(aa, 4);
}

/* 16-bit lanes (PABSW). */
static IRTemp math_PABS_XMM_pap2 ( IRTemp aa ) {
   return math_PABS_XMM(aa, 2);
}

/* 8-bit lanes (PABSB). */
static IRTemp math_PABS_XMM_pap1 ( IRTemp aa ) {
   return math_PABS_XMM(aa, 1);
}
   9787 
   9788 /* YMM version of math_PABS_XMM. */
   9789 static IRTemp math_PABS_YMM ( IRTemp aa, Int laneszB )
   9790 {
   9791    IRTemp res  = newTemp(Ity_V256);
   9792    IRTemp aaHi = IRTemp_INVALID;
   9793    IRTemp aaLo = IRTemp_INVALID;
   9794    breakupV256toV128s(aa, &aaHi, &aaLo);
   9795    assign(res, binop(Iop_V128HLtoV256,
   9796                      mkexpr(math_PABS_XMM(aaHi, laneszB)),
   9797                      mkexpr(math_PABS_XMM(aaLo, laneszB))));
   9798    return res;
   9799 }
   9800 
/* 32-bit lanes (VPABSD). */
static IRTemp math_PABS_YMM_pap4 ( IRTemp aa ) {
   return math_PABS_YMM(aa, 4);
}

/* 16-bit lanes (VPABSW). */
static IRTemp math_PABS_YMM_pap2 ( IRTemp aa ) {
   return math_PABS_YMM(aa, 2);
}

/* 8-bit lanes (VPABSB). */
static IRTemp math_PABS_YMM_pap1 ( IRTemp aa ) {
   return math_PABS_YMM(aa, 1);
}
   9812 
   9813 static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64,
   9814                                         IRTemp lo64, Long byteShift )
   9815 {
   9816    vassert(byteShift >= 1 && byteShift <= 7);
   9817    return
   9818       binop(Iop_Or64,
   9819             binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))),
   9820             binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift))
   9821       );
   9822 }
   9823 
/* PALIGNR: shift the 256-bit concatenation dV:sV right by imm8 bytes
   and return the low 128 bits.  The cases below walk the shift amount
   through the four 64-bit pieces dHi:dLo:sHi:sLo (most significant
   first). */
static IRTemp math_PALIGNR_XMM ( IRTemp sV, IRTemp dV, UInt imm8 )
{
   IRTemp res = newTemp(Ity_V128);
   IRTemp sHi = newTemp(Ity_I64);
   IRTemp sLo = newTemp(Ity_I64);
   IRTemp dHi = newTemp(Ity_I64);
   IRTemp dLo = newTemp(Ity_I64);
   IRTemp rHi = newTemp(Ity_I64);
   IRTemp rLo = newTemp(Ity_I64);

   assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

   if (imm8 == 0) {
      /* No shift: result is sV unchanged. */
      assign( rHi, mkexpr(sHi) );
      assign( rLo, mkexpr(sLo) );
   }
   else if (imm8 >= 1 && imm8 <= 7) {
      /* Shift within the sV half; low bytes of dLo enter at the top. */
      assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, imm8) );
      assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, imm8) );
   }
   else if (imm8 == 8) {
      /* Exactly one 64-bit piece. */
      assign( rHi, mkexpr(dLo) );
      assign( rLo, mkexpr(sHi) );
   }
   else if (imm8 >= 9 && imm8 <= 15) {
      assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, imm8-8) );
      assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, imm8-8) );
   }
   else if (imm8 == 16) {
      /* Exactly two pieces: result is dV unchanged. */
      assign( rHi, mkexpr(dHi) );
      assign( rLo, mkexpr(dLo) );
   }
   else if (imm8 >= 17 && imm8 <= 23) {
      /* Only dV bytes remain; zeroes shift in from above. */
      assign( rHi, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(imm8-16))) );
      assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, imm8-16) );
   }
   else if (imm8 == 24) {
      assign( rHi, mkU64(0) );
      assign( rLo, mkexpr(dHi) );
   }
   else if (imm8 >= 25 && imm8 <= 31) {
      assign( rHi, mkU64(0) );
      assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(imm8-24))) );
   }
   else if (imm8 >= 32 && imm8 <= 255) {
      /* Everything shifted out: result is zero. */
      assign( rHi, mkU64(0) );
      assign( rLo, mkU64(0) );
   }
   else
      vassert(0);

   assign( res, binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo)));
   return res;
}
   9881 
   9882 
   9883 /* Generate a SIGSEGV followed by a restart of the current instruction
   9884    if effective_addr is not 16-aligned.  This is required behaviour
   9885    for some SSE3 instructions and all 128-bit SSSE3 instructions.
   9886    This assumes that guest_RIP_curr_instr is set correctly! */
static
void gen_SEGV_if_not_XX_aligned ( IRTemp effective_addr, ULong mask )
{
   /* Emit a side-exit delivering SIGSEGV, restarting at the current
      instruction, when any address bit selected by |mask| is set --
      i.e. when effective_addr is not (mask+1)-aligned.  |mask| must
      therefore be one less than a power of two. */
   stmt(
      IRStmt_Exit(
         binop(Iop_CmpNE64,
               binop(Iop_And64,mkexpr(effective_addr),mkU64(mask)),
               mkU64(0)),
         Ijk_SigSEGV,
         IRConst_U64(guest_RIP_curr_instr),
         OFFB_RIP
      )
   );
}
   9901 
/* SIGSEGV unless effective_addr is 16-byte aligned. */
static void gen_SEGV_if_not_16_aligned ( IRTemp effective_addr ) {
   gen_SEGV_if_not_XX_aligned(effective_addr, 16-1);
}

/* SIGSEGV unless effective_addr is 32-byte aligned. */
static void gen_SEGV_if_not_32_aligned ( IRTemp effective_addr ) {
   gen_SEGV_if_not_XX_aligned(effective_addr, 32-1);
}

/* SIGSEGV unless effective_addr is 64-byte aligned. */
static void gen_SEGV_if_not_64_aligned ( IRTemp effective_addr ) {
   gen_SEGV_if_not_XX_aligned(effective_addr, 64-1);
}
   9913 
   9914 /* Helper for deciding whether a given insn (starting at the opcode
   9915    byte) may validly be used with a LOCK prefix.  The following insns
   9916    may be used with LOCK when their destination operand is in memory.
   9917    AFAICS this is exactly the same for both 32-bit and 64-bit mode.
   9918 
   9919    ADD        80 /0,  81 /0,  82 /0,  83 /0,  00,  01
   9920    OR         80 /1,  81 /1,  82 /x,  83 /1,  08,  09
   9921    ADC        80 /2,  81 /2,  82 /2,  83 /2,  10,  11
   SBB        80 /3,  81 /3,  82 /x,  83 /3,  18,  19
   9923    AND        80 /4,  81 /4,  82 /x,  83 /4,  20,  21
   9924    SUB        80 /5,  81 /5,  82 /x,  83 /5,  28,  29
   9925    XOR        80 /6,  81 /6,  82 /x,  83 /6,  30,  31
   9926 
   9927    DEC        FE /1,  FF /1
   9928    INC        FE /0,  FF /0
   9929 
   9930    NEG        F6 /3,  F7 /3
   9931    NOT        F6 /2,  F7 /2
   9932 
   9933    XCHG       86, 87
   9934 
   9935    BTC        0F BB,  0F BA /7
   9936    BTR        0F B3,  0F BA /6
   9937    BTS        0F AB,  0F BA /5
   9938 
   9939    CMPXCHG    0F B0,  0F B1
   9940    CMPXCHG8B  0F C7 /1
   9941 
   9942    XADD       0F C0,  0F C1
   9943 
   9944    ------------------------------
   9945 
   9946    80 /0  =  addb $imm8,  rm8
   9947    81 /0  =  addl $imm32, rm32  and  addw $imm16, rm16
   9948    82 /0  =  addb $imm8,  rm8
   9949    83 /0  =  addl $simm8, rm32  and  addw $simm8, rm16
   9950 
   9951    00     =  addb r8,  rm8
   9952    01     =  addl r32, rm32  and  addw r16, rm16
   9953 
   9954    Same for ADD OR ADC SBB AND SUB XOR
   9955 
   9956    FE /1  = dec rm8
   9957    FF /1  = dec rm32  and  dec rm16
   9958 
   9959    FE /0  = inc rm8
   9960    FF /0  = inc rm32  and  inc rm16
   9961 
   9962    F6 /3  = neg rm8
   9963    F7 /3  = neg rm32  and  neg rm16
   9964 
   9965    F6 /2  = not rm8
   9966    F7 /2  = not rm32  and  not rm16
   9967 
   9968    0F BB     = btcw r16, rm16    and  btcl r32, rm32
   0F BA /7  = btcw $imm8, rm16  and  btcl $imm8, rm32
   9970 
   9971    Same for BTS, BTR
   9972 */
   9973 static Bool can_be_used_with_LOCK_prefix ( const UChar* opc )
   9974 {
   9975    switch (opc[0]) {
   9976       case 0x00: case 0x01: case 0x08: case 0x09:
   9977       case 0x10: case 0x11: case 0x18: case 0x19:
   9978       case 0x20: case 0x21: case 0x28: case 0x29:
   9979       case 0x30: case 0x31:
   9980          if (!epartIsReg(opc[1]))
   9981             return True;
   9982          break;
   9983 
   9984       case 0x80: case 0x81: case 0x82: case 0x83:
   9985          if (gregLO3ofRM(opc[1]) >= 0 && gregLO3ofRM(opc[1]) <= 6
   9986              && !epartIsReg(opc[1]))
   9987             return True;
   9988          break;
   9989 
   9990       case 0xFE: case 0xFF:
   9991          if (gregLO3ofRM(opc[1]) >= 0 && gregLO3ofRM(opc[1]) <= 1
   9992              && !epartIsReg(opc[1]))
   9993             return True;
   9994          break;
   9995 
   9996       case 0xF6: case 0xF7:
   9997          if (gregLO3ofRM(opc[1]) >= 2 && gregLO3ofRM(opc[1]) <= 3
   9998              && !epartIsReg(opc[1]))
   9999             return True;
   10000          break;
   10001 
   10002       case 0x86: case 0x87:
   10003          if (!epartIsReg(opc[1]))
   10004             return True;
   10005          break;
   10006 
   10007       case 0x0F: {
   10008          switch (opc[1]) {
   10009             case 0xBB: case 0xB3: case 0xAB:
   10010                if (!epartIsReg(opc[2]))
   10011                   return True;
   10012                break;
   10013             case 0xBA:
   10014                if (gregLO3ofRM(opc[2]) >= 5 && gregLO3ofRM(opc[2]) <= 7
   10015                    && !epartIsReg(opc[2]))
   10016                   return True;
   10017                break;
   10018             case 0xB0: case 0xB1:
   10019                if (!epartIsReg(opc[2]))
   10020                   return True;
   10021                break;
   10022             case 0xC7:
   10023                if (gregLO3ofRM(opc[2]) == 1 && !epartIsReg(opc[2]) )
   10024                   return True;
   10025                break;
   10026             case 0xC0: case 0xC1:
   10027                if (!epartIsReg(opc[2]))
   10028                   return True;
   10029                break;
   10030             default:
   10031                break;
   10032          } /* switch (opc[1]) */
   10033          break;
   10034       }
   10035 
   10036       default:
   10037          break;
   10038    } /* switch (opc[0]) */
   10039 
   10040    return False;
   10041 }
   10042 
   10043 
   10044 /*------------------------------------------------------------*/
   10045 /*---                                                      ---*/
   10046 /*--- Top-level SSE/SSE2: dis_ESC_0F__SSE2                 ---*/
   10047 /*---                                                      ---*/
   10048 /*------------------------------------------------------------*/
   10049 
static Long dis_COMISD ( const VexAbiInfo* vbi, Prefix pfx,
                         Long delta, Bool isAvx, UChar opc )
{
   /* (V)(U)COMISD: compare the low F64 lane of G (argL) against the
      low F64 lane of E, or a 64-bit memory operand, (argR) and dump
      the comparison result straight into the flags thunk. */
   vassert(opc == 0x2F/*COMISD*/ || opc == 0x2E/*UCOMISD*/);
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp argL  = newTemp(Ity_F64);
   IRTemp argR  = newTemp(Ity_F64);
   UChar  modrm = getUChar(delta);
   IRTemp addr  = IRTemp_INVALID;
   if (epartIsReg(modrm)) {
      assign( argR, getXMMRegLane64F( eregOfRexRM(pfx,modrm),
                                      0/*lowest lane*/ ) );
      delta += 1;
      DIP("%s%scomisd %s,%s\n", isAvx ? "v" : "",
                                opc==0x2E ? "u" : "",
                                nameXMMReg(eregOfRexRM(pfx,modrm)),
                                nameXMMReg(gregOfRexRM(pfx,modrm)) );
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argR, loadLE(Ity_F64, mkexpr(addr)) );
      delta += alen;
      DIP("%s%scomisd %s,%s\n", isAvx ? "v" : "",
                                opc==0x2E ? "u" : "",
                                dis_buf,
                                nameXMMReg(gregOfRexRM(pfx,modrm)) );
   }
   assign( argL, getXMMRegLane64F( gregOfRexRM(pfx,modrm),
                                   0/*lowest lane*/ ) );

   /* Flags: set the thunk to OP_COPY with DEP1 holding the CmpF64
      result masked down to 0x45 (the C/P/Z bit positions); all other
      flag bits come out as zero. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop( Iop_And64,
                   unop( Iop_32Uto64,
                         binop(Iop_CmpF64, mkexpr(argL), mkexpr(argR)) ),
                   mkU64(0x45)
       )));
   return delta;
}
   10091 
   10092 
/* Decode COMISS (0x2F) / UCOMISS (0x2E): like dis_COMISD, but the
   operands are the low F32 lanes, widened to F64 before comparison.
   Returns the updated instruction offset. */
static Long dis_COMISS ( const VexAbiInfo* vbi, Prefix pfx,
                         Long delta, Bool isAvx, UChar opc )
{
   vassert(opc == 0x2F/*COMISS*/ || opc == 0x2E/*UCOMISS*/);
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp argL  = newTemp(Ity_F32);
   IRTemp argR  = newTemp(Ity_F32);
   UChar  modrm = getUChar(delta);
   IRTemp addr  = IRTemp_INVALID;
   if (epartIsReg(modrm)) {
      /* Register source: right arg is the low 32-bit lane of E. */
      assign( argR, getXMMRegLane32F( eregOfRexRM(pfx,modrm),
                                      0/*lowest lane*/ ) );
      delta += 1;
      DIP("%s%scomiss %s,%s\n", isAvx ? "v" : "",
                                opc==0x2E ? "u" : "",
                                nameXMMReg(eregOfRexRM(pfx,modrm)),
                                nameXMMReg(gregOfRexRM(pfx,modrm)) );
   } else {
      /* Memory source: load a single F32 from the effective address. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argR, loadLE(Ity_F32, mkexpr(addr)) );
      delta += alen;
      DIP("%s%scomiss %s,%s\n", isAvx ? "v" : "",
                                opc==0x2E ? "u" : "",
                                dis_buf,
                                nameXMMReg(gregOfRexRM(pfx,modrm)) );
   }
   assign( argL, getXMMRegLane32F( gregOfRexRM(pfx,modrm),
                                   0/*lowest lane*/ ) );

   /* Widen both args to F64 and compare; install the masked result
      in the flags thunk exactly as in dis_COMISD. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop( Iop_And64,
                   unop( Iop_32Uto64,
                         binop(Iop_CmpF64,
                               unop(Iop_F32toF64,mkexpr(argL)),
                               unop(Iop_F32toF64,mkexpr(argR)))),
                   mkU64(0x45)
       )));
   return delta;
}
   10136 
   10137 
/* Decode (V)PSHUFD xmm: permute the four 32-bit lanes of the source
   (register or memory) according to the immediate byte following the
   operand, two selector bits per destination lane.  If 'writesYmm'
   the result is written with the upper YMM half zeroed (AVX form);
   otherwise only the XMM register is written.  Returns the updated
   instruction offset. */
static Long dis_PSHUFD_32x4 ( const VexAbiInfo* vbi, Prefix pfx,
                              Long delta, Bool writesYmm )
{
   Int    order;
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp sV    = newTemp(Ity_V128);
   UChar  modrm = getUChar(delta);
   const HChar* strV  = writesYmm ? "v" : "";
   IRTemp addr  = IRTemp_INVALID;
   if (epartIsReg(modrm)) {
      assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
      /* Immediate follows the modrm byte. */
      order = (Int)getUChar(delta+1);
      delta += 1+1;
      DIP("%spshufd $%d,%s,%s\n", strV, order,
                                  nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
                        1/*byte after the amode*/ );
      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
      /* Immediate follows the amode. */
      order = (Int)getUChar(delta+alen);
      delta += alen+1;
      DIP("%spshufd $%d,%s,%s\n", strV, order,
                                 dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
   }

   IRTemp s3, s2, s1, s0;
   s3 = s2 = s1 = s0 = IRTemp_INVALID;
   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );

   /* SEL(n) picks source lane n; each 2-bit field of 'order' selects
      the lane for one destination position. */
#  define SEL(n)  ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   IRTemp dV = newTemp(Ity_V128);
   assign(dV,
          mkV128from32s( SEL((order>>6)&3), SEL((order>>4)&3),
                         SEL((order>>2)&3), SEL((order>>0)&3) )
   );
#  undef SEL

   (writesYmm ? putYMMRegLoAndZU : putXMMReg)
      (gregOfRexRM(pfx,modrm), mkexpr(dV));
   return delta;
}
   10182 
   10183 
/* Decode VPSHUFD ymm (256-bit form): apply the same 4-lane shuffle,
   selected by the trailing immediate, independently to each 128-bit
   half of the source.  Returns the updated instruction offset. */
static Long dis_PSHUFD_32x8 ( const VexAbiInfo* vbi, Prefix pfx, Long delta )
{
   Int    order;
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp sV    = newTemp(Ity_V256);
   UChar  modrm = getUChar(delta);
   IRTemp addr  = IRTemp_INVALID;
   UInt   rG    = gregOfRexRM(pfx,modrm);
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( sV, getYMMReg(rE) );
      /* Immediate follows the modrm byte. */
      order = (Int)getUChar(delta+1);
      delta += 1+1;
      DIP("vpshufd $%d,%s,%s\n", order, nameYMMReg(rE), nameYMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
                        1/*byte after the amode*/ );
      assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
      /* Immediate follows the amode. */
      order = (Int)getUChar(delta+alen);
      delta += alen+1;
      DIP("vpshufd $%d,%s,%s\n", order,  dis_buf, nameYMMReg(rG));
   }

   /* s[0..3] are the low-half lanes, s[4..7] the high-half lanes. */
   IRTemp s[8];
   s[7] = s[6] = s[5] = s[4] = s[3] = s[2] = s[1] = s[0] = IRTemp_INVALID;
   breakupV256to32s( sV, &s[7], &s[6], &s[5], &s[4],
                         &s[3], &s[2], &s[1], &s[0] );

   /* Index base 4 for the upper half, 0 for the lower half, so the
      same selector bits are applied to both halves. */
   putYMMReg( rG, mkV256from32s( s[4 + ((order>>6)&3)],
                                 s[4 + ((order>>4)&3)],
                                 s[4 + ((order>>2)&3)],
                                 s[4 + ((order>>0)&3)],
                                 s[0 + ((order>>6)&3)],
                                 s[0 + ((order>>4)&3)],
                                 s[0 + ((order>>2)&3)],
                                 s[0 + ((order>>0)&3)] ) );
   return delta;
}
   10223 
   10224 
   10225 static IRTemp math_PSRLDQ ( IRTemp sV, Int imm )
   10226 {
   10227    IRTemp dV    = newTemp(Ity_V128);
   10228    IRTemp hi64  = newTemp(Ity_I64);
   10229    IRTemp lo64  = newTemp(Ity_I64);
   10230    IRTemp hi64r = newTemp(Ity_I64);
   10231    IRTemp lo64r = newTemp(Ity_I64);
   10232 
   10233    vassert(imm >= 0 && imm <= 255);
   10234    if (imm >= 16) {
   10235       assign(dV, mkV128(0x0000));
   10236       return dV;
   10237    }
   10238 
   10239    assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
   10240    assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
   10241 
   10242    if (imm == 0) {
   10243       assign( lo64r, mkexpr(lo64) );
   10244       assign( hi64r, mkexpr(hi64) );
   10245    }
   10246    else
   10247    if (imm == 8) {
   10248       assign( hi64r, mkU64(0) );
   10249       assign( lo64r, mkexpr(hi64) );
   10250    }
   10251    else
   10252    if (imm > 8) {
   10253       assign( hi64r, mkU64(0) );
   10254       assign( lo64r, binop( Iop_Shr64, mkexpr(hi64), mkU8( 8*(imm-8) ) ));
   10255    } else {
   10256       assign( hi64r, binop( Iop_Shr64, mkexpr(hi64), mkU8(8 * imm) ));
   10257       assign( lo64r,
   10258               binop( Iop_Or64,
   10259                      binop(Iop_Shr64, mkexpr(lo64),
   10260                            mkU8(8 * imm)),
   10261                      binop(Iop_Shl64, mkexpr(hi64),
   10262                            mkU8(8 * (8 - imm)) )
   10263                      )
   10264               );
   10265    }
   10266 
   10267    assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
   10268    return dV;
   10269 }
   10270 
   10271 
   10272 static IRTemp math_PSLLDQ ( IRTemp sV, Int imm )
   10273 {
   10274    IRTemp       dV    = newTemp(Ity_V128);
   10275    IRTemp       hi64  = newTemp(Ity_I64);
   10276    IRTemp       lo64  = newTemp(Ity_I64);
   10277    IRTemp       hi64r = newTemp(Ity_I64);
   10278    IRTemp       lo64r = newTemp(Ity_I64);
   10279 
   10280    vassert(imm >= 0 && imm <= 255);
   10281    if (imm >= 16) {
   10282       assign(dV, mkV128(0x0000));
   10283       return dV;
   10284    }
   10285 
   10286    assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
   10287    assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
   10288 
   10289    if (imm == 0) {
   10290       assign( lo64r, mkexpr(lo64) );
   10291       assign( hi64r, mkexpr(hi64) );
   10292    }
   10293    else
   10294    if (imm == 8) {
   10295       assign( lo64r, mkU64(0) );
   10296       assign( hi64r, mkexpr(lo64) );
   10297    }
   10298    else
   10299    if (imm > 8) {
   10300       assign( lo64r, mkU64(0) );
   10301       assign( hi64r, binop( Iop_Shl64, mkexpr(lo64), mkU8( 8*(imm-8) ) ));
   10302    } else {
   10303       assign( lo64r, binop( Iop_Shl64, mkexpr(lo64), mkU8(8 * imm) ));
   10304       assign( hi64r,
   10305               binop( Iop_Or64,
   10306                      binop(Iop_Shl64, mkexpr(hi64),
   10307                            mkU8(8 * imm)),
   10308                      binop(Iop_Shr64, mkexpr(lo64),
   10309                            mkU8(8 * (8 - imm)) )
   10310                      )
   10311               );
   10312    }
   10313 
   10314    assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
   10315    return dV;
   10316 }
   10317 
   10318 
/* Decode CVTSD2SI (0x2D) / CVTTSD2SI (0x2C): convert the low F64 of
   the E operand (register or memory) to a 32- or 64-bit signed
   integer in the G register, per 'sz'.  The T variant truncates
   (rounds towards zero); otherwise the current SSE rounding mode is
   used.  Returns the updated instruction offset. */
static Long dis_CVTxSD2SI ( const VexAbiInfo* vbi, Prefix pfx,
                            Long delta, Bool isAvx, UChar opc, Int sz )
{
   vassert(opc == 0x2D/*CVTSD2SI*/ || opc == 0x2C/*CVTTSD2SI*/);
   HChar  dis_buf[50];
   Int    alen   = 0;
   UChar  modrm  = getUChar(delta);
   IRTemp addr   = IRTemp_INVALID;
   IRTemp rmode  = newTemp(Ity_I32);
   IRTemp f64lo  = newTemp(Ity_F64);
   Bool   r2zero = toBool(opc == 0x2C);   /* truncating variant? */

   if (epartIsReg(modrm)) {
      delta += 1;
      assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
      DIP("%scvt%ssd2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
                                  nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameIReg(sz, gregOfRexRM(pfx,modrm),
                                           False));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
      delta += alen;
      DIP("%scvt%ssd2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
                                  dis_buf,
                                  nameIReg(sz, gregOfRexRM(pfx,modrm),
                                           False));
   }

   /* Truncating form forces round-to-zero; otherwise use MXCSR. */
   if (r2zero) {
      assign( rmode, mkU32((UInt)Irrm_ZERO) );
   } else {
      assign( rmode, get_sse_roundingmode() );
   }

   if (sz == 4) {
      putIReg32( gregOfRexRM(pfx,modrm),
                 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo)) );
   } else {
      vassert(sz == 8);
      putIReg64( gregOfRexRM(pfx,modrm),
                 binop( Iop_F64toI64S, mkexpr(rmode), mkexpr(f64lo)) );
   }

   return delta;
}
   10365 
   10366 
/* Decode CVTSS2SI (0x2D) / CVTTSS2SI (0x2C): convert the low F32 of
   the E operand to a 32- or 64-bit signed integer in the G register,
   per 'sz'.  The source is widened F32->F64 first (exact), then
   converted with the selected rounding mode.  The T variant
   truncates; otherwise the current SSE rounding mode is used.
   Returns the updated instruction offset. */
static Long dis_CVTxSS2SI ( const VexAbiInfo* vbi, Prefix pfx,
                            Long delta, Bool isAvx, UChar opc, Int sz )
{
   vassert(opc == 0x2D/*CVTSS2SI*/ || opc == 0x2C/*CVTTSS2SI*/);
   HChar  dis_buf[50];
   Int    alen   = 0;
   UChar  modrm  = getUChar(delta);
   IRTemp addr   = IRTemp_INVALID;
   IRTemp rmode  = newTemp(Ity_I32);
   IRTemp f32lo  = newTemp(Ity_F32);
   Bool   r2zero = toBool(opc == 0x2C);   /* truncating variant? */

   if (epartIsReg(modrm)) {
      delta += 1;
      assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
      DIP("%scvt%sss2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
                                  nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameIReg(sz, gregOfRexRM(pfx,modrm),
                                           False));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
      delta += alen;
      DIP("%scvt%sss2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
                                  dis_buf,
                                  nameIReg(sz, gregOfRexRM(pfx,modrm),
                                           False));
   }

   /* Truncating form forces round-to-zero; otherwise use MXCSR. */
   if (r2zero) {
      assign( rmode, mkU32((UInt)Irrm_ZERO) );
   } else {
      assign( rmode, get_sse_roundingmode() );
   }

   if (sz == 4) {
      putIReg32( gregOfRexRM(pfx,modrm),
                 binop( Iop_F64toI32S,
                        mkexpr(rmode),
                        unop(Iop_F32toF64, mkexpr(f32lo))) );
   } else {
      vassert(sz == 8);
      putIReg64( gregOfRexRM(pfx,modrm),
                 binop( Iop_F64toI64S,
                        mkexpr(rmode),
                        unop(Iop_F32toF64, mkexpr(f32lo))) );
   }

   return delta;
}
   10417 
   10418 
/* Decode (V)CVTPS2PD, 128-bit form: widen the two low F32 lanes of
   the E operand to F64 and write them to the low and high 64-bit
   lanes of the G register.  Widening is exact, so no rounding mode is
   needed.  The AVX form additionally zeroes the upper YMM half.
   Returns the updated instruction offset. */
static Long dis_CVTPS2PD_128 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp f32lo = newTemp(Ity_F32);
   IRTemp f32hi = newTemp(Ity_F32);
   UChar  modrm = getUChar(delta);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( f32lo, getXMMRegLane32F(rE, 0) );
      assign( f32hi, getXMMRegLane32F(rE, 1) );
      delta += 1;
      DIP("%scvtps2pd %s,%s\n",
          isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
   } else {
      /* Memory form only reads the low 8 bytes (two F32s). */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( f32lo, loadLE(Ity_F32, mkexpr(addr)) );
      assign( f32hi, loadLE(Ity_F32,
                            binop(Iop_Add64,mkexpr(addr),mkU64(4))) );
      delta += alen;
      DIP("%scvtps2pd %s,%s\n",
          isAvx ? "v" : "", dis_buf, nameXMMReg(rG));
   }

   putXMMRegLane64F( rG, 1, unop(Iop_F32toF64, mkexpr(f32hi)) );
   putXMMRegLane64F( rG, 0, unop(Iop_F32toF64, mkexpr(f32lo)) );
   if (isAvx)
      putYMMRegLane128( rG, 1, mkV128(0));
   return delta;
}
   10452 
   10453 
/* Decode VCVTPS2PD, 256-bit form: widen the four F32 lanes of the
   128-bit E operand to F64 and write them to the four 64-bit lanes of
   the G YMM register.  Widening is exact, so no rounding mode is
   needed.  Returns the updated instruction offset. */
static Long dis_CVTPS2PD_256 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp f32_0 = newTemp(Ity_F32);
   IRTemp f32_1 = newTemp(Ity_F32);
   IRTemp f32_2 = newTemp(Ity_F32);
   IRTemp f32_3 = newTemp(Ity_F32);
   UChar  modrm = getUChar(delta);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( f32_0, getXMMRegLane32F(rE, 0) );
      assign( f32_1, getXMMRegLane32F(rE, 1) );
      assign( f32_2, getXMMRegLane32F(rE, 2) );
      assign( f32_3, getXMMRegLane32F(rE, 3) );
      delta += 1;
      DIP("vcvtps2pd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
   } else {
      /* Memory form reads 16 bytes: four consecutive F32s. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( f32_0, loadLE(Ity_F32, mkexpr(addr)) );
      assign( f32_1, loadLE(Ity_F32,
                            binop(Iop_Add64,mkexpr(addr),mkU64(4))) );
      assign( f32_2, loadLE(Ity_F32,
                            binop(Iop_Add64,mkexpr(addr),mkU64(8))) );
      assign( f32_3, loadLE(Ity_F32,
                            binop(Iop_Add64,mkexpr(addr),mkU64(12))) );
      delta += alen;
      DIP("vcvtps2pd %s,%s\n", dis_buf, nameYMMReg(rG));
   }

   putYMMRegLane64F( rG, 3, unop(Iop_F32toF64, mkexpr(f32_3)) );
   putYMMRegLane64F( rG, 2, unop(Iop_F32toF64, mkexpr(f32_2)) );
   putYMMRegLane64F( rG, 1, unop(Iop_F32toF64, mkexpr(f32_1)) );
   putYMMRegLane64F( rG, 0, unop(Iop_F32toF64, mkexpr(f32_0)) );
   return delta;
}
   10493 
   10494 
/* Decode (V)CVTPD2PS, 128-bit form: narrow the two F64 lanes of the
   E operand to F32 in the two low lanes of the G register, zeroing
   the two high 32-bit lanes.  Narrowing can round, so the current SSE
   rounding mode is used.  The AVX form additionally zeroes the upper
   YMM half.  Returns the updated instruction offset. */
static Long dis_CVTPD2PS_128 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp argV  = newTemp(Ity_V128);
   IRTemp rmode = newTemp(Ity_I32);
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getXMMReg(rE) );
      delta += 1;
      DIP("%scvtpd2ps %s,%s\n", isAvx ? "v" : "",
          nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      DIP("%scvtpd2ps %s,%s\n", isAvx ? "v" : "",
          dis_buf, nameXMMReg(rG) );
   }

   assign( rmode, get_sse_roundingmode() );
   /* Reinterpret the two 64-bit halves as F64 values. */
   IRTemp t0 = newTemp(Ity_F64);
   IRTemp t1 = newTemp(Ity_F64);
   assign( t0, unop(Iop_ReinterpI64asF64,
                    unop(Iop_V128to64, mkexpr(argV))) );
   assign( t1, unop(Iop_ReinterpI64asF64,
                    unop(Iop_V128HIto64, mkexpr(argV))) );

#  define CVT(_t)  binop( Iop_F64toF32, mkexpr(rmode), mkexpr(_t) )
   putXMMRegLane32(  rG, 3, mkU32(0) );
   putXMMRegLane32(  rG, 2, mkU32(0) );
   putXMMRegLane32F( rG, 1, CVT(t1) );
   putXMMRegLane32F( rG, 0, CVT(t0) );
#  undef CVT
   if (isAvx)
      putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}
   10538 
   10539 
/* Decode (V)CVTPS2DQ / (V)CVTTPS2DQ, 128-bit form: convert the four
   F32 lanes of the E operand to signed 32-bit integers in G.  The T
   variant ('r2zero') truncates; otherwise the current SSE rounding
   mode is used.  The AVX form zeroes the upper YMM half.  Returns the
   updated instruction offset. */
static Long dis_CVTxPS2DQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
                                Long delta, Bool isAvx, Bool r2zero )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp argV  = newTemp(Ity_V128);
   IRTemp rmode = newTemp(Ity_I32);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1, t2, t3;

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getXMMReg(rE) );
      delta += 1;
      DIP("%scvt%sps2dq %s,%s\n",
          isAvx ? "v" : "", r2zero ? "t" : "", nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      DIP("%scvt%sps2dq %s,%s\n",
          isAvx ? "v" : "", r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
   }

   assign( rmode, r2zero ? mkU32((UInt)Irrm_ZERO)
                         : get_sse_roundingmode() );
   t0 = t1 = t2 = t3 = IRTemp_INVALID;
   breakupV128to32s( argV, &t3, &t2, &t1, &t0 );
   /* This is less than ideal.  If it turns out to be a performance
      bottleneck it can be improved. */
   /* Each lane: reinterpret as F32, widen exactly to F64, then
      convert to I32 with the chosen rounding mode. */
#  define CVT(_t)                             \
      binop( Iop_F64toI32S,                   \
             mkexpr(rmode),                   \
             unop( Iop_F32toF64,              \
                   unop( Iop_ReinterpI32asF32, mkexpr(_t))) )

   putXMMRegLane32( rG, 3, CVT(t3) );
   putXMMRegLane32( rG, 2, CVT(t2) );
   putXMMRegLane32( rG, 1, CVT(t1) );
   putXMMRegLane32( rG, 0, CVT(t0) );
#  undef CVT
   if (isAvx)
      putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}
   10588 
   10589 
/* Decode VCVTPS2DQ / VCVTTPS2DQ, 256-bit form: convert all eight F32
   lanes of the E operand to signed 32-bit integers in the G YMM
   register.  The T variant ('r2zero') truncates; otherwise the
   current SSE rounding mode is used.  Returns the updated instruction
   offset. */
static Long dis_CVTxPS2DQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
                                Long delta, Bool r2zero )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp argV  = newTemp(Ity_V256);
   IRTemp rmode = newTemp(Ity_I32);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1, t2, t3, t4, t5, t6, t7;

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getYMMReg(rE) );
      delta += 1;
      DIP("vcvt%sps2dq %s,%s\n",
          r2zero ? "t" : "", nameYMMReg(rE), nameYMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
      delta += alen;
      DIP("vcvt%sps2dq %s,%s\n",
          r2zero ? "t" : "", dis_buf, nameYMMReg(rG) );
   }

   assign( rmode, r2zero ? mkU32((UInt)Irrm_ZERO)
                         : get_sse_roundingmode() );
   t0 = t1 = t2 = t3 = t4 = t5 = t6 = t7 = IRTemp_INVALID;
   breakupV256to32s( argV, &t7, &t6, &t5, &t4, &t3, &t2, &t1, &t0 );
   /* This is less than ideal.  If it turns out to be a performance
      bottleneck it can be improved. */
   /* Each lane: reinterpret as F32, widen exactly to F64, then
      convert to I32 with the chosen rounding mode. */
#  define CVT(_t)                             \
      binop( Iop_F64toI32S,                   \
             mkexpr(rmode),                   \
             unop( Iop_F32toF64,              \
                   unop( Iop_ReinterpI32asF32, mkexpr(_t))) )

   putYMMRegLane32( rG, 7, CVT(t7) );
   putYMMRegLane32( rG, 6, CVT(t6) );
   putYMMRegLane32( rG, 5, CVT(t5) );
   putYMMRegLane32( rG, 4, CVT(t4) );
   putYMMRegLane32( rG, 3, CVT(t3) );
   putYMMRegLane32( rG, 2, CVT(t2) );
   putYMMRegLane32( rG, 1, CVT(t1) );
   putYMMRegLane32( rG, 0, CVT(t0) );
#  undef CVT

   return delta;
}
   10640 
   10641 
/* Decode (V)CVTPD2DQ / (V)CVTTPD2DQ, 128-bit source form: convert the
   two F64 lanes of the E operand to signed 32-bit integers in the two
   low lanes of G, zeroing the two high lanes.  The T variant
   ('r2zero') truncates; otherwise the current SSE rounding mode is
   used.  The AVX form zeroes the upper YMM half.  Returns the updated
   instruction offset. */
static Long dis_CVTxPD2DQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
                                Long delta, Bool isAvx, Bool r2zero )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp argV  = newTemp(Ity_V128);
   IRTemp rmode = newTemp(Ity_I32);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1;

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getXMMReg(rE) );
      delta += 1;
      DIP("%scvt%spd2dq %s,%s\n",
          isAvx ? "v" : "", r2zero ? "t" : "", nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      /* "x" suffix: with a memory operand the source width is not
         implied by a register name, so print the xmm-width form. */
      DIP("%scvt%spd2dqx %s,%s\n",
          isAvx ? "v" : "", r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
   }

   if (r2zero) {
      assign(rmode, mkU32((UInt)Irrm_ZERO) );
   } else {
      assign( rmode, get_sse_roundingmode() );
   }

   /* Reinterpret the two 64-bit halves as F64 values. */
   t0 = newTemp(Ity_F64);
   t1 = newTemp(Ity_F64);
   assign( t0, unop(Iop_ReinterpI64asF64,
                    unop(Iop_V128to64, mkexpr(argV))) );
   assign( t1, unop(Iop_ReinterpI64asF64,
                    unop(Iop_V128HIto64, mkexpr(argV))) );

#  define CVT(_t)  binop( Iop_F64toI32S,                   \
                          mkexpr(rmode),                   \
                          mkexpr(_t) )

   putXMMRegLane32( rG, 3, mkU32(0) );
   putXMMRegLane32( rG, 2, mkU32(0) );
   putXMMRegLane32( rG, 1, CVT(t1) );
   putXMMRegLane32( rG, 0, CVT(t0) );
#  undef CVT
   if (isAvx)
      putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}
   10695 
   10696 
/* Decode VCVTPD2DQ / VCVTTPD2DQ, 256-bit source form: convert the
   four F64 lanes of the E operand to signed 32-bit integers in the G
   XMM register, zeroing the upper YMM half.  The T variant ('r2zero')
   truncates; otherwise the current SSE rounding mode is used.
   Returns the updated instruction offset. */
static Long dis_CVTxPD2DQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
                                Long delta, Bool r2zero )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp argV  = newTemp(Ity_V256);
   IRTemp rmode = newTemp(Ity_I32);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1, t2, t3;

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getYMMReg(rE) );
      delta += 1;
      DIP("vcvt%spd2dq %s,%s\n",
          r2zero ? "t" : "", nameYMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
      delta += alen;
      /* "y" suffix: with a memory operand the source width is not
         implied by a register name, so print the ymm-width form. */
      DIP("vcvt%spd2dqy %s,%s\n",
          r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
   }

   if (r2zero) {
      assign(rmode, mkU32((UInt)Irrm_ZERO) );
   } else {
      assign( rmode, get_sse_roundingmode() );
   }

   t0 = IRTemp_INVALID;
   t1 = IRTemp_INVALID;
   t2 = IRTemp_INVALID;
   t3 = IRTemp_INVALID;
   breakupV256to64s( argV, &t3, &t2, &t1, &t0 );

   /* Each 64-bit lane: reinterpret as F64, then convert to I32 with
      the chosen rounding mode. */
#  define CVT(_t)  binop( Iop_F64toI32S,                   \
                          mkexpr(rmode),                   \
                          unop( Iop_ReinterpI64asF64,      \
                                mkexpr(_t) ) )

   putXMMRegLane32( rG, 3, CVT(t3) );
   putXMMRegLane32( rG, 2, CVT(t2) );
   putXMMRegLane32( rG, 1, CVT(t1) );
   putXMMRegLane32( rG, 0, CVT(t0) );
#  undef CVT
   putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}
   10749 
   10750 
/* Decode (V)CVTDQ2PS, 128-bit form: convert the four signed 32-bit
   integer lanes of the E operand to F32 in G.  Each lane is widened
   exactly I32->F64, then narrowed to F32 using the current SSE
   rounding mode.  The AVX form zeroes the upper YMM half.  Returns
   the updated instruction offset. */
static Long dis_CVTDQ2PS_128 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp argV  = newTemp(Ity_V128);
   IRTemp rmode = newTemp(Ity_I32);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1, t2, t3;

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getXMMReg(rE) );
      delta += 1;
      DIP("%scvtdq2ps %s,%s\n",
          isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      DIP("%scvtdq2ps %s,%s\n",
          isAvx ? "v" : "", dis_buf, nameXMMReg(rG) );
   }

   assign( rmode, get_sse_roundingmode() );
   t0 = IRTemp_INVALID;
   t1 = IRTemp_INVALID;
   t2 = IRTemp_INVALID;
   t3 = IRTemp_INVALID;
   breakupV128to32s( argV, &t3, &t2, &t1, &t0 );

#  define CVT(_t)  binop( Iop_F64toF32,                    \
                          mkexpr(rmode),                   \
                          unop(Iop_I32StoF64,mkexpr(_t)))

   putXMMRegLane32F( rG, 3, CVT(t3) );
   putXMMRegLane32F( rG, 2, CVT(t2) );
   putXMMRegLane32F( rG, 1, CVT(t1) );
   putXMMRegLane32F( rG, 0, CVT(t0) );
#  undef CVT
   if (isAvx)
      putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}
   10798 
/* Decode VCVTDQ2PS, 256-bit form: convert all eight signed 32-bit
   integer lanes of the E operand to F32 in the G YMM register.  Each
   lane is widened exactly I32->F64, then narrowed to F32 using the
   current SSE rounding mode.  Returns the updated instruction
   offset. */
static Long dis_CVTDQ2PS_256 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   UChar  modrm  = getUChar(delta);
   IRTemp argV   = newTemp(Ity_V256);
   IRTemp rmode  = newTemp(Ity_I32);
   UInt   rG     = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1, t2, t3, t4, t5, t6, t7;

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getYMMReg(rE) );
      delta += 1;
      DIP("vcvtdq2ps %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
      delta += alen;
      DIP("vcvtdq2ps %s,%s\n", dis_buf, nameYMMReg(rG) );
   }

   assign( rmode, get_sse_roundingmode() );
   t0 = IRTemp_INVALID;
   t1 = IRTemp_INVALID;
   t2 = IRTemp_INVALID;
   t3 = IRTemp_INVALID;
   t4 = IRTemp_INVALID;
   t5 = IRTemp_INVALID;
   t6 = IRTemp_INVALID;
   t7 = IRTemp_INVALID;
   breakupV256to32s( argV, &t7, &t6, &t5, &t4, &t3, &t2, &t1, &t0 );

#  define CVT(_t)  binop( Iop_F64toF32,                    \
                          mkexpr(rmode),                   \
                          unop(Iop_I32StoF64,mkexpr(_t)))

   putYMMRegLane32F( rG, 7, CVT(t7) );
   putYMMRegLane32F( rG, 6, CVT(t6) );
   putYMMRegLane32F( rG, 5, CVT(t5) );
   putYMMRegLane32F( rG, 4, CVT(t4) );
   putYMMRegLane32F( rG, 3, CVT(t3) );
   putYMMRegLane32F( rG, 2, CVT(t2) );
   putYMMRegLane32F( rG, 1, CVT(t1) );
   putYMMRegLane32F( rG, 0, CVT(t0) );
#  undef CVT

   return delta;
}
   10850 
   10851 
   10852 static Long dis_PMOVMSKB_128 ( const VexAbiInfo* vbi, Prefix pfx,
   10853                                Long delta, Bool isAvx )
   10854 {
   10855    UChar modrm = getUChar(delta);
   10856    vassert(epartIsReg(modrm)); /* ensured by caller */
   10857    UInt   rE = eregOfRexRM(pfx,modrm);
   10858    UInt   rG = gregOfRexRM(pfx,modrm);
   10859    IRTemp t0 = newTemp(Ity_V128);
   10860    IRTemp t1 = newTemp(Ity_I32);
   10861    assign(t0, getXMMReg(rE));
   10862    assign(t1, unop(Iop_16Uto32, unop(Iop_GetMSBs8x16, mkexpr(t0))));
   10863    putIReg32(rG, mkexpr(t1));
   10864    DIP("%spmovmskb %s,%s\n", isAvx ? "v" : "", nameXMMReg(rE),
   10865        nameIReg32(rG));
   10866    delta += 1;
   10867    return delta;
   10868 }
   10869 
   10870 
   10871 static Long dis_PMOVMSKB_256 ( const VexAbiInfo* vbi, Prefix pfx,
   10872                                Long delta  )
   10873 {
   10874    UChar modrm = getUChar(delta);
   10875    vassert(epartIsReg(modrm)); /* ensured by caller */
   10876    UInt   rE = eregOfRexRM(pfx,modrm);
   10877    UInt   rG = gregOfRexRM(pfx,modrm);
   10878    IRTemp t0 = newTemp(Ity_V128);
   10879    IRTemp t1 = newTemp(Ity_V128);
   10880    IRTemp t2 = newTemp(Ity_I16);
   10881    IRTemp t3 = newTemp(Ity_I16);
   10882    assign(t0, getYMMRegLane128(rE, 0));
   10883    assign(t1, getYMMRegLane128(rE, 1));
   10884    assign(t2, unop(Iop_GetMSBs8x16, mkexpr(t0)));
   10885    assign(t3, unop(Iop_GetMSBs8x16, mkexpr(t1)));
   10886    putIReg32(rG, binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)));
   10887    DIP("vpmovmskb %s,%s\n", nameYMMReg(rE), nameIReg32(rG));
   10888    delta += 1;
   10889    return delta;
   10890 }
   10891 
   10892 
   10893 /* FIXME: why not just use InterleaveLO / InterleaveHI?  I think the
   10894    relevant ops are "xIsH ? InterleaveHI32x4 : InterleaveLO32x4". */
   10895 /* Does the maths for 128 bit versions of UNPCKLPS and UNPCKHPS */
   10896 static IRTemp math_UNPCKxPS_128 ( IRTemp sV, IRTemp dV, Bool xIsH )
   10897 {
   10898    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   10899    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   10900    breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   10901    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   10902    IRTemp res = newTemp(Ity_V128);
   10903    assign(res,  xIsH ? mkV128from32s( s3, d3, s2, d2 )
   10904                      : mkV128from32s( s1, d1, s0, d0 ));
   10905    return res;
   10906 }
   10907 
   10908 
   10909 /* FIXME: why not just use InterleaveLO / InterleaveHI ?? */
   10910 /* Does the maths for 128 bit versions of UNPCKLPD and UNPCKHPD */
   10911 static IRTemp math_UNPCKxPD_128 ( IRTemp sV, IRTemp dV, Bool xIsH )
   10912 {
   10913    IRTemp s1 = newTemp(Ity_I64);
   10914    IRTemp s0 = newTemp(Ity_I64);
   10915    IRTemp d1 = newTemp(Ity_I64);
   10916    IRTemp d0 = newTemp(Ity_I64);
   10917    assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
   10918    assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
   10919    assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
   10920    assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
   10921    IRTemp res = newTemp(Ity_V128);
   10922    assign(res, xIsH ? binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1))
   10923                     : binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)));
   10924    return res;
   10925 }
   10926 
   10927 
   10928 /* Does the maths for 256 bit versions of UNPCKLPD and UNPCKHPD.
   10929    Doesn't seem like this fits in either of the Iop_Interleave{LO,HI}
   10930    or the Iop_Cat{Odd,Even}Lanes idioms, hence just do it the stupid
   10931    way. */
   10932 static IRTemp math_UNPCKxPD_256 ( IRTemp sV, IRTemp dV, Bool xIsH )
   10933 {
   10934    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   10935    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   10936    breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
   10937    breakupV256to64s( sV, &s3, &s2, &s1, &s0 );
   10938    IRTemp res = newTemp(Ity_V256);
   10939    assign(res, xIsH
   10940                ? IRExpr_Qop(Iop_64x4toV256, mkexpr(s3), mkexpr(d3),
   10941                                             mkexpr(s1), mkexpr(d1))
   10942                : IRExpr_Qop(Iop_64x4toV256, mkexpr(s2), mkexpr(d2),
   10943                                             mkexpr(s0), mkexpr(d0)));
   10944    return res;
   10945 }
   10946 
   10947 
   10948 /* FIXME: this is really bad.  Surely can do something better here?
   10949    One observation is that the steering in the upper and lower 128 bit
   10950    halves is the same as with math_UNPCKxPS_128, so we simply split
   10951    into two halves, and use that.  Consequently any improvement in
   10952    math_UNPCKxPS_128 (probably, to use interleave-style primops)
   10953    benefits this too. */
   10954 static IRTemp math_UNPCKxPS_256 ( IRTemp sV, IRTemp dV, Bool xIsH )
   10955 {
   10956    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   10957    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   10958    breakupV256toV128s( sV, &sVhi, &sVlo );
   10959    breakupV256toV128s( dV, &dVhi, &dVlo );
   10960    IRTemp rVhi = math_UNPCKxPS_128(sVhi, dVhi, xIsH);
   10961    IRTemp rVlo = math_UNPCKxPS_128(sVlo, dVlo, xIsH);
   10962    IRTemp rV   = newTemp(Ity_V256);
   10963    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   10964    return rV;
   10965 }
   10966 
   10967 
   10968 static IRTemp math_SHUFPS_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   10969 {
   10970    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   10971    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   10972    vassert(imm8 < 256);
   10973 
   10974    breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   10975    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   10976 
   10977 #  define SELD(n) ((n)==0 ? d0 : ((n)==1 ? d1 : ((n)==2 ? d2 : d3)))
   10978 #  define SELS(n) ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   10979    IRTemp res = newTemp(Ity_V128);
   10980    assign(res,
   10981           mkV128from32s( SELS((imm8>>6)&3), SELS((imm8>>4)&3),
   10982                          SELD((imm8>>2)&3), SELD((imm8>>0)&3) ) );
   10983 #  undef SELD
   10984 #  undef SELS
   10985    return res;
   10986 }
   10987 
   10988 
   10989 /* 256-bit SHUFPS appears to steer each of the 128-bit halves
   10990    identically.  Hence do the clueless thing and use math_SHUFPS_128
   10991    twice. */
   10992 static IRTemp math_SHUFPS_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
   10993 {
   10994    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   10995    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   10996    breakupV256toV128s( sV, &sVhi, &sVlo );
   10997    breakupV256toV128s( dV, &dVhi, &dVlo );
   10998    IRTemp rVhi = math_SHUFPS_128(sVhi, dVhi, imm8);
   10999    IRTemp rVlo = math_SHUFPS_128(sVlo, dVlo, imm8);
   11000    IRTemp rV   = newTemp(Ity_V256);
   11001    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   11002    return rV;
   11003 }
   11004 
   11005 
   11006 static IRTemp math_SHUFPD_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11007 {
   11008    IRTemp s1 = newTemp(Ity_I64);
   11009    IRTemp s0 = newTemp(Ity_I64);
   11010    IRTemp d1 = newTemp(Ity_I64);
   11011    IRTemp d0 = newTemp(Ity_I64);
   11012 
   11013    assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
   11014    assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
   11015    assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
   11016    assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
   11017 
   11018 #  define SELD(n) mkexpr((n)==0 ? d0 : d1)
   11019 #  define SELS(n) mkexpr((n)==0 ? s0 : s1)
   11020 
   11021    IRTemp res = newTemp(Ity_V128);
   11022    assign(res, binop( Iop_64HLtoV128,
   11023                       SELS((imm8>>1)&1), SELD((imm8>>0)&1) ) );
   11024 
   11025 #  undef SELD
   11026 #  undef SELS
   11027    return res;
   11028 }
   11029 
   11030 
   11031 static IRTemp math_SHUFPD_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11032 {
   11033    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   11034    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   11035    breakupV256toV128s( sV, &sVhi, &sVlo );
   11036    breakupV256toV128s( dV, &dVhi, &dVlo );
   11037    IRTemp rVhi = math_SHUFPD_128(sVhi, dVhi, (imm8 >> 2) & 3);
   11038    IRTemp rVlo = math_SHUFPD_128(sVlo, dVlo, imm8 & 3);
   11039    IRTemp rV   = newTemp(Ity_V256);
   11040    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   11041    return rV;
   11042 }
   11043 
   11044 
   11045 static IRTemp math_BLENDPD_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11046 {
   11047    UShort imm8_mask_16;
   11048    IRTemp imm8_mask = newTemp(Ity_V128);
   11049 
   11050    switch( imm8 & 3 ) {
   11051       case 0:  imm8_mask_16 = 0x0000; break;
   11052       case 1:  imm8_mask_16 = 0x00FF; break;
   11053       case 2:  imm8_mask_16 = 0xFF00; break;
   11054       case 3:  imm8_mask_16 = 0xFFFF; break;
   11055       default: vassert(0);            break;
   11056    }
   11057    assign( imm8_mask, mkV128( imm8_mask_16 ) );
   11058 
   11059    IRTemp res = newTemp(Ity_V128);
   11060    assign ( res, binop( Iop_OrV128,
   11061                         binop( Iop_AndV128, mkexpr(sV),
   11062                                             mkexpr(imm8_mask) ),
   11063                         binop( Iop_AndV128, mkexpr(dV),
   11064                                unop( Iop_NotV128, mkexpr(imm8_mask) ) ) ) );
   11065    return res;
   11066 }
   11067 
   11068 
   11069 static IRTemp math_BLENDPD_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11070 {
   11071    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   11072    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   11073    breakupV256toV128s( sV, &sVhi, &sVlo );
   11074    breakupV256toV128s( dV, &dVhi, &dVlo );
   11075    IRTemp rVhi = math_BLENDPD_128(sVhi, dVhi, (imm8 >> 2) & 3);
   11076    IRTemp rVlo = math_BLENDPD_128(sVlo, dVlo, imm8 & 3);
   11077    IRTemp rV   = newTemp(Ity_V256);
   11078    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   11079    return rV;
   11080 }
   11081 
   11082 
   11083 static IRTemp math_BLENDPS_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11084 {
   11085    UShort imm8_perms[16] = { 0x0000, 0x000F, 0x00F0, 0x00FF, 0x0F00,
   11086                              0x0F0F, 0x0FF0, 0x0FFF, 0xF000, 0xF00F,
   11087                              0xF0F0, 0xF0FF, 0xFF00, 0xFF0F, 0xFFF0,
   11088                              0xFFFF };
   11089    IRTemp imm8_mask = newTemp(Ity_V128);
   11090    assign( imm8_mask, mkV128( imm8_perms[ (imm8 & 15) ] ) );
   11091 
   11092    IRTemp res = newTemp(Ity_V128);
   11093    assign ( res, binop( Iop_OrV128,
   11094                         binop( Iop_AndV128, mkexpr(sV),
   11095                                             mkexpr(imm8_mask) ),
   11096                         binop( Iop_AndV128, mkexpr(dV),
   11097                                unop( Iop_NotV128, mkexpr(imm8_mask) ) ) ) );
   11098    return res;
   11099 }
   11100 
   11101 
   11102 static IRTemp math_BLENDPS_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11103 {
   11104    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   11105    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   11106    breakupV256toV128s( sV, &sVhi, &sVlo );
   11107    breakupV256toV128s( dV, &dVhi, &dVlo );
   11108    IRTemp rVhi = math_BLENDPS_128(sVhi, dVhi, (imm8 >> 4) & 15);
   11109    IRTemp rVlo = math_BLENDPS_128(sVlo, dVlo, imm8 & 15);
   11110    IRTemp rV   = newTemp(Ity_V256);
   11111    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   11112    return rV;
   11113 }
   11114 
   11115 
   11116 static IRTemp math_PBLENDW_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11117 {
   11118    /* Make w be a 16-bit version of imm8, formed by duplicating each
   11119       bit in imm8. */
   11120    Int i;
   11121    UShort imm16 = 0;
   11122    for (i = 0; i < 8; i++) {
   11123       if (imm8 & (1 << i))
   11124          imm16 |= (3 << (2*i));
   11125    }
   11126    IRTemp imm16_mask = newTemp(Ity_V128);
   11127    assign( imm16_mask, mkV128( imm16 ));
   11128 
   11129    IRTemp res = newTemp(Ity_V128);
   11130    assign ( res, binop( Iop_OrV128,
   11131                         binop( Iop_AndV128, mkexpr(sV),
   11132                                             mkexpr(imm16_mask) ),
   11133                         binop( Iop_AndV128, mkexpr(dV),
   11134                                unop( Iop_NotV128, mkexpr(imm16_mask) ) ) ) );
   11135    return res;
   11136 }
   11137 
   11138 
   11139 static IRTemp math_PMULUDQ_128 ( IRTemp sV, IRTemp dV )
   11140 {
   11141    /* This is a really poor translation -- could be improved if
   11142       performance critical */
   11143    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   11144    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   11145    breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   11146    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   11147    IRTemp res = newTemp(Ity_V128);
   11148    assign(res, binop(Iop_64HLtoV128,
   11149                      binop( Iop_MullU32, mkexpr(d2), mkexpr(s2)),
   11150                      binop( Iop_MullU32, mkexpr(d0), mkexpr(s0)) ));
   11151    return res;
   11152 }
   11153 
   11154 
   11155 static IRTemp math_PMULUDQ_256 ( IRTemp sV, IRTemp dV )
   11156 {
   11157    /* This is a really poor translation -- could be improved if
   11158       performance critical */
   11159    IRTemp sHi, sLo, dHi, dLo;
   11160    sHi = sLo = dHi = dLo = IRTemp_INVALID;
   11161    breakupV256toV128s( dV, &dHi, &dLo);
   11162    breakupV256toV128s( sV, &sHi, &sLo);
   11163    IRTemp res = newTemp(Ity_V256);
   11164    assign(res, binop(Iop_V128HLtoV256,
   11165                      mkexpr(math_PMULUDQ_128(sHi, dHi)),
   11166                      mkexpr(math_PMULUDQ_128(sLo, dLo))));
   11167    return res;
   11168 }
   11169 
   11170 
   11171 static IRTemp math_PMULDQ_128 ( IRTemp dV, IRTemp sV )
   11172 {
   11173    /* This is a really poor translation -- could be improved if
   11174       performance critical */
   11175    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   11176    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   11177    breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   11178    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   11179    IRTemp res = newTemp(Ity_V128);
   11180    assign(res, binop(Iop_64HLtoV128,
   11181                      binop( Iop_MullS32, mkexpr(d2), mkexpr(s2)),
   11182                      binop( Iop_MullS32, mkexpr(d0), mkexpr(s0)) ));
   11183    return res;
   11184 }
   11185 
   11186 
   11187 static IRTemp math_PMULDQ_256 ( IRTemp sV, IRTemp dV )
   11188 {
   11189    /* This is a really poor translation -- could be improved if
   11190       performance critical */
   11191    IRTemp sHi, sLo, dHi, dLo;
   11192    sHi = sLo = dHi = dLo = IRTemp_INVALID;
   11193    breakupV256toV128s( dV, &dHi, &dLo);
   11194    breakupV256toV128s( sV, &sHi, &sLo);
   11195    IRTemp res = newTemp(Ity_V256);
   11196    assign(res, binop(Iop_V128HLtoV256,
   11197                      mkexpr(math_PMULDQ_128(sHi, dHi)),
   11198                      mkexpr(math_PMULDQ_128(sLo, dLo))));
   11199    return res;
   11200 }
   11201 
   11202 
   11203 static IRTemp math_PMADDWD_128 ( IRTemp dV, IRTemp sV )
   11204 {
   11205    IRTemp sVhi, sVlo, dVhi, dVlo;
   11206    IRTemp resHi = newTemp(Ity_I64);
   11207    IRTemp resLo = newTemp(Ity_I64);
   11208    sVhi = sVlo = dVhi = dVlo = IRTemp_INVALID;
   11209    breakupV128to64s( sV, &sVhi, &sVlo );
   11210    breakupV128to64s( dV, &dVhi, &dVlo );
   11211    assign( resHi, mkIRExprCCall(Ity_I64, 0/*regparms*/,
   11212                                 "amd64g_calculate_mmx_pmaddwd",
   11213                                 &amd64g_calculate_mmx_pmaddwd,
   11214                                 mkIRExprVec_2( mkexpr(sVhi), mkexpr(dVhi))));
   11215    assign( resLo, mkIRExprCCall(Ity_I64, 0/*regparms*/,
   11216                                 "amd64g_calculate_mmx_pmaddwd",
   11217                                 &amd64g_calculate_mmx_pmaddwd,
   11218                                 mkIRExprVec_2( mkexpr(sVlo), mkexpr(dVlo))));
   11219    IRTemp res = newTemp(Ity_V128);
   11220    assign( res, binop(Iop_64HLtoV128, mkexpr(resHi), mkexpr(resLo))) ;
   11221    return res;
   11222 }
   11223 
   11224 
   11225 static IRTemp math_PMADDWD_256 ( IRTemp dV, IRTemp sV )
   11226 {
   11227    IRTemp sHi, sLo, dHi, dLo;
   11228    sHi = sLo = dHi = dLo = IRTemp_INVALID;
   11229    breakupV256toV128s( dV, &dHi, &dLo);
   11230    breakupV256toV128s( sV, &sHi, &sLo);
   11231    IRTemp res = newTemp(Ity_V256);
   11232    assign(res, binop(Iop_V128HLtoV256,
   11233                      mkexpr(math_PMADDWD_128(dHi, sHi)),
   11234                      mkexpr(math_PMADDWD_128(dLo, sLo))));
   11235    return res;
   11236 }
   11237 
   11238 
   11239 static IRTemp math_ADDSUBPD_128 ( IRTemp dV, IRTemp sV )
   11240 {
   11241    IRTemp addV = newTemp(Ity_V128);
   11242    IRTemp subV = newTemp(Ity_V128);
   11243    IRTemp a1   = newTemp(Ity_I64);
   11244    IRTemp s0   = newTemp(Ity_I64);
   11245    IRTemp rm   = newTemp(Ity_I32);
   11246 
   11247    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   11248    assign( addV, triop(Iop_Add64Fx2, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11249    assign( subV, triop(Iop_Sub64Fx2, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11250 
   11251    assign( a1, unop(Iop_V128HIto64, mkexpr(addV) ));
   11252    assign( s0, unop(Iop_V128to64,   mkexpr(subV) ));
   11253 
   11254    IRTemp res = newTemp(Ity_V128);
   11255    assign( res, binop(Iop_64HLtoV128, mkexpr(a1), mkexpr(s0)) );
   11256    return res;
   11257 }
   11258 
   11259 
   11260 static IRTemp math_ADDSUBPD_256 ( IRTemp dV, IRTemp sV )
   11261 {
   11262    IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
   11263    IRTemp addV = newTemp(Ity_V256);
   11264    IRTemp subV = newTemp(Ity_V256);
   11265    IRTemp rm   = newTemp(Ity_I32);
   11266    a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11267 
   11268    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   11269    assign( addV, triop(Iop_Add64Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11270    assign( subV, triop(Iop_Sub64Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11271 
   11272    breakupV256to64s( addV, &a3, &a2, &a1, &a0 );
   11273    breakupV256to64s( subV, &s3, &s2, &s1, &s0 );
   11274 
   11275    IRTemp res = newTemp(Ity_V256);
   11276    assign( res, mkV256from64s( a3, s2, a1, s0 ) );
   11277    return res;
   11278 }
   11279 
   11280 
   11281 static IRTemp math_ADDSUBPS_128 ( IRTemp dV, IRTemp sV )
   11282 {
   11283    IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
   11284    IRTemp addV = newTemp(Ity_V128);
   11285    IRTemp subV = newTemp(Ity_V128);
   11286    IRTemp rm   = newTemp(Ity_I32);
   11287    a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11288 
   11289    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   11290    assign( addV, triop(Iop_Add32Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11291    assign( subV, triop(Iop_Sub32Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11292 
   11293    breakupV128to32s( addV, &a3, &a2, &a1, &a0 );
   11294    breakupV128to32s( subV, &s3, &s2, &s1, &s0 );
   11295 
   11296    IRTemp res = newTemp(Ity_V128);
   11297    assign( res, mkV128from32s( a3, s2, a1, s0 ) );
   11298    return res;
   11299 }
   11300 
   11301 
   11302 static IRTemp math_ADDSUBPS_256 ( IRTemp dV, IRTemp sV )
   11303 {
   11304    IRTemp a7, a6, a5, a4, a3, a2, a1, a0;
   11305    IRTemp s7, s6, s5, s4, s3, s2, s1, s0;
   11306    IRTemp addV = newTemp(Ity_V256);
   11307    IRTemp subV = newTemp(Ity_V256);
   11308    IRTemp rm   = newTemp(Ity_I32);
   11309    a7 = a6 = a5 = a4 = a3 = a2 = a1 = a0 = IRTemp_INVALID;
   11310    s7 = s6 = s5 = s4 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11311 
   11312    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   11313    assign( addV, triop(Iop_Add32Fx8, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11314    assign( subV, triop(Iop_Sub32Fx8, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11315 
   11316    breakupV256to32s( addV, &a7, &a6, &a5, &a4, &a3, &a2, &a1, &a0 );
   11317    breakupV256to32s( subV, &s7, &s6, &s5, &s4, &s3, &s2, &s1, &s0 );
   11318 
   11319    IRTemp res = newTemp(Ity_V256);
   11320    assign( res, mkV256from32s( a7, s6, a5, s4, a3, s2, a1, s0 ) );
   11321    return res;
   11322 }
   11323 
   11324 
/* Handle 128 bit PSHUFLW and PSHUFHW, SSE2 and AVX forms.  xIsH
   selects which 64-bit half of the source is shuffled: the high half
   (pshufhw) or the low half (pshuflw); the other half is copied
   through unchanged.  imm8 supplies four 2-bit lane selectors, one
   per 16-bit lane of the shuffled half.  Returns the updated
   |delta|. */
static Long dis_PSHUFxW_128 ( const VexAbiInfo* vbi, Prefix pfx,
                              Long delta, Bool isAvx, Bool xIsH )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   UInt   rG = gregOfRexRM(pfx,modrm);
   UInt   imm8;
   IRTemp sVmut, dVmut, sVcon, sV, dV, s3, s2, s1, s0;
   s3 = s2 = s1 = s0 = IRTemp_INVALID;
   sV    = newTemp(Ity_V128);
   dV    = newTemp(Ity_V128);
   sVmut = newTemp(Ity_I64);
   dVmut = newTemp(Ity_I64);
   sVcon = newTemp(Ity_I64);
   /* Fetch the source (register or memory) plus the trailing imm8. */
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( sV, getXMMReg(rE) );
      imm8 = (UInt)getUChar(delta+1);
      delta += 1+1;
      DIP("%spshuf%cw $%u,%s,%s\n",
          isAvx ? "v" : "", xIsH ? 'h' : 'l',
          imm8, nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
      imm8 = (UInt)getUChar(delta+alen);
      delta += alen+1;
      DIP("%spshuf%cw $%u,%s,%s\n",
          isAvx ? "v" : "", xIsH ? 'h' : 'l',
          imm8, dis_buf, nameXMMReg(rG));
   }

   /* Get the to-be-changed (mut) and unchanging (con) bits of the
      source. */
   assign( sVmut, unop(xIsH ? Iop_V128HIto64 : Iop_V128to64,   mkexpr(sV)) );
   assign( sVcon, unop(xIsH ? Iop_V128to64   : Iop_V128HIto64, mkexpr(sV)) );

   /* Shuffle the mutable half's four 16-bit lanes per imm8. */
   breakup64to16s( sVmut, &s3, &s2, &s1, &s0 );
#  define SEL(n) \
             ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   assign(dVmut, mk64from16s( SEL((imm8>>6)&3), SEL((imm8>>4)&3),
                              SEL((imm8>>2)&3), SEL((imm8>>0)&3) ));
#  undef SEL

   /* Reassemble, putting the shuffled half back where it came from. */
   assign(dV, xIsH ? binop(Iop_64HLtoV128, mkexpr(dVmut), mkexpr(sVcon))
                   : binop(Iop_64HLtoV128, mkexpr(sVcon), mkexpr(dVmut)) );

   /* AVX form also zeroes the upper half of the destination YMM. */
   (isAvx ? putYMMRegLoAndZU : putXMMReg)(rG, mkexpr(dV));
   return delta;
}
   11378 
   11379 
/* Handle 256 bit PSHUFLW and PSHUFHW (VPSHUFLW/VPSHUFHW ymm).  xIsH
   selects which 64-bit half of each 128-bit lane is shuffled -- the
   high half (vpshufhw) or the low half (vpshuflw) -- with the other
   half copied through unchanged.  The same four 2-bit imm8 selectors
   steer both 128-bit lanes.  Returns the updated |delta|. */
static Long dis_PSHUFxW_256 ( const VexAbiInfo* vbi, Prefix pfx,
                              Long delta, Bool xIsH )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   UInt   rG = gregOfRexRM(pfx,modrm);
   UInt   imm8;
   IRTemp sV, s[8], sV64[4], dVhi, dVlo;
   sV64[3] = sV64[2] = sV64[1] = sV64[0] = IRTemp_INVALID;
   s[7] = s[6] = s[5] = s[4] = s[3] = s[2] = s[1] = s[0] = IRTemp_INVALID;
   sV    = newTemp(Ity_V256);
   dVhi  = newTemp(Ity_I64);
   dVlo  = newTemp(Ity_I64);
   /* Fetch the source (register or memory) plus the trailing imm8. */
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( sV, getYMMReg(rE) );
      imm8 = (UInt)getUChar(delta+1);
      delta += 1+1;
      DIP("vpshuf%cw $%u,%s,%s\n", xIsH ? 'h' : 'l',
          imm8, nameYMMReg(rE), nameYMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
      assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
      imm8 = (UInt)getUChar(delta+alen);
      delta += alen+1;
      DIP("vpshuf%cw $%u,%s,%s\n", xIsH ? 'h' : 'l',
          imm8, dis_buf, nameYMMReg(rG));
   }

   /* Break out only the two 64-bit chunks that get shuffled;
      s[7..4] are the lanes of the upper chunk, s[3..0] the lower. */
   breakupV256to64s( sV, &sV64[3], &sV64[2], &sV64[1], &sV64[0] );
   breakup64to16s( sV64[xIsH ? 3 : 2], &s[7], &s[6], &s[5], &s[4] );
   breakup64to16s( sV64[xIsH ? 1 : 0], &s[3], &s[2], &s[1], &s[0] );

   /* Apply the same imm8 steering to both 128-bit lanes. */
   assign( dVhi, mk64from16s( s[4 + ((imm8>>6)&3)], s[4 + ((imm8>>4)&3)],
                              s[4 + ((imm8>>2)&3)], s[4 + ((imm8>>0)&3)] ) );
   assign( dVlo, mk64from16s( s[0 + ((imm8>>6)&3)], s[0 + ((imm8>>4)&3)],
                              s[0 + ((imm8>>2)&3)], s[0 + ((imm8>>0)&3)] ) );
   /* Reassemble: shuffled halves replace their originals, the other
      halves pass through. */
   putYMMReg( rG, mkV256from64s( xIsH ? dVhi : sV64[3],
                                 xIsH ? sV64[2] : dVhi,
                                 xIsH ? dVlo : sV64[1],
                                 xIsH ? sV64[0] : dVlo ) );
   return delta;
}
   11426 
   11427 
   11428 static Long dis_PEXTRW_128_EregOnly_toG ( const VexAbiInfo* vbi, Prefix pfx,
   11429                                           Long delta, Bool isAvx )
   11430 {
   11431    Long   deltaIN = delta;
   11432    UChar  modrm   = getUChar(delta);
   11433    UInt   rG      = gregOfRexRM(pfx,modrm);
   11434    IRTemp sV      = newTemp(Ity_V128);
   11435    IRTemp d16     = newTemp(Ity_I16);
   11436    UInt   imm8;
   11437    IRTemp s0, s1, s2, s3;
   11438    if (epartIsReg(modrm)) {
   11439       UInt rE = eregOfRexRM(pfx,modrm);
   11440       assign(sV, getXMMReg(rE));
   11441       imm8 = getUChar(delta+1) & 7;
   11442       delta += 1+1;
   11443       DIP("%spextrw $%u,%s,%s\n", isAvx ? "v" : "",
   11444           imm8, nameXMMReg(rE), nameIReg32(rG));
   11445    } else {
   11446       /* The memory case is disallowed, apparently. */
   11447       return deltaIN; /* FAIL */
   11448    }
   11449    s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11450    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   11451    switch (imm8) {
   11452       case 0:  assign(d16, unop(Iop_32to16,   mkexpr(s0))); break;
   11453       case 1:  assign(d16, unop(Iop_32HIto16, mkexpr(s0))); break;
   11454       case 2:  assign(d16, unop(Iop_32to16,   mkexpr(s1))); break;
   11455       case 3:  assign(d16, unop(Iop_32HIto16, mkexpr(s1))); break;
   11456       case 4:  assign(d16, unop(Iop_32to16,   mkexpr(s2))); break;
   11457       case 5:  assign(d16, unop(Iop_32HIto16, mkexpr(s2))); break;
   11458       case 6:  assign(d16, unop(Iop_32to16,   mkexpr(s3))); break;
   11459       case 7:  assign(d16, unop(Iop_32HIto16, mkexpr(s3))); break;
   11460       default: vassert(0);
   11461    }
   11462    putIReg32(rG, unop(Iop_16Uto32, mkexpr(d16)));
   11463    return delta;
   11464 }
   11465 
   11466 
/* Handles (V)CVTDQ2PD: convert two signed 32-bit integers -- the low
   64 bits of an XMM register, or a 64-bit memory operand -- to two
   64-bit floats.  I32 -> F64 conversion is exact, so no rounding
   mode is involved.  Returns the updated |delta|. */
static Long dis_CVTDQ2PD_128 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp arg64 = newTemp(Ity_I64);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   const HChar* mbV   = isAvx ? "v" : "";
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( arg64, getXMMRegLane64(rE, 0) );
      delta += 1;
      DIP("%scvtdq2pd %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
      delta += alen;
      DIP("%scvtdq2pd %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
   }
   /* Low 32 bits -> result lane 0, high 32 bits -> result lane 1. */
   putXMMRegLane64F(
      rG, 0,
      unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)))
   );
   putXMMRegLane64F(
      rG, 1,
      unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)))
   );
   /* The AVX form zeroes the upper half of the destination YMM. */
   if (isAvx)
      putYMMRegLane128(rG, 1, mkV128(0));
   return delta;
}
   11500 
   11501 
/* Handles (V)STMXCSR: store a 32-bit %mxcsr image to memory.  The
   guest state only tracks the rounding-mode field, so the rest of
   the word is synthesised by a clean helper.  Memory operand only
   (caller ensures).  Returns the updated |delta|. */
static Long dis_STMXCSR ( const VexAbiInfo* vbi, Prefix pfx,
                          Long delta, Bool isAvx )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   vassert(!epartIsReg(modrm)); /* ensured by caller */
   vassert(gregOfRexRM(pfx,modrm) == 3); /* ditto */

   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   delta += alen;

   /* Fake up a native SSE mxcsr word.  The only thing it depends on
      is SSEROUND[1:0], so call a clean helper to cook it up.
   */
   /* ULong amd64h_create_mxcsr ( ULong sseround ) */
   DIP("%sstmxcsr %s\n",  isAvx ? "v" : "", dis_buf);
   storeLE(
      mkexpr(addr),
      unop(Iop_64to32,
           mkIRExprCCall(
              Ity_I64, 0/*regp*/,
              "amd64g_create_mxcsr", &amd64g_create_mxcsr,
              mkIRExprVec_1( unop(Iop_32Uto64,get_sse_roundingmode()) )
           )
      )
   );
   return delta;
}
   11532 
   11533 
/* Handles (V)LDMXCSR: load %mxcsr from memory.  Only the rounding
   mode is retained in the guest state.  A clean helper inspects the
   loaded word; if it reports an emulation warning, a side-exit is
   generated so Valgrind's dispatcher can surface the warning.
   Memory operand only (caller ensures).  Returns the updated
   |delta|. */
static Long dis_LDMXCSR ( const VexAbiInfo* vbi, Prefix pfx,
                          Long delta, Bool isAvx )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   vassert(!epartIsReg(modrm)); /* ensured by caller */
   vassert(gregOfRexRM(pfx,modrm) == 2); /* ditto */

   IRTemp t64 = newTemp(Ity_I64);
   IRTemp ew  = newTemp(Ity_I32);

   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   delta += alen;
   DIP("%sldmxcsr %s\n",  isAvx ? "v" : "", dis_buf);

   /* The only thing we observe in %mxcsr is the rounding mode.
      Therefore, pass the 32-bit value (SSE native-format control
      word) to a clean helper, getting back a 64-bit value, the
      lower half of which is the SSEROUND value to store, and the
      upper half of which is the emulation-warning token which may
      be generated.
   */
   /* ULong amd64h_check_ldmxcsr ( ULong ); */
   assign( t64, mkIRExprCCall(
                   Ity_I64, 0/*regparms*/,
                   "amd64g_check_ldmxcsr",
                   &amd64g_check_ldmxcsr,
                   mkIRExprVec_1(
                      unop(Iop_32Uto64,
                           loadLE(Ity_I32, mkexpr(addr))
                      )
                   )
                )
         );

   put_sse_roundingmode( unop(Iop_64to32, mkexpr(t64)) );
   assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
   put_emwarn( mkexpr(ew) );
   /* Finally, if an emulation warning was reported, side-exit to
      the next insn, reporting the warning, so that Valgrind's
      dispatcher sees the warning. */
   stmt(
      IRStmt_Exit(
         binop(Iop_CmpNE64, unop(Iop_32Uto64,mkexpr(ew)), mkU64(0)),
         Ijk_EmWarn,
         IRConst_U64(guest_RIP_bbstart+delta),
         OFFB_RIP
      )
   );
   return delta;
}
   11587 
   11588 
/* Generate the IR sequence for an XSAVE store to the area at |addr|,
   gated by |rfbm|: bit 0 = x87 state, bit 1 = SSE state (MXCSR plus
   the XMM registers), bit 2 = AVX state (upper YMM halves).  Also
   used to implement FXSAVE (rfbm == 3; see dis_FXSAVE).  The caller
   is responsible for any alignment check on |addr| and for updating
   XSTATE_BV in the XSAVE header. */
static void gen_XSAVE_SEQUENCE ( IRTemp addr, IRTemp rfbm )
{
   /* ------ rfbm[0] gates the x87 state ------ */

   /* Uses dirty helper:
         void amd64g_do_XSAVE_COMPONENT_0 ( VexGuestAMD64State*, ULong )
   */
   IRDirty* d0 = unsafeIRDirty_0_N (
                    0/*regparms*/,
                    "amd64g_dirtyhelper_XSAVE_COMPONENT_0",
                    &amd64g_dirtyhelper_XSAVE_COMPONENT_0,
                    mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
                 );
   d0->guard = binop(Iop_CmpEQ64, binop(Iop_And64, mkexpr(rfbm), mkU64(1)),
                     mkU64(1));

   /* Declare we're writing memory.  Really, bytes 24 through 31
      (MXCSR and MXCSR_MASK) aren't written, but we can't express more
      than 1 memory area here, so just mark the whole thing as
      written. */
   d0->mFx   = Ifx_Write;
   d0->mAddr = mkexpr(addr);
   d0->mSize = 160;

   /* declare we're reading guest state */
   d0->nFxState = 5;
   vex_bzero(&d0->fxState, sizeof(d0->fxState));

   d0->fxState[0].fx     = Ifx_Read;
   d0->fxState[0].offset = OFFB_FTOP;
   d0->fxState[0].size   = sizeof(UInt);

   d0->fxState[1].fx     = Ifx_Read;
   d0->fxState[1].offset = OFFB_FPREGS;
   d0->fxState[1].size   = 8 * sizeof(ULong);

   d0->fxState[2].fx     = Ifx_Read;
   d0->fxState[2].offset = OFFB_FPTAGS;
   d0->fxState[2].size   = 8 * sizeof(UChar);

   d0->fxState[3].fx     = Ifx_Read;
   d0->fxState[3].offset = OFFB_FPROUND;
   d0->fxState[3].size   = sizeof(ULong);

   d0->fxState[4].fx     = Ifx_Read;
   d0->fxState[4].offset = OFFB_FC3210;
   d0->fxState[4].size   = sizeof(ULong);

   stmt( IRStmt_Dirty(d0) );

   /* ------ rfbm[1] gates the SSE state ------ */

   IRTemp rfbm_1    = newTemp(Ity_I64);
   IRTemp rfbm_1or2 = newTemp(Ity_I64);
   assign(rfbm_1,    binop(Iop_And64, mkexpr(rfbm), mkU64(2)));
   assign(rfbm_1or2, binop(Iop_And64, mkexpr(rfbm), mkU64(6)));

   IRExpr* guard_1    = binop(Iop_CmpEQ64, mkexpr(rfbm_1),    mkU64(2));
   IRExpr* guard_1or2 = binop(Iop_CmpNE64, mkexpr(rfbm_1or2), mkU64(0));

   /* Uses dirty helper:
         void amd64g_do_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS
                 ( VexGuestAMD64State*, ULong )
      This creates only MXCSR and MXCSR_MASK.  We need to do this if
      either components 1 (SSE) or 2 (AVX) are requested.  Hence the
      guard condition is a bit more complex.
   */
   IRDirty* d1 = unsafeIRDirty_0_N (
                    0/*regparms*/,
                    "amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS",
                    &amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS,
                    mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
                 );
   d1->guard = guard_1or2;

   /* Declare we're writing memory: MXCSR and MXCSR_MASK.  Note that
      the code for rfbm[0] just above claims a write of 0 .. 159, so
      this duplicates it.  But at least correctly connects 24 .. 31 to
      the MXCSR guest state representation (SSEROUND field). */
   d1->mFx   = Ifx_Write;
   d1->mAddr = binop(Iop_Add64, mkexpr(addr), mkU64(24));
   d1->mSize = 8;

   /* declare we're reading guest state */
   d1->nFxState = 1;
   vex_bzero(&d1->fxState, sizeof(d1->fxState));

   d1->fxState[0].fx     = Ifx_Read;
   d1->fxState[0].offset = OFFB_SSEROUND;
   d1->fxState[0].size   = sizeof(ULong);

   /* Call the helper.  This creates MXCSR and MXCSR_MASK but nothing
      else.  We do the actual register array, XMM[0..15], separately,
      in order that any undefinedness in the XMM registers is tracked
      separately by Memcheck and does not "infect" the in-memory
      shadow for the other parts of the image. */
   stmt( IRStmt_Dirty(d1) );

   /* And now the XMMs themselves.  XMM0..15 occupy offsets
      160 .. 415 of the image. */
   UInt reg;
   for (reg = 0; reg < 16; reg++) {
      stmt( IRStmt_StoreG(
               Iend_LE,
               binop(Iop_Add64, mkexpr(addr), mkU64(160 + reg * 16)),
               getXMMReg(reg),
               guard_1
      ));
   }

   /* ------ rfbm[2] gates the AVX state ------ */
   /* Component 2 is just a bunch of register saves, so we'll do it
      inline, just to be simple and to be Memcheck friendly.  The
      upper 128-bit halves of YMM0..15 go at offsets 576 .. 831. */

   IRTemp rfbm_2 = newTemp(Ity_I64);
   assign(rfbm_2, binop(Iop_And64, mkexpr(rfbm), mkU64(4)));

   IRExpr* guard_2 = binop(Iop_CmpEQ64, mkexpr(rfbm_2), mkU64(4));

   for (reg = 0; reg < 16; reg++) {
      stmt( IRStmt_StoreG(
               Iend_LE,
               binop(Iop_Add64, mkexpr(addr), mkU64(576 + reg * 16)),
               getYMMRegLane128(reg,1),
               guard_2
      ));
   }
}
   11716 
   11717 
/* Handle XSAVE.  The component set saved is RDX:RAX masked against
   the assumed XCR0 value of 7 (x87 | SSE | AVX).  Saves to the
   64-aligned memory operand and then ORs the resulting RFBM into
   XSTATE_BV in the XSAVE header.  Returns the updated |delta|. */
static Long dis_XSAVE ( const VexAbiInfo* vbi,
                        Prefix pfx, Long delta, Int sz )
{
   /* Note that the presence or absence of REX.W (indicated here by
      |sz|) slightly affects the written format: whether the saved FPU
      IP and DP pointers are 64 or 32 bits.  But the helper function
      we call simply writes zero bits in the relevant fields, which
      are 64 bits regardless of what REX.W is, and so it's good enough
      (iow, equally broken) in both cases. */
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   vassert(!epartIsReg(modrm)); /* ensured by caller */
   vassert(sz == 4 || sz == 8); /* ditto */

   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   delta += alen;
   gen_SEGV_if_not_64_aligned(addr);

   DIP("%sxsave %s\n", sz==8 ? "rex64/" : "", dis_buf);

   /* VEX's caller is assumed to have checked this. */
   const ULong aSSUMED_XCR0_VALUE = 7;

   /* RFBM = (RDX:RAX) & XCR0. */
   IRTemp rfbm = newTemp(Ity_I64);
   assign(rfbm,
          binop(Iop_And64,
                binop(Iop_Or64,
                      binop(Iop_Shl64,
                            unop(Iop_32Uto64, getIRegRDX(4)), mkU8(32)),
                      unop(Iop_32Uto64, getIRegRAX(4))),
                mkU64(aSSUMED_XCR0_VALUE)));

   gen_XSAVE_SEQUENCE(addr, rfbm);

   /* Finally, we need to update XSTATE_BV in the XSAVE header area, by
      OR-ing the RFBM value into it.  XSTATE_BV is the low byte of the
      header, which starts at offset 512. */
   IRTemp addr_plus_512 = newTemp(Ity_I64);
   assign(addr_plus_512, binop(Iop_Add64, mkexpr(addr), mkU64(512)));
   storeLE( mkexpr(addr_plus_512),
            binop(Iop_Or8,
                  unop(Iop_64to8, mkexpr(rfbm)),
                  loadLE(Ity_I8, mkexpr(addr_plus_512))) );

   return delta;
}
   11765 
   11766 
   11767 static Long dis_FXSAVE ( const VexAbiInfo* vbi,
   11768                          Prefix pfx, Long delta, Int sz )
   11769 {
   11770    /* See comment in dis_XSAVE about the significance of REX.W. */
   11771    IRTemp addr  = IRTemp_INVALID;
   11772    Int    alen  = 0;
   11773    HChar  dis_buf[50];
   11774    UChar  modrm = getUChar(delta);
   11775    vassert(!epartIsReg(modrm)); /* ensured by caller */
   11776    vassert(sz == 4 || sz == 8); /* ditto */
   11777 
   11778    addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11779    delta += alen;
   11780    gen_SEGV_if_not_16_aligned(addr);
   11781 
   11782    DIP("%sfxsave %s\n", sz==8 ? "rex64/" : "", dis_buf);
   11783 
   11784    /* FXSAVE is just XSAVE with components 0 and 1 selected.  Set rfbm
   11785       to 0b011, generate the XSAVE sequence accordingly, and let iropt
   11786       fold out the unused (AVX) parts accordingly. */
   11787    IRTemp rfbm = newTemp(Ity_I64);
   11788    assign(rfbm, mkU64(3));
   11789    gen_XSAVE_SEQUENCE(addr, rfbm);
   11790 
   11791    return delta;
   11792 }
   11793 
   11794 
/* Generate the IR sequence for an XRSTOR load from the area at
   |addr|.  |rfbm| selects which components are touched (bit 0 = x87,
   bit 1 = SSE, bit 2 = AVX); for each touched component, the
   corresponding bit of |xstate_bv| (from the XSAVE header) says
   whether it is restored from memory (bit set) or reset to its
   initial value (bit clear).  Also used to implement FXRSTOR (both
   arguments == 3; see dis_FXRSTOR). */
static void gen_XRSTOR_SEQUENCE ( IRTemp addr, IRTemp xstate_bv, IRTemp rfbm )
{
   /* ------ rfbm[0] gates the x87 state ------ */

   /* If rfbm[0] == 1, we have to write the x87 state.  If
      xstate_bv[0] == 1, we will read it from the memory image, else
      we'll set it to initial values.  Doing this with a helper
      function and getting the definedness flow annotations correct is
      too difficult, so generate stupid but simple code: first set the
      registers to initial values, regardless of xstate_bv[0].  Then,
      conditionally restore from the memory image. */

   IRTemp rfbm_0       = newTemp(Ity_I64);
   IRTemp xstate_bv_0  = newTemp(Ity_I64);
   IRTemp restore_0    = newTemp(Ity_I64);
   assign(rfbm_0,      binop(Iop_And64, mkexpr(rfbm), mkU64(1)));
   assign(xstate_bv_0, binop(Iop_And64, mkexpr(xstate_bv), mkU64(1)));
   assign(restore_0,   binop(Iop_And64, mkexpr(rfbm_0), mkexpr(xstate_bv_0)));

   gen_FINIT_SEQUENCE( binop(Iop_CmpNE64, mkexpr(rfbm_0), mkU64(0)) );

   /* Uses dirty helper:
         void amd64g_do_XRSTOR_COMPONENT_0 ( VexGuestAMD64State*, ULong )
   */
   IRDirty* d0 = unsafeIRDirty_0_N (
                    0/*regparms*/,
                    "amd64g_dirtyhelper_XRSTOR_COMPONENT_0",
                    &amd64g_dirtyhelper_XRSTOR_COMPONENT_0,
                    mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
                 );
   d0->guard = binop(Iop_CmpNE64, mkexpr(restore_0), mkU64(0));

   /* Declare we're reading memory.  Really, bytes 24 through 31
      (MXCSR and MXCSR_MASK) aren't read, but we can't express more
      than 1 memory area here, so just mark the whole thing as
      read. */
   d0->mFx   = Ifx_Read;
   d0->mAddr = mkexpr(addr);
   d0->mSize = 160;

   /* declare we're writing guest state */
   d0->nFxState = 5;
   vex_bzero(&d0->fxState, sizeof(d0->fxState));

   d0->fxState[0].fx     = Ifx_Write;
   d0->fxState[0].offset = OFFB_FTOP;
   d0->fxState[0].size   = sizeof(UInt);

   d0->fxState[1].fx     = Ifx_Write;
   d0->fxState[1].offset = OFFB_FPREGS;
   d0->fxState[1].size   = 8 * sizeof(ULong);

   d0->fxState[2].fx     = Ifx_Write;
   d0->fxState[2].offset = OFFB_FPTAGS;
   d0->fxState[2].size   = 8 * sizeof(UChar);

   d0->fxState[3].fx     = Ifx_Write;
   d0->fxState[3].offset = OFFB_FPROUND;
   d0->fxState[3].size   = sizeof(ULong);

   d0->fxState[4].fx     = Ifx_Write;
   d0->fxState[4].offset = OFFB_FC3210;
   d0->fxState[4].size   = sizeof(ULong);

   stmt( IRStmt_Dirty(d0) );

   /* ------ rfbm[1] gates the SSE state ------ */

   /* Same scheme as component 0: first zero it out, and then possibly
      restore from the memory area. */
   IRTemp rfbm_1       = newTemp(Ity_I64);
   IRTemp xstate_bv_1  = newTemp(Ity_I64);
   IRTemp restore_1    = newTemp(Ity_I64);
   assign(rfbm_1,      binop(Iop_And64, mkexpr(rfbm), mkU64(2)));
   assign(xstate_bv_1, binop(Iop_And64, mkexpr(xstate_bv), mkU64(2)));
   assign(restore_1,   binop(Iop_And64, mkexpr(rfbm_1), mkexpr(xstate_bv_1)));
   IRExpr* rfbm_1e     = binop(Iop_CmpNE64, mkexpr(rfbm_1),    mkU64(0));
   IRExpr* restore_1e  = binop(Iop_CmpNE64, mkexpr(restore_1), mkU64(0));

   IRTemp rfbm_1or2       = newTemp(Ity_I64);
   IRTemp xstate_bv_1or2  = newTemp(Ity_I64);
   IRTemp restore_1or2    = newTemp(Ity_I64);
   assign(rfbm_1or2,      binop(Iop_And64, mkexpr(rfbm), mkU64(6)));
   assign(xstate_bv_1or2, binop(Iop_And64, mkexpr(xstate_bv), mkU64(6)));
   assign(restore_1or2,   binop(Iop_And64, mkexpr(rfbm_1or2),
                                           mkexpr(xstate_bv_1or2)));
   IRExpr* rfbm_1or2e     = binop(Iop_CmpNE64, mkexpr(rfbm_1or2),    mkU64(0));
   IRExpr* restore_1or2e  = binop(Iop_CmpNE64, mkexpr(restore_1or2), mkU64(0));

   /* The areas in question are: SSEROUND, and the XMM register array. */
   putGuarded(OFFB_SSEROUND, rfbm_1or2e, mkU64(Irrm_NEAREST));

   UInt reg;
   for (reg = 0; reg < 16; reg++) {
      putGuarded(xmmGuestRegOffset(reg), rfbm_1e, mkV128(0));
   }

   /* And now possibly restore from MXCSR/MXCSR_MASK */
   /* Uses dirty helper:
         void amd64g_do_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS
                 ( VexGuestAMD64State*, ULong )
      This restores from only MXCSR and MXCSR_MASK.  We need to do
      this if either components 1 (SSE) or 2 (AVX) are requested.
      Hence the guard condition is a bit more complex.
   */
   IRDirty* d1 = unsafeIRDirty_0_N (
                    0/*regparms*/,
                    "amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS",
                    &amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS,
                    mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
                ) ;
   d1->guard = restore_1or2e;

   /* Declare we're reading memory: MXCSR and MXCSR_MASK.  Note that
      the code for rfbm[0] just above claims a read of 0 .. 159, so
      this duplicates it.  But at least correctly connects 24 .. 31 to
      the MXCSR guest state representation (SSEROUND field). */
   d1->mFx   = Ifx_Read;
   d1->mAddr = binop(Iop_Add64, mkexpr(addr), mkU64(24));
   d1->mSize = 8;

   /* declare we're writing guest state */
   d1->nFxState = 1;
   vex_bzero(&d1->fxState, sizeof(d1->fxState));

   d1->fxState[0].fx     = Ifx_Write;
   d1->fxState[0].offset = OFFB_SSEROUND;
   d1->fxState[0].size   = sizeof(ULong);

   /* Call the helper.  This creates SSEROUND but nothing
      else.  We do the actual register array, XMM[0..15], separately,
      in order that any undefinedness in the XMM registers is tracked
      separately by Memcheck and is not "infected" by the in-memory
      shadow for the other parts of the image. */
   stmt( IRStmt_Dirty(d1) );

   /* And now the XMMs themselves.  For each register, we PUT either
      its old value, or the value loaded from memory.  One convenient
      way to do that is with a conditional load that has its the
      default value, the old value of the register. */
   for (reg = 0; reg < 16; reg++) {
      IRExpr* ea  = binop(Iop_Add64, mkexpr(addr), mkU64(160 + reg * 16));
      IRExpr* alt = getXMMReg(reg);
      IRTemp  loadedValue = newTemp(Ity_V128);
      stmt( IRStmt_LoadG(Iend_LE,
                         ILGop_IdentV128,
                         loadedValue, ea, alt, restore_1e) );
      putXMMReg(reg, mkexpr(loadedValue));
   }

   /* ------ rfbm[2] gates the AVX state ------ */
   /* Component 2 is just a bunch of register loads, so we'll do it
      inline, just to be simple and to be Memcheck friendly. */

   /* Same scheme as component 0: first zero it out, and then possibly
      restore from the memory area. */
   IRTemp rfbm_2      = newTemp(Ity_I64);
   IRTemp xstate_bv_2 = newTemp(Ity_I64);
   IRTemp restore_2   = newTemp(Ity_I64);
   assign(rfbm_2,      binop(Iop_And64, mkexpr(rfbm), mkU64(4)));
   assign(xstate_bv_2, binop(Iop_And64, mkexpr(xstate_bv), mkU64(4)));
   assign(restore_2,   binop(Iop_And64, mkexpr(rfbm_2), mkexpr(xstate_bv_2)));

   IRExpr* rfbm_2e    = binop(Iop_CmpNE64, mkexpr(rfbm_2),    mkU64(0));
   IRExpr* restore_2e = binop(Iop_CmpNE64, mkexpr(restore_2), mkU64(0));

   for (reg = 0; reg < 16; reg++) {
      putGuarded(ymmGuestRegLane128offset(reg, 1), rfbm_2e, mkV128(0));
   }

   for (reg = 0; reg < 16; reg++) {
      IRExpr* ea  = binop(Iop_Add64, mkexpr(addr), mkU64(576 + reg * 16));
      IRExpr* alt = getYMMRegLane128(reg, 1);
      IRTemp  loadedValue = newTemp(Ity_V128);
      stmt( IRStmt_LoadG(Iend_LE,
                         ILGop_IdentV128,
                         loadedValue, ea, alt, restore_2e) );
      putYMMRegLane128(reg, 1, mkexpr(loadedValue));
   }
}
   11975 
   11976 
/* Handle XRSTOR.  The component set restored is RDX:RAX masked
   against the assumed XCR0 value of 7.  Generates a SIGSEGV side-exit
   if the XSAVE header is unacceptable (compacted format requested,
   XSTATE_BV outside XCR0, or reserved header bytes nonzero). */
static Long dis_XRSTOR ( const VexAbiInfo* vbi,
                         Prefix pfx, Long delta, Int sz )
{
   /* As with XSAVE above we ignore the value of REX.W since we're
      not bothering with the FPU DP and IP fields. */
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   vassert(!epartIsReg(modrm)); /* ensured by caller */
   vassert(sz == 4 || sz == 8); /* ditto */

   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   delta += alen;
   gen_SEGV_if_not_64_aligned(addr);

   DIP("%sxrstor %s\n", sz==8 ? "rex64/" : "", dis_buf);

   /* VEX's caller is assumed to have checked this. */
   const ULong aSSUMED_XCR0_VALUE = 7;

   /* RFBM = (RDX:RAX) & XCR0. */
   IRTemp rfbm = newTemp(Ity_I64);
   assign(rfbm,
          binop(Iop_And64,
                binop(Iop_Or64,
                      binop(Iop_Shl64,
                            unop(Iop_32Uto64, getIRegRDX(4)), mkU8(32)),
                      unop(Iop_32Uto64, getIRegRAX(4))),
                mkU64(aSSUMED_XCR0_VALUE)));

   /* XSAVE header fields: XSTATE_BV at offset 512, XCOMP_BV at 520,
      then 8 reserved bytes at 528. */
   IRTemp xstate_bv = newTemp(Ity_I64);
   assign(xstate_bv, loadLE(Ity_I64,
                            binop(Iop_Add64, mkexpr(addr), mkU64(512+0))));

   IRTemp xcomp_bv = newTemp(Ity_I64);
   assign(xcomp_bv, loadLE(Ity_I64,
                           binop(Iop_Add64, mkexpr(addr), mkU64(512+8))));

   IRTemp xsavehdr_23_16 = newTemp(Ity_I64);
   assign( xsavehdr_23_16,
           loadLE(Ity_I64,
                  binop(Iop_Add64, mkexpr(addr), mkU64(512+16))));

   /* We must fault if
      * xcomp_bv[63] == 1, since this simulated CPU does not support
        the compaction extension.
      * xstate_bv sets a bit outside of XCR0 (which we assume to be 7).
      * any of the xsave header bytes 23 .. 8 are nonzero.  This seems to
        imply that xcomp_bv must be zero.
      xcomp_bv is header bytes 15 .. 8 and xstate_bv is header bytes 7 .. 0
   */
   IRTemp fault_if_nonzero = newTemp(Ity_I64);
   assign(fault_if_nonzero,
          binop(Iop_Or64,
                binop(Iop_And64, mkexpr(xstate_bv), mkU64(~aSSUMED_XCR0_VALUE)),
                binop(Iop_Or64, mkexpr(xcomp_bv), mkexpr(xsavehdr_23_16))));
   stmt( IRStmt_Exit(binop(Iop_CmpNE64, mkexpr(fault_if_nonzero), mkU64(0)),
                     Ijk_SigSEGV,
                     IRConst_U64(guest_RIP_curr_instr),
                     OFFB_RIP
   ));

   /* We are guaranteed now that both xstate_bv and rfbm are in the
      range 0 .. 7.  Generate the restore sequence proper. */
   gen_XRSTOR_SEQUENCE(addr, xstate_bv, rfbm);

   return delta;
}
   12045 
   12046 
   12047 static Long dis_FXRSTOR ( const VexAbiInfo* vbi,
   12048                           Prefix pfx, Long delta, Int sz )
   12049 {
   12050    /* As with FXSAVE above we ignore the value of REX.W since we're
   12051       not bothering with the FPU DP and IP fields. */
   12052    IRTemp addr  = IRTemp_INVALID;
   12053    Int    alen  = 0;
   12054    HChar  dis_buf[50];
   12055    UChar  modrm = getUChar(delta);
   12056    vassert(!epartIsReg(modrm)); /* ensured by caller */
   12057    vassert(sz == 4 || sz == 8); /* ditto */
   12058 
   12059    addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12060    delta += alen;
   12061    gen_SEGV_if_not_16_aligned(addr);
   12062 
   12063    DIP("%sfxrstor %s\n", sz==8 ? "rex64/" : "", dis_buf);
   12064 
   12065    /* FXRSTOR is just XRSTOR with components 0 and 1 selected and also
   12066       as if components 0 and 1 are set as present in XSTATE_BV in the
   12067       XSAVE header.  Set both rfbm and xstate_bv to 0b011 therefore,
   12068       generate the XRSTOR sequence accordingly, and let iropt fold out
   12069       the unused (AVX) parts accordingly. */
   12070    IRTemp three = newTemp(Ity_I64);
   12071    assign(three, mkU64(3));
   12072    gen_XRSTOR_SEQUENCE(addr, three/*xstate_bv*/, three/*rfbm*/);
   12073 
   12074    return delta;
   12075 }
   12076 
   12077 
   12078 static IRTemp math_PINSRW_128 ( IRTemp v128, IRTemp u16, UInt imm8 )
   12079 {
   12080    vassert(imm8 >= 0 && imm8 <= 7);
   12081 
   12082    // Create a V128 value which has the selected word in the
   12083    // specified lane, and zeroes everywhere else.
   12084    IRTemp tmp128    = newTemp(Ity_V128);
   12085    IRTemp halfshift = newTemp(Ity_I64);
   12086    assign(halfshift, binop(Iop_Shl64,
   12087                            unop(Iop_16Uto64, mkexpr(u16)),
   12088                            mkU8(16 * (imm8 & 3))));
   12089    if (imm8 < 4) {
   12090       assign(tmp128, binop(Iop_64HLtoV128, mkU64(0), mkexpr(halfshift)));
   12091    } else {
   12092       assign(tmp128, binop(Iop_64HLtoV128, mkexpr(halfshift), mkU64(0)));
   12093    }
   12094 
   12095    UShort mask = ~(3 << (imm8 * 2));
   12096    IRTemp res  = newTemp(Ity_V128);
   12097    assign( res, binop(Iop_OrV128,
   12098                       mkexpr(tmp128),
   12099                       binop(Iop_AndV128, mkexpr(v128), mkV128(mask))) );
   12100    return res;
   12101 }
   12102 
   12103 
   12104 static IRTemp math_PSADBW_128 ( IRTemp dV, IRTemp sV )
   12105 {
   12106    IRTemp s1, s0, d1, d0;
   12107    s1 = s0 = d1 = d0 = IRTemp_INVALID;
   12108 
   12109    breakupV128to64s( sV, &s1, &s0 );
   12110    breakupV128to64s( dV, &d1, &d0 );
   12111 
   12112    IRTemp res = newTemp(Ity_V128);
   12113    assign( res,
   12114            binop(Iop_64HLtoV128,
   12115                  mkIRExprCCall(Ity_I64, 0/*regparms*/,
   12116                                "amd64g_calculate_mmx_psadbw",
   12117                                &amd64g_calculate_mmx_psadbw,
   12118                                mkIRExprVec_2( mkexpr(s1), mkexpr(d1))),
   12119                  mkIRExprCCall(Ity_I64, 0/*regparms*/,
   12120                                "amd64g_calculate_mmx_psadbw",
   12121                                &amd64g_calculate_mmx_psadbw,
   12122                                mkIRExprVec_2( mkexpr(s0), mkexpr(d0)))) );
   12123    return res;
   12124 }
   12125 
   12126 
   12127 static IRTemp math_PSADBW_256 ( IRTemp dV, IRTemp sV )
   12128 {
   12129    IRTemp sHi, sLo, dHi, dLo;
   12130    sHi = sLo = dHi = dLo = IRTemp_INVALID;
   12131    breakupV256toV128s( dV, &dHi, &dLo);
   12132    breakupV256toV128s( sV, &sHi, &sLo);
   12133    IRTemp res = newTemp(Ity_V256);
   12134    assign(res, binop(Iop_V128HLtoV256,
   12135                      mkexpr(math_PSADBW_128(dHi, sHi)),
   12136                      mkexpr(math_PSADBW_128(dLo, sLo))));
   12137    return res;
   12138 }
   12139 
   12140 
   12141 static Long dis_MASKMOVDQU ( const VexAbiInfo* vbi, Prefix pfx,
   12142                              Long delta, Bool isAvx )
   12143 {
   12144    IRTemp regD    = newTemp(Ity_V128);
   12145    IRTemp mask    = newTemp(Ity_V128);
   12146    IRTemp olddata = newTemp(Ity_V128);
   12147    IRTemp newdata = newTemp(Ity_V128);
   12148    IRTemp addr    = newTemp(Ity_I64);
   12149    UChar  modrm   = getUChar(delta);
   12150    UInt   rG      = gregOfRexRM(pfx,modrm);
   12151    UInt   rE      = eregOfRexRM(pfx,modrm);
   12152 
   12153    assign( addr, handleAddrOverrides( vbi, pfx, getIReg64(R_RDI) ));
   12154    assign( regD, getXMMReg( rG ));
   12155 
   12156    /* Unfortunately can't do the obvious thing with SarN8x16
   12157       here since that can't be re-emitted as SSE2 code - no such
   12158       insn. */
   12159    assign( mask,
   12160            binop(Iop_64HLtoV128,
   12161                  binop(Iop_SarN8x8,
   12162                        getXMMRegLane64( eregOfRexRM(pfx,modrm), 1 ),
   12163                        mkU8(7) ),
   12164                  binop(Iop_SarN8x8,
   12165                        getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ),
   12166                        mkU8(7) ) ));
   12167    assign( olddata, loadLE( Ity_V128, mkexpr(addr) ));
   12168    assign( newdata, binop(Iop_OrV128,
   12169                           binop(Iop_AndV128,
   12170                                 mkexpr(regD),
   12171                                 mkexpr(mask) ),
   12172                           binop(Iop_AndV128,
   12173                                 mkexpr(olddata),
   12174                                 unop(Iop_NotV128, mkexpr(mask)))) );
   12175    storeLE( mkexpr(addr), mkexpr(newdata) );
   12176 
   12177    delta += 1;
   12178    DIP("%smaskmovdqu %s,%s\n", isAvx ? "v" : "",
   12179        nameXMMReg(rE), nameXMMReg(rG) );
   12180    return delta;
   12181 }
   12182 
   12183 
   12184 static Long dis_MOVMSKPS_128 ( const VexAbiInfo* vbi, Prefix pfx,
   12185                                Long delta, Bool isAvx )
   12186 {
   12187    UChar modrm = getUChar(delta);
   12188    UInt   rG   = gregOfRexRM(pfx,modrm);
   12189    UInt   rE   = eregOfRexRM(pfx,modrm);
   12190    IRTemp t0   = newTemp(Ity_I32);
   12191    IRTemp t1   = newTemp(Ity_I32);
   12192    IRTemp t2   = newTemp(Ity_I32);
   12193    IRTemp t3   = newTemp(Ity_I32);
   12194    delta += 1;
   12195    assign( t0, binop( Iop_And32,
   12196                       binop(Iop_Shr32, getXMMRegLane32(rE,0), mkU8(31)),
   12197                       mkU32(1) ));
   12198    assign( t1, binop( Iop_And32,
   12199                       binop(Iop_Shr32, getXMMRegLane32(rE,1), mkU8(30)),
   12200                       mkU32(2) ));
   12201    assign( t2, binop( Iop_And32,
   12202                       binop(Iop_Shr32, getXMMRegLane32(rE,2), mkU8(29)),
   12203                       mkU32(4) ));
   12204    assign( t3, binop( Iop_And32,
   12205                       binop(Iop_Shr32, getXMMRegLane32(rE,3), mkU8(28)),
   12206                       mkU32(8) ));
   12207    putIReg32( rG, binop(Iop_Or32,
   12208                         binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
   12209                         binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ) );
   12210    DIP("%smovmskps %s,%s\n", isAvx ? "v" : "",
   12211        nameXMMReg(rE), nameIReg32(rG));
   12212    return delta;
   12213 }
   12214 
   12215 
   12216 static Long dis_MOVMSKPS_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta )
   12217 {
   12218    UChar modrm = getUChar(delta);
   12219    UInt   rG   = gregOfRexRM(pfx,modrm);
   12220    UInt   rE   = eregOfRexRM(pfx,modrm);
   12221    IRTemp t0   = newTemp(Ity_I32);
   12222    IRTemp t1   = newTemp(Ity_I32);
   12223    IRTemp t2   = newTemp(Ity_I32);
   12224    IRTemp t3   = newTemp(Ity_I32);
   12225    IRTemp t4   = newTemp(Ity_I32);
   12226    IRTemp t5   = newTemp(Ity_I32);
   12227    IRTemp t6   = newTemp(Ity_I32);
   12228    IRTemp t7   = newTemp(Ity_I32);
   12229    delta += 1;
   12230    assign( t0, binop( Iop_And32,
   12231                       binop(Iop_Shr32, getYMMRegLane32(rE,0), mkU8(31)),
   12232                       mkU32(1) ));
   12233    assign( t1, binop( Iop_And32,
   12234                       binop(Iop_Shr32, getYMMRegLane32(rE,1), mkU8(30)),
   12235                       mkU32(2) ));
   12236    assign( t2, binop( Iop_And32,
   12237                       binop(Iop_Shr32, getYMMRegLane32(rE,2), mkU8(29)),
   12238                       mkU32(4) ));
   12239    assign( t3, binop( Iop_And32,
   12240                       binop(Iop_Shr32, getYMMRegLane32(rE,3), mkU8(28)),
   12241                       mkU32(8) ));
   12242    assign( t4, binop( Iop_And32,
   12243                       binop(Iop_Shr32, getYMMRegLane32(rE,4), mkU8(27)),
   12244                       mkU32(16) ));
   12245    assign( t5, binop( Iop_And32,
   12246                       binop(Iop_Shr32, getYMMRegLane32(rE,5), mkU8(26)),
   12247                       mkU32(32) ));
   12248    assign( t6, binop( Iop_And32,
   12249                       binop(Iop_Shr32, getYMMRegLane32(rE,6), mkU8(25)),
   12250                       mkU32(64) ));
   12251    assign( t7, binop( Iop_And32,
   12252                       binop(Iop_Shr32, getYMMRegLane32(rE,7), mkU8(24)),
   12253                       mkU32(128) ));
   12254    putIReg32( rG, binop(Iop_Or32,
   12255                         binop(Iop_Or32,
   12256                               binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
   12257                               binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ),
   12258                         binop(Iop_Or32,
   12259                               binop(Iop_Or32, mkexpr(t4), mkexpr(t5)),
   12260                               binop(Iop_Or32, mkexpr(t6), mkexpr(t7)) ) ) );
   12261    DIP("vmovmskps %s,%s\n", nameYMMReg(rE), nameIReg32(rG));
   12262    return delta;
   12263 }
   12264 
   12265 
   12266 static Long dis_MOVMSKPD_128 ( const VexAbiInfo* vbi, Prefix pfx,
   12267                                Long delta, Bool isAvx )
   12268 {
   12269    UChar modrm = getUChar(delta);
   12270    UInt   rG   = gregOfRexRM(pfx,modrm);
   12271    UInt   rE   = eregOfRexRM(pfx,modrm);
   12272    IRTemp t0   = newTemp(Ity_I32);
   12273    IRTemp t1   = newTemp(Ity_I32);
   12274    delta += 1;
   12275    assign( t0, binop( Iop_And32,
   12276                       binop(Iop_Shr32, getXMMRegLane32(rE,1), mkU8(31)),
   12277                       mkU32(1) ));
   12278    assign( t1, binop( Iop_And32,
   12279                       binop(Iop_Shr32, getXMMRegLane32(rE,3), mkU8(30)),
   12280                       mkU32(2) ));
   12281    putIReg32( rG, binop(Iop_Or32, mkexpr(t0), mkexpr(t1) ) );
   12282    DIP("%smovmskpd %s,%s\n", isAvx ? "v" : "",
   12283        nameXMMReg(rE), nameIReg32(rG));
   12284    return delta;
   12285 }
   12286 
   12287 
   12288 static Long dis_MOVMSKPD_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta )
   12289 {
   12290    UChar modrm = getUChar(delta);
   12291    UInt   rG   = gregOfRexRM(pfx,modrm);
   12292    UInt   rE   = eregOfRexRM(pfx,modrm);
   12293    IRTemp t0   = newTemp(Ity_I32);
   12294    IRTemp t1   = newTemp(Ity_I32);
   12295    IRTemp t2   = newTemp(Ity_I32);
   12296    IRTemp t3   = newTemp(Ity_I32);
   12297    delta += 1;
   12298    assign( t0, binop( Iop_And32,
   12299                       binop(Iop_Shr32, getYMMRegLane32(rE,1), mkU8(31)),
   12300                       mkU32(1) ));
   12301    assign( t1, binop( Iop_And32,
   12302                       binop(Iop_Shr32, getYMMRegLane32(rE,3), mkU8(30)),
   12303                       mkU32(2) ));
   12304    assign( t2, binop( Iop_And32,
   12305                       binop(Iop_Shr32, getYMMRegLane32(rE,5), mkU8(29)),
   12306                       mkU32(4) ));
   12307    assign( t3, binop( Iop_And32,
   12308                       binop(Iop_Shr32, getYMMRegLane32(rE,7), mkU8(28)),
   12309                       mkU32(8) ));
   12310    putIReg32( rG, binop(Iop_Or32,
   12311                         binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
   12312                         binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ) );
   12313    DIP("vmovmskps %s,%s\n", nameYMMReg(rE), nameIReg32(rG));
   12314    return delta;
   12315 }
   12316 
   12317 
   12318 /* Note, this also handles SSE(1) insns. */
   12319 __attribute__((noinline))
   12320 static
   12321 Long dis_ESC_0F__SSE2 ( Bool* decode_OK,
   12322                         const VexArchInfo* archinfo,
   12323                         const VexAbiInfo* vbi,
   12324                         Prefix pfx, Int sz, Long deltaIN,
   12325                         DisResult* dres )
   12326 {
   12327    IRTemp addr  = IRTemp_INVALID;
   12328    IRTemp t0    = IRTemp_INVALID;
   12329    IRTemp t1    = IRTemp_INVALID;
   12330    IRTemp t2    = IRTemp_INVALID;
   12331    IRTemp t3    = IRTemp_INVALID;
   12332    IRTemp t4    = IRTemp_INVALID;
   12333    IRTemp t5    = IRTemp_INVALID;
   12334    IRTemp t6    = IRTemp_INVALID;
   12335    UChar  modrm = 0;
   12336    Int    alen  = 0;
   12337    HChar  dis_buf[50];
   12338 
   12339    *decode_OK = False;
   12340 
   12341    Long   delta = deltaIN;
   12342    UChar  opc   = getUChar(delta);
   12343    delta++;
   12344    switch (opc) {
   12345 
   12346    case 0x10:
   12347       if (have66noF2noF3(pfx)
   12348           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   12349          /* 66 0F 10 = MOVUPD -- move from E (mem or xmm) to G (xmm). */
   12350          modrm = getUChar(delta);
   12351          if (epartIsReg(modrm)) {
   12352             putXMMReg( gregOfRexRM(pfx,modrm),
   12353                        getXMMReg( eregOfRexRM(pfx,modrm) ));
   12354             DIP("movupd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12355                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   12356             delta += 1;
   12357          } else {
   12358             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12359             putXMMReg( gregOfRexRM(pfx,modrm),
   12360                        loadLE(Ity_V128, mkexpr(addr)) );
   12361             DIP("movupd %s,%s\n", dis_buf,
   12362                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   12363             delta += alen;
   12364          }
   12365          goto decode_success;
   12366       }
   12367       /* F2 0F 10 = MOVSD -- move 64 bits from E (mem or lo half xmm) to
   12368          G (lo half xmm).  If E is mem, upper half of G is zeroed out.
   12369          If E is reg, upper half of G is unchanged. */
   12370       if (haveF2no66noF3(pfx)
   12371           && (sz == 4 || /* ignore redundant REX.W */ sz == 8) ) {
   12372          modrm = getUChar(delta);
   12373          if (epartIsReg(modrm)) {
   12374             putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
   12375                              getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
   12376             DIP("movsd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12377                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   12378             delta += 1;
   12379          } else {
   12380             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12381             putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
   12382             putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
   12383                              loadLE(Ity_I64, mkexpr(addr)) );
   12384             DIP("movsd %s,%s\n", dis_buf,
   12385                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   12386             delta += alen;
   12387          }
   12388          goto decode_success;
   12389       }
   12390       /* F3 0F 10 = MOVSS -- move 32 bits from E (mem or lo 1/4 xmm) to G
   12391          (lo 1/4 xmm).  If E is mem, upper 3/4 of G is zeroed out. */
   12392       if (haveF3no66noF2(pfx)
   12393           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12394          modrm = getUChar(delta);
   12395          if (epartIsReg(modrm)) {
   12396             putXMMRegLane32( gregOfRexRM(pfx,modrm), 0,
   12397                              getXMMRegLane32( eregOfRexRM(pfx,modrm), 0 ));
   12398             DIP("movss %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12399                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   12400             delta += 1;
   12401          } else {
   12402             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12403             putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
   12404             putXMMRegLane32( gregOfRexRM(pfx,modrm), 0,
   12405                              loadLE(Ity_I32, mkexpr(addr)) );
   12406             DIP("movss %s,%s\n", dis_buf,
   12407                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   12408             delta += alen;
   12409          }
   12410          goto decode_success;
   12411       }
   12412       /* 0F 10 = MOVUPS -- move from E (mem or xmm) to G (xmm). */
   12413       if (haveNo66noF2noF3(pfx)
   12414           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12415          modrm = getUChar(delta);
   12416          if (epartIsReg(modrm)) {
   12417             putXMMReg( gregOfRexRM(pfx,modrm),
   12418                        getXMMReg( eregOfRexRM(pfx,modrm) ));
   12419             DIP("movups %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12420                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   12421             delta += 1;
   12422          } else {
   12423             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12424             putXMMReg( gregOfRexRM(pfx,modrm),
   12425                        loadLE(Ity_V128, mkexpr(addr)) );
   12426             DIP("movups %s,%s\n", dis_buf,
   12427                                      nameXMMReg(gregOfRexRM(pfx,modrm)));
   12428             delta += alen;
   12429          }
   12430          goto decode_success;
   12431       }
   12432       break;
   12433 
   12434    case 0x11:
   12435       /* F2 0F 11 = MOVSD -- move 64 bits from G (lo half xmm) to E (mem
   12436          or lo half xmm). */
   12437       if (haveF2no66noF3(pfx)
   12438           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12439          modrm = getUChar(delta);
   12440          if (epartIsReg(modrm)) {
   12441             putXMMRegLane64( eregOfRexRM(pfx,modrm), 0,
   12442                              getXMMRegLane64( gregOfRexRM(pfx,modrm), 0 ));
   12443             DIP("movsd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   12444                                  nameXMMReg(eregOfRexRM(pfx,modrm)));
   12445             delta += 1;
   12446          } else {
   12447             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12448             storeLE( mkexpr(addr),
   12449                      getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) );
   12450             DIP("movsd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   12451                                  dis_buf);
   12452             delta += alen;
   12453          }
   12454          goto decode_success;
   12455       }
   12456       /* F3 0F 11 = MOVSS -- move 32 bits from G (lo 1/4 xmm) to E (mem
   12457          or lo 1/4 xmm). */
   12458       if (haveF3no66noF2(pfx) && sz == 4) {
   12459          modrm = getUChar(delta);
   12460          if (epartIsReg(modrm)) {
   12461             /* fall through, we don't yet have a test case */
   12462          } else {
   12463             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12464             storeLE( mkexpr(addr),
   12465                      getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) );
   12466             DIP("movss %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   12467                                  dis_buf);
   12468             delta += alen;
   12469             goto decode_success;
   12470          }
   12471       }
   12472       /* 66 0F 11 = MOVUPD -- move from G (xmm) to E (mem or xmm). */
   12473       if (have66noF2noF3(pfx)
   12474           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   12475          modrm = getUChar(delta);
   12476          if (epartIsReg(modrm)) {
   12477             putXMMReg( eregOfRexRM(pfx,modrm),
   12478                        getXMMReg( gregOfRexRM(pfx,modrm) ) );
   12479             DIP("movupd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   12480                                   nameXMMReg(eregOfRexRM(pfx,modrm)));
   12481             delta += 1;
   12482          } else {
   12483             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12484             storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   12485             DIP("movupd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   12486                                   dis_buf );
   12487             delta += alen;
   12488          }
   12489          goto decode_success;
   12490       }
   12491       /* 0F 11 = MOVUPS -- move from G (xmm) to E (mem or xmm). */
   12492       if (haveNo66noF2noF3(pfx)
   12493           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12494          modrm = getUChar(delta);
   12495          if (epartIsReg(modrm)) {
   12496             /* fall through; awaiting test case */
   12497          } else {
   12498             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12499             storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   12500             DIP("movups %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   12501                                   dis_buf );
   12502             delta += alen;
   12503             goto decode_success;
   12504          }
   12505       }
   12506       break;
   12507 
   12508    case 0x12:
   12509       /* 66 0F 12 = MOVLPD -- move from mem to low half of XMM. */
   12510       /* Identical to MOVLPS ? */
   12511       if (have66noF2noF3(pfx)
   12512           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   12513          modrm = getUChar(delta);
   12514          if (epartIsReg(modrm)) {
   12515             /* fall through; apparently reg-reg is not possible */
   12516          } else {
   12517             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12518             delta += alen;
   12519             putXMMRegLane64( gregOfRexRM(pfx,modrm),
   12520                              0/*lower lane*/,
   12521                              loadLE(Ity_I64, mkexpr(addr)) );
   12522             DIP("movlpd %s, %s\n",
   12523                 dis_buf, nameXMMReg( gregOfRexRM(pfx,modrm) ));
   12524             goto decode_success;
   12525          }
   12526       }
   12527       /* 0F 12 = MOVLPS -- move from mem to low half of XMM. */
   12528       /* OF 12 = MOVHLPS -- from from hi half to lo half of XMM. */
   12529       if (haveNo66noF2noF3(pfx)
   12530           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12531          modrm = getUChar(delta);
   12532          if (epartIsReg(modrm)) {
   12533             delta += 1;
   12534             putXMMRegLane64( gregOfRexRM(pfx,modrm),
   12535                              0/*lower lane*/,
   12536                              getXMMRegLane64( eregOfRexRM(pfx,modrm), 1 ));
   12537             DIP("movhlps %s, %s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12538                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   12539          } else {
   12540             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12541             delta += alen;
   12542             putXMMRegLane64( gregOfRexRM(pfx,modrm),  0/*lower lane*/,
   12543                              loadLE(Ity_I64, mkexpr(addr)) );
   12544             DIP("movlps %s, %s\n",
   12545                 dis_buf, nameXMMReg( gregOfRexRM(pfx,modrm) ));
   12546          }
   12547          goto decode_success;
   12548       }
   12549       break;
   12550 
   12551    case 0x13:
   12552       /* 0F 13 = MOVLPS -- move from low half of XMM to mem. */
   12553       if (haveNo66noF2noF3(pfx)
   12554           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12555          modrm = getUChar(delta);
   12556          if (!epartIsReg(modrm)) {
   12557             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12558             delta += alen;
   12559             storeLE( mkexpr(addr),
   12560                      getXMMRegLane64( gregOfRexRM(pfx,modrm),
   12561                                       0/*lower lane*/ ) );
   12562             DIP("movlps %s, %s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
   12563                                    dis_buf);
   12564             goto decode_success;
   12565          }
   12566          /* else fall through */
   12567       }
   12568       /* 66 0F 13 = MOVLPD -- move from low half of XMM to mem. */
   12569       /* Identical to MOVLPS ? */
   12570       if (have66noF2noF3(pfx)
   12571           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   12572          modrm = getUChar(delta);
   12573          if (!epartIsReg(modrm)) {
   12574             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12575             delta += alen;
   12576             storeLE( mkexpr(addr),
   12577                      getXMMRegLane64( gregOfRexRM(pfx,modrm),
   12578                                       0/*lower lane*/ ) );
   12579             DIP("movlpd %s, %s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
   12580                                    dis_buf);
   12581             goto decode_success;
   12582          }
   12583          /* else fall through */
   12584       }
   12585       break;
   12586 
   12587    case 0x14:
   12588    case 0x15:
   12589       /* 0F 14 = UNPCKLPS -- unpack and interleave low part F32s */
   12590       /* 0F 15 = UNPCKHPS -- unpack and interleave high part F32s */
   12591       /* These just appear to be special cases of SHUFPS */
   12592       if (haveNo66noF2noF3(pfx) && sz == 4) {
   12593          Bool   hi = toBool(opc == 0x15);
   12594          IRTemp sV = newTemp(Ity_V128);
   12595          IRTemp dV = newTemp(Ity_V128);
   12596          modrm = getUChar(delta);
   12597          UInt   rG = gregOfRexRM(pfx,modrm);
   12598          assign( dV, getXMMReg(rG) );
   12599          if (epartIsReg(modrm)) {
   12600             UInt rE = eregOfRexRM(pfx,modrm);
   12601             assign( sV, getXMMReg(rE) );
   12602             delta += 1;
   12603             DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   12604                 nameXMMReg(rE), nameXMMReg(rG));
   12605          } else {
   12606             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12607             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12608             delta += alen;
   12609             DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   12610                 dis_buf, nameXMMReg(rG));
   12611          }
   12612          IRTemp res = math_UNPCKxPS_128( sV, dV, hi );
   12613          putXMMReg( rG, mkexpr(res) );
   12614          goto decode_success;
   12615       }
   12616       /* 66 0F 15 = UNPCKHPD -- unpack and interleave high part F64s */
   12617       /* 66 0F 14 = UNPCKLPD -- unpack and interleave low part F64s */
   12618       /* These just appear to be special cases of SHUFPS */
   12619       if (have66noF2noF3(pfx)
   12620           && sz == 2 /* could be 8 if rex also present */) {
   12621          Bool   hi = toBool(opc == 0x15);
   12622          IRTemp sV = newTemp(Ity_V128);
   12623          IRTemp dV = newTemp(Ity_V128);
   12624          modrm = getUChar(delta);
   12625          UInt   rG = gregOfRexRM(pfx,modrm);
   12626          assign( dV, getXMMReg(rG) );
   12627          if (epartIsReg(modrm)) {
   12628             UInt rE = eregOfRexRM(pfx,modrm);
   12629             assign( sV, getXMMReg(rE) );
   12630             delta += 1;
   12631             DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   12632                 nameXMMReg(rE), nameXMMReg(rG));
   12633          } else {
   12634             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12635             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12636             delta += alen;
   12637             DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   12638                 dis_buf, nameXMMReg(rG));
   12639          }
   12640          IRTemp res = math_UNPCKxPD_128( sV, dV, hi );
   12641          putXMMReg( rG, mkexpr(res) );
   12642          goto decode_success;
   12643       }
   12644       break;
   12645 
   12646    case 0x16:
   12647       /* 66 0F 16 = MOVHPD -- move from mem to high half of XMM. */
   12648       /* These seems identical to MOVHPS.  This instruction encoding is
   12649          completely crazy. */
   12650       if (have66noF2noF3(pfx)
   12651           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   12652          modrm = getUChar(delta);
   12653          if (epartIsReg(modrm)) {
   12654             /* fall through; apparently reg-reg is not possible */
   12655          } else {
   12656             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12657             delta += alen;
   12658             putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
   12659                              loadLE(Ity_I64, mkexpr(addr)) );
   12660             DIP("movhpd %s,%s\n", dis_buf,
   12661                                   nameXMMReg( gregOfRexRM(pfx,modrm) ));
   12662             goto decode_success;
   12663          }
   12664       }
   12665       /* 0F 16 = MOVHPS -- move from mem to high half of XMM. */
   12666       /* 0F 16 = MOVLHPS -- move from lo half to hi half of XMM. */
   12667       if (haveNo66noF2noF3(pfx)
   12668           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12669          modrm = getUChar(delta);
   12670          if (epartIsReg(modrm)) {
   12671             delta += 1;
   12672             putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
   12673                              getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ) );
   12674             DIP("movhps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12675                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   12676          } else {
   12677             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12678             delta += alen;
   12679             putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
   12680                              loadLE(Ity_I64, mkexpr(addr)) );
   12681             DIP("movhps %s,%s\n", dis_buf,
   12682                                   nameXMMReg( gregOfRexRM(pfx,modrm) ));
   12683          }
   12684          goto decode_success;
   12685       }
   12686       break;
   12687 
   12688    case 0x17:
   12689       /* 0F 17 = MOVHPS -- move from high half of XMM to mem. */
   12690       if (haveNo66noF2noF3(pfx)
   12691           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12692          modrm = getUChar(delta);
   12693          if (!epartIsReg(modrm)) {
   12694             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12695             delta += alen;
   12696             storeLE( mkexpr(addr),
   12697                      getXMMRegLane64( gregOfRexRM(pfx,modrm),
   12698                                       1/*upper lane*/ ) );
   12699             DIP("movhps %s,%s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
   12700                                   dis_buf);
   12701             goto decode_success;
   12702          }
   12703          /* else fall through */
   12704       }
   12705       /* 66 0F 17 = MOVHPD -- move from high half of XMM to mem. */
   12706       /* Again, this seems identical to MOVHPS. */
   12707       if (have66noF2noF3(pfx)
   12708           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   12709          modrm = getUChar(delta);
   12710          if (!epartIsReg(modrm)) {
   12711             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12712             delta += alen;
   12713             storeLE( mkexpr(addr),
   12714                      getXMMRegLane64( gregOfRexRM(pfx,modrm),
   12715                                       1/*upper lane*/ ) );
   12716             DIP("movhpd %s,%s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
   12717                                   dis_buf);
   12718             goto decode_success;
   12719          }
   12720          /* else fall through */
   12721       }
   12722       break;
   12723 
   12724    case 0x18:
   12725       /* 0F 18 /0 = PREFETCHNTA -- prefetch into caches, */
   12726       /* 0F 18 /1 = PREFETCH0   -- with various different hints */
   12727       /* 0F 18 /2 = PREFETCH1 */
   12728       /* 0F 18 /3 = PREFETCH2 */
   12729       if (haveNo66noF2noF3(pfx)
   12730           && !epartIsReg(getUChar(delta))
   12731           && gregLO3ofRM(getUChar(delta)) >= 0
   12732           && gregLO3ofRM(getUChar(delta)) <= 3) {
   12733          const HChar* hintstr = "??";
   12734 
   12735          modrm = getUChar(delta);
   12736          vassert(!epartIsReg(modrm));
   12737 
   12738          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12739          delta += alen;
   12740 
   12741          switch (gregLO3ofRM(modrm)) {
   12742             case 0: hintstr = "nta"; break;
   12743             case 1: hintstr = "t0"; break;
   12744             case 2: hintstr = "t1"; break;
   12745             case 3: hintstr = "t2"; break;
   12746             default: vassert(0);
   12747          }
   12748 
   12749          DIP("prefetch%s %s\n", hintstr, dis_buf);
   12750          goto decode_success;
   12751       }
   12752       break;
   12753 
   12754    case 0x28:
   12755       /* 66 0F 28 = MOVAPD -- move from E (mem or xmm) to G (xmm). */
   12756       if (have66noF2noF3(pfx)
   12757           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   12758          modrm = getUChar(delta);
   12759          if (epartIsReg(modrm)) {
   12760             putXMMReg( gregOfRexRM(pfx,modrm),
   12761                        getXMMReg( eregOfRexRM(pfx,modrm) ));
   12762             DIP("movapd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12763                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   12764             delta += 1;
   12765          } else {
   12766             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12767             gen_SEGV_if_not_16_aligned( addr );
   12768             putXMMReg( gregOfRexRM(pfx,modrm),
   12769                        loadLE(Ity_V128, mkexpr(addr)) );
   12770             DIP("movapd %s,%s\n", dis_buf,
   12771                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   12772             delta += alen;
   12773          }
   12774          goto decode_success;
   12775       }
   12776       /* 0F 28 = MOVAPS -- move from E (mem or xmm) to G (xmm). */
   12777       if (haveNo66noF2noF3(pfx)
   12778           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12779          modrm = getUChar(delta);
   12780          if (epartIsReg(modrm)) {
   12781             putXMMReg( gregOfRexRM(pfx,modrm),
   12782                        getXMMReg( eregOfRexRM(pfx,modrm) ));
   12783             DIP("movaps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12784                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   12785             delta += 1;
   12786          } else {
   12787             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12788             gen_SEGV_if_not_16_aligned( addr );
   12789             putXMMReg( gregOfRexRM(pfx,modrm),
   12790                        loadLE(Ity_V128, mkexpr(addr)) );
   12791             DIP("movaps %s,%s\n", dis_buf,
   12792                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   12793             delta += alen;
   12794          }
   12795          goto decode_success;
   12796       }
   12797       break;
   12798 
   12799    case 0x29:
   12800       /* 0F 29 = MOVAPS -- move from G (xmm) to E (mem or xmm). */
   12801       if (haveNo66noF2noF3(pfx)
   12802           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12803          modrm = getUChar(delta);
   12804          if (epartIsReg(modrm)) {
   12805             putXMMReg( eregOfRexRM(pfx,modrm),
   12806                        getXMMReg( gregOfRexRM(pfx,modrm) ));
   12807             DIP("movaps %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   12808                                   nameXMMReg(eregOfRexRM(pfx,modrm)));
   12809             delta += 1;
   12810          } else {
   12811             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12812             gen_SEGV_if_not_16_aligned( addr );
   12813             storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   12814             DIP("movaps %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   12815                                   dis_buf );
   12816             delta += alen;
   12817          }
   12818          goto decode_success;
   12819       }
   12820       /* 66 0F 29 = MOVAPD -- move from G (xmm) to E (mem or xmm). */
   12821       if (have66noF2noF3(pfx)
   12822           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   12823          modrm = getUChar(delta);
   12824          if (epartIsReg(modrm)) {
   12825             putXMMReg( eregOfRexRM(pfx,modrm),
   12826                        getXMMReg( gregOfRexRM(pfx,modrm) ) );
   12827             DIP("movapd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   12828                                   nameXMMReg(eregOfRexRM(pfx,modrm)));
   12829             delta += 1;
   12830          } else {
   12831             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12832             gen_SEGV_if_not_16_aligned( addr );
   12833             storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   12834             DIP("movapd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   12835                                   dis_buf );
   12836             delta += alen;
   12837          }
   12838          goto decode_success;
   12839       }
   12840       break;
   12841 
   12842    case 0x2A:
   12843       /* 0F 2A = CVTPI2PS -- convert 2 x I32 in mem/mmx to 2 x F32 in low
   12844          half xmm */
   12845       if (haveNo66noF2noF3(pfx) && sz == 4) {
   12846          IRTemp arg64 = newTemp(Ity_I64);
   12847          IRTemp rmode = newTemp(Ity_I32);
   12848 
   12849          modrm = getUChar(delta);
   12850          do_MMX_preamble();
   12851          if (epartIsReg(modrm)) {
   12852             assign( arg64, getMMXReg(eregLO3ofRM(modrm)) );
   12853             delta += 1;
   12854             DIP("cvtpi2ps %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
   12855                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   12856          } else {
   12857             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12858             assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   12859             delta += alen;
   12860             DIP("cvtpi2ps %s,%s\n", dis_buf,
   12861                                     nameXMMReg(gregOfRexRM(pfx,modrm)) );
   12862          }
   12863 
   12864          assign( rmode, get_sse_roundingmode() );
   12865 
   12866          putXMMRegLane32F(
   12867             gregOfRexRM(pfx,modrm), 0,
   12868             binop(Iop_F64toF32,
   12869                   mkexpr(rmode),
   12870                   unop(Iop_I32StoF64,
   12871                        unop(Iop_64to32, mkexpr(arg64)) )) );
   12872 
   12873          putXMMRegLane32F(
   12874             gregOfRexRM(pfx,modrm), 1,
   12875             binop(Iop_F64toF32,
   12876                   mkexpr(rmode),
   12877                   unop(Iop_I32StoF64,
   12878                        unop(Iop_64HIto32, mkexpr(arg64)) )) );
   12879 
   12880          goto decode_success;
   12881       }
      /* F3 0F 2A = CVTSI2SS
         -- sz==4: convert I32 in mem/ireg to F32 in low quarter xmm
         -- sz==8: convert I64 in mem/ireg to F32 in low quarter xmm */
      if (haveF3no66noF2(pfx) && (sz == 4 || sz == 8)) {
         IRTemp rmode = newTemp(Ity_I32);
         assign( rmode, get_sse_roundingmode() );
         modrm = getUChar(delta);
         if (sz == 4) {
            IRTemp arg32 = newTemp(Ity_I32);
            if (epartIsReg(modrm)) {
               assign( arg32, getIReg32(eregOfRexRM(pfx,modrm)) );
               delta += 1;
               DIP("cvtsi2ss %s,%s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
                                       nameXMMReg(gregOfRexRM(pfx,modrm)));
            } else {
               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
               assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
               delta += alen;
               DIP("cvtsi2ss %s,%s\n", dis_buf,
                                       nameXMMReg(gregOfRexRM(pfx,modrm)) );
            }
            /* I32->F64 is exact (hence no rounding mode on the unop);
               the only rounding is the final F64->F32 narrowing, done
               under the SSE rounding mode.  Only lane 0 is written. */
            putXMMRegLane32F(
               gregOfRexRM(pfx,modrm), 0,
               binop(Iop_F64toF32,
                     mkexpr(rmode),
                     unop(Iop_I32StoF64, mkexpr(arg32)) ) );
         } else {
            /* sz == 8 */
            IRTemp arg64 = newTemp(Ity_I64);
            if (epartIsReg(modrm)) {
               assign( arg64, getIReg64(eregOfRexRM(pfx,modrm)) );
               delta += 1;
               DIP("cvtsi2ssq %s,%s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
            } else {
               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
               assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
               delta += alen;
               DIP("cvtsi2ssq %s,%s\n", dis_buf,
                                        nameXMMReg(gregOfRexRM(pfx,modrm)) );
            }
            /* I64->F64 can itself round (so the conversion takes rmode),
               and the result is then narrowed to F32, also under rmode. */
            putXMMRegLane32F(
               gregOfRexRM(pfx,modrm), 0,
               binop(Iop_F64toF32,
                     mkexpr(rmode),
                     binop(Iop_I64StoF64, mkexpr(rmode), mkexpr(arg64)) ) );
         }
         goto decode_success;
      }
      /* F2 0F 2A = CVTSI2SD
         when sz==4 -- convert I32 in mem/ireg to F64 in low half xmm
         when sz==8 -- convert I64 in mem/ireg to F64 in low half xmm
      */
      if (haveF2no66noF3(pfx) && (sz == 4 || sz == 8)) {
         modrm = getUChar(delta);
         if (sz == 4) {
            IRTemp arg32 = newTemp(Ity_I32);
            if (epartIsReg(modrm)) {
               assign( arg32, getIReg32(eregOfRexRM(pfx,modrm)) );
               delta += 1;
               DIP("cvtsi2sdl %s,%s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
            } else {
               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
               assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
               delta += alen;
               DIP("cvtsi2sdl %s,%s\n", dis_buf,
                                        nameXMMReg(gregOfRexRM(pfx,modrm)) );
            }
            /* I32->F64 is exact, so no rounding mode is required; only
               lane 0 (the low 64 bits) of the destination is written. */
            putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
                              unop(Iop_I32StoF64, mkexpr(arg32))
            );
         } else {
            /* sz == 8 */
            IRTemp arg64 = newTemp(Ity_I64);
            if (epartIsReg(modrm)) {
               assign( arg64, getIReg64(eregOfRexRM(pfx,modrm)) );
               delta += 1;
               DIP("cvtsi2sdq %s,%s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
            } else {
               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
               assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
               delta += alen;
               DIP("cvtsi2sdq %s,%s\n", dis_buf,
                                        nameXMMReg(gregOfRexRM(pfx,modrm)) );
            }
            /* I64->F64 may round, so it is done under the prevailing
               SSE rounding mode. */
            putXMMRegLane64F(
               gregOfRexRM(pfx,modrm),
               0,
               binop( Iop_I64StoF64,
                      get_sse_roundingmode(),
                      mkexpr(arg64)
               )
            );
         }
         goto decode_success;
      }
      /* 66 0F 2A = CVTPI2PD -- convert 2 x I32 in mem/mmx to 2 x F64 in
         xmm(G) */
      if (have66noF2noF3(pfx) && sz == 2) {
         IRTemp arg64 = newTemp(Ity_I64);

         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            /* Only switch to MMX mode if the source is a MMX register.
               This is inconsistent with all other instructions which
               convert between XMM and (M64 or MMX), which always switch
               to MMX mode even if 64-bit operand is M64 and not MMX.  At
               least, that's what the Intel docs seem to me to say.
               Fixes #210264. */
            do_MMX_preamble();
            assign( arg64, getMMXReg(eregLO3ofRM(modrm)) );
            delta += 1;
            DIP("cvtpi2pd %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
            delta += alen;
            DIP("cvtpi2pd %s,%s\n", dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)) );
         }

         /* I32->F64 is exact, so no rounding mode is needed.  Both
            64-bit lanes of the destination are written:
            low I32 -> lane 0, high I32 -> lane 1. */
         putXMMRegLane64F(
            gregOfRexRM(pfx,modrm), 0,
            unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)) )
         );

         putXMMRegLane64F(
            gregOfRexRM(pfx,modrm), 1,
            unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)) )
         );

         goto decode_success;
      }
      break;
   13019 
   case 0x2B:
      /* 66 0F 2B = MOVNTPD -- for us, just a plain SSE store. */
      /* 0F 2B = MOVNTPS -- for us, just a plain SSE store. */
      /* The non-temporal hint is ignored: we model both as ordinary
         16-byte stores, faulting (SEGV) if the address is not 16-aligned.
         The register form of the modrm is not accepted here. */
      if ( (haveNo66noF2noF3(pfx) && sz == 4)
           || (have66noF2noF3(pfx) && sz == 2) ) {
         modrm = getUChar(delta);
         if (!epartIsReg(modrm)) {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movntp%s %s,%s\n", sz==2 ? "d" : "s",
                                    dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += alen;
            goto decode_success;
         }
         /* else fall through */
      }
      break;
   13039 
   13040    case 0x2C:
   13041    case 0x2D:
   13042       /* 0F 2D = CVTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
   13043          I32 in mmx, according to prevailing SSE rounding mode */
   13044       /* 0F 2C = CVTTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
   13045          I32 in mmx, rounding towards zero */
   13046       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13047          IRTemp dst64  = newTemp(Ity_I64);
   13048          IRTemp rmode  = newTemp(Ity_I32);
   13049          IRTemp f32lo  = newTemp(Ity_F32);
   13050          IRTemp f32hi  = newTemp(Ity_F32);
   13051          Bool   r2zero = toBool(opc == 0x2C);
   13052 
   13053          do_MMX_preamble();
   13054          modrm = getUChar(delta);
   13055 
   13056          if (epartIsReg(modrm)) {
   13057             delta += 1;
   13058             assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
   13059             assign(f32hi, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 1));
   13060             DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
   13061                                       nameXMMReg(eregOfRexRM(pfx,modrm)),
   13062                                       nameMMXReg(gregLO3ofRM(modrm)));
   13063          } else {
   13064             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13065             assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
   13066             assign(f32hi, loadLE(Ity_F32, binop( Iop_Add64,
   13067                                                  mkexpr(addr),
   13068                                                  mkU64(4) )));
   13069             delta += alen;
   13070             DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
   13071                                       dis_buf,
   13072                                       nameMMXReg(gregLO3ofRM(modrm)));
   13073          }
   13074 
   13075          if (r2zero) {
   13076             assign(rmode, mkU32((UInt)Irrm_ZERO) );
   13077          } else {
   13078             assign( rmode, get_sse_roundingmode() );
   13079          }
   13080 
   13081          assign(
   13082             dst64,
   13083             binop( Iop_32HLto64,
   13084                    binop( Iop_F64toI32S,
   13085                           mkexpr(rmode),
   13086                           unop( Iop_F32toF64, mkexpr(f32hi) ) ),
   13087                    binop( Iop_F64toI32S,
   13088                           mkexpr(rmode),
   13089                           unop( Iop_F32toF64, mkexpr(f32lo) ) )
   13090                  )
   13091          );
   13092 
   13093          putMMXReg(gregLO3ofRM(modrm), mkexpr(dst64));
   13094          goto decode_success;
   13095       }
   13096       /* F3 0F 2D = CVTSS2SI
   13097          when sz==4 -- convert F32 in mem/low quarter xmm to I32 in ireg,
   13098                        according to prevailing SSE rounding mode
   13099          when sz==8 -- convert F32 in mem/low quarter xmm to I64 in ireg,
   13100                        according to prevailing SSE rounding mode
   13101       */
   13102       /* F3 0F 2C = CVTTSS2SI
   13103          when sz==4 -- convert F32 in mem/low quarter xmm to I32 in ireg,
   13104                        truncating towards zero
   13105          when sz==8 -- convert F32 in mem/low quarter xmm to I64 in ireg,
   13106                        truncating towards zero
   13107       */
   13108       if (haveF3no66noF2(pfx) && (sz == 4 || sz == 8)) {
   13109          delta = dis_CVTxSS2SI( vbi, pfx, delta, False/*!isAvx*/, opc, sz);
   13110          goto decode_success;
   13111       }
   13112       /* F2 0F 2D = CVTSD2SI
   13113          when sz==4 -- convert F64 in mem/low half xmm to I32 in ireg,
   13114                        according to prevailing SSE rounding mode
   13115          when sz==8 -- convert F64 in mem/low half xmm to I64 in ireg,
   13116                        according to prevailing SSE rounding mode
   13117       */
   13118       /* F2 0F 2C = CVTTSD2SI
   13119          when sz==4 -- convert F64 in mem/low half xmm to I32 in ireg,
   13120                        truncating towards zero
   13121          when sz==8 -- convert F64 in mem/low half xmm to I64 in ireg,
   13122                        truncating towards zero
   13123       */
   13124       if (haveF2no66noF3(pfx) && (sz == 4 || sz == 8)) {
   13125          delta = dis_CVTxSD2SI( vbi, pfx, delta, False/*!isAvx*/, opc, sz);
   13126          goto decode_success;
   13127       }
   13128       /* 66 0F 2D = CVTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
   13129          I32 in mmx, according to prevailing SSE rounding mode */
   13130       /* 66 0F 2C = CVTTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
   13131          I32 in mmx, rounding towards zero */
   13132       if (have66noF2noF3(pfx) && sz == 2) {
   13133          IRTemp dst64  = newTemp(Ity_I64);
   13134          IRTemp rmode  = newTemp(Ity_I32);
   13135          IRTemp f64lo  = newTemp(Ity_F64);
   13136          IRTemp f64hi  = newTemp(Ity_F64);
   13137          Bool   r2zero = toBool(opc == 0x2C);
   13138 
   13139          do_MMX_preamble();
   13140          modrm = getUChar(delta);
   13141 
   13142          if (epartIsReg(modrm)) {
   13143             delta += 1;
   13144             assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
   13145             assign(f64hi, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 1));
   13146             DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
   13147                                       nameXMMReg(eregOfRexRM(pfx,modrm)),
   13148                                       nameMMXReg(gregLO3ofRM(modrm)));
   13149          } else {
   13150             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13151             assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
   13152             assign(f64hi, loadLE(Ity_F64, binop( Iop_Add64,
   13153                                                  mkexpr(addr),
   13154                                                  mkU64(8) )));
   13155             delta += alen;
   13156             DIP("cvt%spf2pi %s,%s\n", r2zero ? "t" : "",
   13157                                       dis_buf,
   13158                                       nameMMXReg(gregLO3ofRM(modrm)));
   13159          }
   13160 
   13161          if (r2zero) {
   13162             assign(rmode, mkU32((UInt)Irrm_ZERO) );
   13163          } else {
   13164             assign( rmode, get_sse_roundingmode() );
   13165          }
   13166 
   13167          assign(
   13168             dst64,
   13169             binop( Iop_32HLto64,
   13170                    binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64hi) ),
   13171                    binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo) )
   13172                  )
   13173          );
   13174 
   13175          putMMXReg(gregLO3ofRM(modrm), mkexpr(dst64));
   13176          goto decode_success;
   13177       }
   13178       break;
   13179 
   case 0x2E:
   case 0x2F:
      /* Both opcodes are handled by the same helper; opc distinguishes
         the unordered (0x2E) from the ordered (0x2F) compare. */
      /* 66 0F 2E = UCOMISD -- 64F0x2 comparison G,E, and set ZCP */
      /* 66 0F 2F = COMISD  -- 64F0x2 comparison G,E, and set ZCP */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_COMISD( vbi, pfx, delta, False/*!isAvx*/, opc );
         goto decode_success;
      }
      /* 0F 2E = UCOMISS -- 32F0x4 comparison G,E, and set ZCP */
      /* 0F 2F = COMISS  -- 32F0x4 comparison G,E, and set ZCP */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_COMISS( vbi, pfx, delta, False/*!isAvx*/, opc );
         goto decode_success;
      }
      break;
   13195 
   case 0x50:
      /* 0F 50 = MOVMSKPS - move 4 sign bits from 4 x F32 in xmm(E)
         to 4 lowest bits of ireg(G) */
      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
          && epartIsReg(getUChar(delta))) {
         /* sz == 8 is a kludge to handle insns with REX.W redundantly
            set to 1, which has been known to happen:

            4c 0f 50 d9             rex64X movmskps %xmm1,%r11d

            20071106: Intel docs say that REX.W isn't redundant: when
            present, a 64-bit register is written; when not present, only
            the 32-bit half is written.  However, testing on a Core2
            machine suggests the entire 64 bit register is written
            irrespective of the status of REX.W.  That could be because
            of the default rule that says "if the lower half of a 32-bit
            register is written, the upper half is zeroed".  By using
            putIReg32 here we inadvertently produce the same behaviour as
            the Core2, for the same reason -- putIReg32 implements said
            rule.

            AMD docs give no indication that REX.W is even valid for this
            insn. */
         delta = dis_MOVMSKPS_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      /* 66 0F 50 = MOVMSKPD - move 2 sign bits from 2 x F64 in xmm(E) to
         2 lowest bits of ireg(G) */
      if (have66noF2noF3(pfx) && (sz == 2 || sz == 8)) {
         /* sz == 8 is a kludge to handle insns with REX.W redundantly
            set to 1, which has been known to happen:
            66 4c 0f 50 d9          rex64X movmskpd %xmm1,%r11d
            20071106: see further comments on MOVMSKPS implementation above.
         */
         delta = dis_MOVMSKPD_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;
   13234 
   case 0x51:
      /* 0F 51 family: square roots.  The prefix selects scalar-single,
         packed-single, scalar-double or packed-double; each helper
         decodes the modrm itself and returns the updated delta. */
      /* F3 0F 51 = SQRTSS -- approx sqrt 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
                                            "sqrtss", Iop_Sqrt32F0x4 );
         goto decode_success;
      }
      /* 0F 51 = SQRTPS -- approx sqrt 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
                                           "sqrtps", Iop_Sqrt32Fx4 );
         goto decode_success;
      }
      /* F2 0F 51 = SQRTSD -- approx sqrt 64F0x2 from R/M to R */
      if (haveF2no66noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_lo64( vbi, pfx, delta,
                                            "sqrtsd", Iop_Sqrt64F0x2 );
         goto decode_success;
      }
      /* 66 0F 51 = SQRTPD -- approx sqrt 64Fx2 from R/M to R */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
                                           "sqrtpd", Iop_Sqrt64Fx2 );
         goto decode_success;
      }
      break;
   13261 
   case 0x52:
      /* 0F 52 family: reciprocal square-root estimates (no 66/F2
         variants exist for this opcode). */
      /* F3 0F 52 = RSQRTSS -- approx reciprocal sqrt 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
                                            "rsqrtss", Iop_RSqrtEst32F0x4 );
         goto decode_success;
      }
      /* 0F 52 = RSQRTPS -- approx reciprocal sqrt 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
                                           "rsqrtps", Iop_RSqrtEst32Fx4 );
         goto decode_success;
      }
      break;
   13276 
   case 0x53:
      /* 0F 53 family: reciprocal estimates (no 66/F2 variants exist
         for this opcode). */
      /* F3 0F 53 = RCPSS -- approx reciprocal 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
                                            "rcpss", Iop_RecipEst32F0x4 );
         goto decode_success;
      }
      /* 0F 53 = RCPPS -- approx reciprocal 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
                                           "rcpps", Iop_RecipEst32Fx4 );
         goto decode_success;
      }
      break;
   13291 
   case 0x54:
      /* Bitwise AND of the full 128-bit registers; PS and PD forms are
         operationally identical here. */
      /* 0F 54 = ANDPS -- G = G and E */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "andps", Iop_AndV128 );
         goto decode_success;
      }
      /* 66 0F 54 = ANDPD -- G = G and E */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "andpd", Iop_AndV128 );
         goto decode_success;
      }
      break;
   13304 
   case 0x55:
      /* AND-NOT: the _invG helper complements G before ANDing with E. */
      /* 0F 55 = ANDNPS -- G = (not G) and E */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "andnps",
                                                           Iop_AndV128 );
         goto decode_success;
      }
      /* 66 0F 55 = ANDNPD -- G = (not G) and E */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "andnpd",
                                                           Iop_AndV128 );
         goto decode_success;
      }
      break;
   13319 
   case 0x56:
      /* Bitwise OR of the full 128-bit registers. */
      /* 0F 56 = ORPS -- G = G or E */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "orps", Iop_OrV128 );
         goto decode_success;
      }
      /* 66 0F 56 = ORPD -- G = G or E */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "orpd", Iop_OrV128 );
         goto decode_success;
      }
      break;
   13332 
   case 0x57:
      /* Bitwise XOR of the full 128-bit registers. */
      /* 66 0F 57 = XORPD -- G = G xor E */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "xorpd", Iop_XorV128 );
         goto decode_success;
      }
      /* 0F 57 = XORPS -- G = G xor E */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "xorps", Iop_XorV128 );
         goto decode_success;
      }
      break;
   13345 
   case 0x58:
      /* 0F 58 family: FP adds.  The scalar-double and packed-double
         forms also accept a redundant REX.W (sz == 8). */
      /* 0F 58 = ADDPS -- add 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "addps", Iop_Add32Fx4 );
         goto decode_success;
      }
      /* F3 0F 58 = ADDSS -- add 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "addss", Iop_Add32F0x4 );
         goto decode_success;
      }
      /* F2 0F 58 = ADDSD -- add 64F0x2 from R/M to R */
      if (haveF2no66noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "addsd", Iop_Add64F0x2 );
         goto decode_success;
      }
      /* 66 0F 58 = ADDPD -- add 64Fx2 from R/M to R */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "addpd", Iop_Add64Fx2 );
         goto decode_success;
      }
      break;
   13370 
   case 0x59:
      /* 0F 59 family: FP multiplies.  The scalar-double and
         packed-double forms also accept a redundant REX.W (sz == 8). */
      /* F2 0F 59 = MULSD -- mul 64F0x2 from R/M to R */
      if (haveF2no66noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "mulsd", Iop_Mul64F0x2 );
         goto decode_success;
      }
      /* F3 0F 59 = MULSS -- mul 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "mulss", Iop_Mul32F0x4 );
         goto decode_success;
      }
      /* 0F 59 = MULPS -- mul 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "mulps", Iop_Mul32Fx4 );
         goto decode_success;
      }
      /* 66 0F 59 = MULPD -- mul 64Fx2 from R/M to R */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "mulpd", Iop_Mul64Fx2 );
         goto decode_success;
      }
      break;
   13395 
   case 0x5A:
      /* 0F 5A family: conversions between single and double precision. */
      /* 0F 5A = CVTPS2PD -- convert 2 x F32 in low half mem/xmm to 2 x
         F64 in xmm(G). */
      if (haveNo66noF2noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         delta = dis_CVTPS2PD_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      /* F3 0F 5A = CVTSS2SD -- convert F32 in mem/low 1/4 xmm to F64 in
         low half xmm(G) */
      if (haveF3no66noF2(pfx) && sz == 4) {
         IRTemp f32lo = newTemp(Ity_F32);

         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta += 1;
            assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
            DIP("cvtss2sd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
            delta += alen;
            DIP("cvtss2sd %s,%s\n", dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         }

         /* F32->F64 widening is exact, so no rounding mode is needed;
            only the low 64-bit lane of the destination is written. */
         putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
                           unop( Iop_F32toF64, mkexpr(f32lo) ) );

         goto decode_success;
      }
      /* F2 0F 5A = CVTSD2SS -- convert F64 in mem/low half xmm to F32 in
         low 1/4 xmm(G), according to prevailing SSE rounding mode */
      if (haveF2no66noF3(pfx) && sz == 4) {
         IRTemp rmode = newTemp(Ity_I32);
         IRTemp f64lo = newTemp(Ity_F64);

         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta += 1;
            assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
            DIP("cvtsd2ss %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
            delta += alen;
            DIP("cvtsd2ss %s,%s\n", dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         }

         /* F64->F32 narrowing rounds, so it is done under the SSE
            rounding mode; only lane 0 of the destination is written. */
         assign( rmode, get_sse_roundingmode() );
         putXMMRegLane32F(
            gregOfRexRM(pfx,modrm), 0,
            binop( Iop_F64toF32, mkexpr(rmode), mkexpr(f64lo) )
         );

         goto decode_success;
      }
      /* 66 0F 5A = CVTPD2PS -- convert 2 x F64 in mem/xmm to 2 x F32 in
         lo half xmm(G), rounding according to prevailing SSE rounding
         mode, and zero upper half */
      /* Note, this is practically identical to CVTPD2DQ.  It would have
         been nice to merge them together. */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_CVTPD2PS_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;
   13466 
   case 0x5B:
      /* 0F 5B family: conversions between packed I32 and packed F32. */
      /* F3 0F 5B = CVTTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
         xmm(G), rounding towards zero */
      /* 66 0F 5B = CVTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
         xmm(G), as per the prevailing rounding mode */
      if ( (have66noF2noF3(pfx) && sz == 2)
           || (haveF3no66noF2(pfx) && sz == 4) ) {
         Bool r2zero = toBool(sz == 4); // FIXME -- unreliable (???)
         delta = dis_CVTxPS2DQ_128( vbi, pfx, delta, False/*!isAvx*/, r2zero );
         goto decode_success;
      }
      /* 0F 5B = CVTDQ2PS -- convert 4 x I32 in mem/xmm to 4 x F32 in
         xmm(G) */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_CVTDQ2PS_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;
   13485 
   case 0x5C:
      /* 0F 5C family: FP subtracts.  The scalar-double form also
         accepts a redundant REX.W (sz == 8). */
      /* F3 0F 5C = SUBSS -- sub 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "subss", Iop_Sub32F0x4 );
         goto decode_success;
      }
      /* F2 0F 5C = SUBSD -- sub 64F0x2 from R/M to R */
      if (haveF2no66noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "subsd", Iop_Sub64F0x2 );
         goto decode_success;
      }
      /* 0F 5C = SUBPS -- sub 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "subps", Iop_Sub32Fx4 );
         goto decode_success;
      }
      /* 66 0F 5C = SUBPD -- sub 64Fx2 from R/M to R */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "subpd", Iop_Sub64Fx2 );
         goto decode_success;
      }
      break;
   13509 
   case 0x5D:
      /* 0F 5D family: FP minimums. */
      /* 0F 5D = MINPS -- min 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "minps", Iop_Min32Fx4 );
         goto decode_success;
      }
      /* F3 0F 5D = MINSS -- min 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "minss", Iop_Min32F0x4 );
         goto decode_success;
      }
      /* F2 0F 5D = MINSD -- min 64F0x2 from R/M to R */
      if (haveF2no66noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "minsd", Iop_Min64F0x2 );
         goto decode_success;
      }
      /* 66 0F 5D = MINPD -- min 64Fx2 from R/M to R */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "minpd", Iop_Min64Fx2 );
         goto decode_success;
      }
      break;
   13532 
   case 0x5E:
      /* 0F 5E family: FP divides. */
      /* F2 0F 5E = DIVSD -- div 64F0x2 from R/M to R */
      if (haveF2no66noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "divsd", Iop_Div64F0x2 );
         goto decode_success;
      }
      /* 0F 5E = DIVPS -- div 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "divps", Iop_Div32Fx4 );
         goto decode_success;
      }
      /* F3 0F 5E = DIVSS -- div 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "divss", Iop_Div32F0x4 );
         goto decode_success;
      }
      /* 66 0F 5E = DIVPD -- div 64Fx2 from R/M to R */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "divpd", Iop_Div64Fx2 );
         goto decode_success;
      }
      break;
   13555 
   13556    case 0x5F:
   13557       /* 0F 5F = MAXPS -- max 32Fx4 from R/M to R */
   13558       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13559          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "maxps", Iop_Max32Fx4 );
   13560          goto decode_success;
   13561       }
   13562       /* F3 0F 5F = MAXSS -- max 32F0x4 from R/M to R */
   13563       if (haveF3no66noF2(pfx) && sz == 4) {
   13564          delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "maxss", Iop_Max32F0x4 );
   13565          goto decode_success;
   13566       }
   13567       /* F2 0F 5F = MAXSD -- max 64F0x2 from R/M to R */
   13568       if (haveF2no66noF3(pfx) && sz == 4) {
   13569          delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "maxsd", Iop_Max64F0x2 );
   13570          goto decode_success;
   13571       }
   13572       /* 66 0F 5F = MAXPD -- max 64Fx2 from R/M to R */
   13573       if (have66noF2noF3(pfx) && sz == 2) {
   13574          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "maxpd", Iop_Max64Fx2 );
   13575          goto decode_success;
   13576       }
   13577       break;
   13578 
   /* SSE2 128-bit integer ops (66 prefix mandatory), all routed through
      dis_SSEint_E_to_G with the IROp to apply.  The trailing Bool is an
      operand-order flag (NOTE(review): presumably selects whether E is
      the left operand -- True for the non-commutative interleave/pack
      ops, False for the compares; confirm against dis_SSEint_E_to_G). */
   case 0x60:
      /* 66 0F 60 = PUNPCKLBW */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpcklbw",
                                    Iop_InterleaveLO8x16, True );
         goto decode_success;
      }
      break;

   case 0x61:
      /* 66 0F 61 = PUNPCKLWD */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpcklwd",
                                    Iop_InterleaveLO16x8, True );
         goto decode_success;
      }
      break;

   case 0x62:
      /* 66 0F 62 = PUNPCKLDQ */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpckldq",
                                    Iop_InterleaveLO32x4, True );
         goto decode_success;
      }
      break;

   case 0x63:
      /* 66 0F 63 = PACKSSWB */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "packsswb",
                                    Iop_QNarrowBin16Sto8Sx16, True );
         goto decode_success;
      }
      break;

   case 0x64:
      /* 66 0F 64 = PCMPGTB */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpgtb", Iop_CmpGT8Sx16, False );
         goto decode_success;
      }
      break;

   case 0x65:
      /* 66 0F 65 = PCMPGTW */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpgtw", Iop_CmpGT16Sx8, False );
         goto decode_success;
      }
      break;

   case 0x66:
      /* 66 0F 66 = PCMPGTD */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpgtd", Iop_CmpGT32Sx4, False );
         goto decode_success;
      }
      break;

   case 0x67:
      /* 66 0F 67 = PACKUSWB */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "packuswb",
                                    Iop_QNarrowBin16Sto8Ux16, True );
         goto decode_success;
      }
      break;

   case 0x68:
      /* 66 0F 68 = PUNPCKHBW */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpckhbw",
                                    Iop_InterleaveHI8x16, True );
         goto decode_success;
      }
      break;

   case 0x69:
      /* 66 0F 69 = PUNPCKHWD */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpckhwd",
                                    Iop_InterleaveHI16x8, True );
         goto decode_success;
      }
      break;

   case 0x6A:
      /* 66 0F 6A = PUNPCKHDQ */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpckhdq",
                                    Iop_InterleaveHI32x4, True );
         goto decode_success;
      }
      break;

   case 0x6B:
      /* 66 0F 6B = PACKSSDW */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "packssdw",
                                    Iop_QNarrowBin32Sto16Sx8, True );
         goto decode_success;
      }
      break;

   case 0x6C:
      /* 66 0F 6C = PUNPCKLQDQ */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpcklqdq",
                                    Iop_InterleaveLO64x2, True );
         goto decode_success;
      }
      break;

   case 0x6D:
      /* 66 0F 6D = PUNPCKHQDQ */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpckhqdq",
                                    Iop_InterleaveHI64x2, True );
         goto decode_success;
      }
      break;
   13715 
   case 0x6E:
      /* 66 0F 6E = MOVD from ireg32/m32 to xmm lo 1/4,
                    zeroing high 3/4 of xmm. */
      /*              or from ireg64/m64 to xmm lo 1/2,
                    zeroing high 1/2 of xmm. */
      if (have66noF2noF3(pfx)) {
         vassert(sz == 2 || sz == 8);
         /* sz==2 here just means "66 prefix, no REX.W"; the actual
            transfer width is 32 bits, so normalise to 4. */
         if (sz == 2) sz = 4;
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta += 1;
            if (sz == 4) {
               /* Iop_32UtoV128 zero-extends into the full vector,
                  giving the required zeroing of the upper lanes. */
               putXMMReg(
                  gregOfRexRM(pfx,modrm),
                  unop( Iop_32UtoV128, getIReg32(eregOfRexRM(pfx,modrm)) )
               );
               DIP("movd %s, %s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
            } else {
               putXMMReg(
                  gregOfRexRM(pfx,modrm),
                  unop( Iop_64UtoV128, getIReg64(eregOfRexRM(pfx,modrm)) )
               );
               DIP("movq %s, %s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
            }
         } else {
            /* Memory source: load 32 or 64 bits and zero-extend. */
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            putXMMReg(
               gregOfRexRM(pfx,modrm),
               sz == 4
                  ?  unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)) )
                  :  unop( Iop_64UtoV128,loadLE(Ity_I64, mkexpr(addr)) )
            );
            DIP("mov%c %s, %s\n", sz == 4 ? 'd' : 'q', dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
         }
         goto decode_success;
      }
      break;
   13757 
   case 0x6F:
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         /* 66 0F 6F = MOVDQA -- move from E (mem or xmm) to G (xmm). */
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            putXMMReg( gregOfRexRM(pfx,modrm),
                       getXMMReg( eregOfRexRM(pfx,modrm) ));
            DIP("movdqa %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            /* MOVDQA (aligned form) must #GP on a misaligned operand;
               model that as a SEGV check on 16-byte alignment. */
            gen_SEGV_if_not_16_aligned( addr );
            putXMMReg( gregOfRexRM(pfx,modrm),
                       loadLE(Ity_V128, mkexpr(addr)) );
            DIP("movdqa %s,%s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += alen;
         }
         goto decode_success;
      }
      if (haveF3no66noF2(pfx) && sz == 4) {
         /* F3 0F 6F = MOVDQU -- move from E (mem or xmm) to G (xmm).
            Unaligned form: no alignment check, unlike MOVDQA above. */
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            putXMMReg( gregOfRexRM(pfx,modrm),
                       getXMMReg( eregOfRexRM(pfx,modrm) ));
            DIP("movdqu %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            putXMMReg( gregOfRexRM(pfx,modrm),
                       loadLE(Ity_V128, mkexpr(addr)) );
            DIP("movdqu %s,%s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += alen;
         }
         goto decode_success;
      }
      break;
   13800 
   case 0x70:
      /* 66 0F 70 = PSHUFD -- rearrange 4x32 from E(xmm or mem) to G(xmm) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PSHUFD_32x4( vbi, pfx, delta, False/*!writesYmm*/);
         goto decode_success;
      }
      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
      /* 0F 70 = PSHUFW -- rearrange 4x16 from E(mmx or mem) to G(mmx) */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         Int order;
         IRTemp sV, dV, s3, s2, s1, s0;
         s3 = s2 = s1 = s0 = IRTemp_INVALID;
         sV = newTemp(Ity_I64);
         dV = newTemp(Ity_I64);
         do_MMX_preamble();
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
            /* imm8 shuffle-order byte follows the modrm byte. */
            order = (Int)getUChar(delta+1);
            delta += 1+1;
            DIP("pshufw $%d,%s,%s\n", order,
                                      nameMMXReg(eregLO3ofRM(modrm)),
                                      nameMMXReg(gregLO3ofRM(modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
                              1/*extra byte after amode*/ );
            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
            /* imm8 follows the address mode bytes. */
            order = (Int)getUChar(delta+alen);
            delta += 1+alen;
            DIP("pshufw $%d,%s,%s\n", order,
                                      dis_buf,
                                      nameMMXReg(gregLO3ofRM(modrm)));
         }
         breakup64to16s( sV, &s3, &s2, &s1, &s0 );
         /* SEL(n) picks source lane n; each 2-bit field of the imm8
            selects the source lane for one destination lane. */
#        define SEL(n) \
                   ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
         assign(dV,
                mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
                             SEL((order>>2)&3), SEL((order>>0)&3) )
         );
         putMMXReg(gregLO3ofRM(modrm), mkexpr(dV));
#        undef SEL
         goto decode_success;
      }
      /* F2 0F 70 = PSHUFLW -- rearrange lower half 4x16 from E(xmm or
         mem) to G(xmm), and copy upper half */
      if (haveF2no66noF3(pfx) && sz == 4) {
         delta = dis_PSHUFxW_128( vbi, pfx, delta,
                                  False/*!isAvx*/, False/*!xIsH*/ );
         goto decode_success;
      }
      /* F3 0F 70 = PSHUFHW -- rearrange upper half 4x16 from E(xmm or
         mem) to G(xmm), and copy lower half */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_PSHUFxW_128( vbi, pfx, delta,
                                  False/*!isAvx*/, True/*xIsH*/ );
         goto decode_success;
      }
      break;
   13860 
   /* SSE2 shift-by-immediate group: opcode 71/72/73 with the reg field
      of the modrm byte (/digit) selecting the operation.  Only the
      register forms are decoded; a memory operand falls through to
      break (decode failure). */
   case 0x71:
      /* 66 0F 71 /2 ib = PSRLW by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 2) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "psrlw", Iop_ShrN16x8 );
         goto decode_success;
      }
      /* 66 0F 71 /4 ib = PSRAW by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 4) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "psraw", Iop_SarN16x8 );
         goto decode_success;
      }
      /* 66 0F 71 /6 ib = PSLLW by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 6) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "psllw", Iop_ShlN16x8 );
         goto decode_success;
      }
      break;

   case 0x72:
      /* 66 0F 72 /2 ib = PSRLD by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 2) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "psrld", Iop_ShrN32x4 );
         goto decode_success;
      }
      /* 66 0F 72 /4 ib = PSRAD by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 4) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "psrad", Iop_SarN32x4 );
         goto decode_success;
      }
      /* 66 0F 72 /6 ib = PSLLD by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 6) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "pslld", Iop_ShlN32x4 );
         goto decode_success;
      }
      break;

   case 0x73:
      /* 66 0F 73 /3 ib = PSRLDQ by immediate */
      /* note, if mem case ever filled in, 1 byte after amode */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 3) {
         Int imm = (Int)getUChar(delta+1);
         Int reg = eregOfRexRM(pfx,getUChar(delta));
         DIP("psrldq $%d,%s\n", imm, nameXMMReg(reg));
         delta += 2;
         IRTemp sV = newTemp(Ity_V128);
         assign( sV, getXMMReg(reg) );
         putXMMReg(reg, mkexpr(math_PSRLDQ( sV, imm )));
         goto decode_success;
      }
      /* 66 0F 73 /7 ib = PSLLDQ by immediate */
      /* note, if mem case ever filled in, 1 byte after amode */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 7) {
         Int imm = (Int)getUChar(delta+1);
         Int reg = eregOfRexRM(pfx,getUChar(delta));
         DIP("pslldq $%d,%s\n", imm, nameXMMReg(reg));
         vassert(imm >= 0 && imm <= 255);
         delta += 2;
         IRTemp sV = newTemp(Ity_V128);
         assign( sV, getXMMReg(reg) );
         putXMMReg(reg, mkexpr(math_PSLLDQ( sV, imm )));
         goto decode_success;
      }
      /* 66 0F 73 /2 ib = PSRLQ by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 2) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "psrlq", Iop_ShrN64x2 );
         goto decode_success;
      }
      /* 66 0F 73 /6 ib = PSLLQ by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 6) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "psllq", Iop_ShlN64x2 );
         goto decode_success;
      }
      break;
   13954 
   /* SSE2 packed integer equality compares (lane-wise, result is all-1s
      or all-0s per lane); commutative, hence the False order flag. */
   case 0x74:
      /* 66 0F 74 = PCMPEQB */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpeqb", Iop_CmpEQ8x16, False );
         goto decode_success;
      }
      break;

   case 0x75:
      /* 66 0F 75 = PCMPEQW */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpeqw", Iop_CmpEQ16x8, False );
         goto decode_success;
      }
      break;

   case 0x76:
      /* 66 0F 76 = PCMPEQD */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpeqd", Iop_CmpEQ32x4, False );
         goto decode_success;
      }
      break;
   13981 
   13982    case 0x7E:
   13983       /* F3 0F 7E = MOVQ -- move 64 bits from E (mem or lo half xmm) to
   13984          G (lo half xmm).  Upper half of G is zeroed out. */
   13985       if (haveF3no66noF2(pfx)
   13986           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   13987          modrm = getUChar(delta);
   13988          if (epartIsReg(modrm)) {
   13989             putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
   13990                              getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
   13991                /* zero bits 127:64 */
   13992                putXMMRegLane64( gregOfRexRM(pfx,modrm), 1, mkU64(0) );
   13993             DIP("movsd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   13994                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   13995             delta += 1;
   13996          } else {
   13997             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13998             putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
   13999             putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
   14000                              loadLE(Ity_I64, mkexpr(addr)) );
   14001             DIP("movsd %s,%s\n", dis_buf,
   14002                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   14003             delta += alen;
   14004          }
   14005          goto decode_success;
   14006       }
   14007       /* 66 0F 7E = MOVD from xmm low 1/4 to ireg32 or m32. */
   14008       /*              or from xmm low 1/2 to ireg64 or m64. */
   14009          if (have66noF2noF3(pfx) && (sz == 2 || sz == 8)) {
   14010          if (sz == 2) sz = 4;
   14011          modrm = getUChar(delta);
   14012          if (epartIsReg(modrm)) {
   14013             delta += 1;
   14014             if (sz == 4) {
   14015                putIReg32( eregOfRexRM(pfx,modrm),
   14016                           getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) );
   14017                DIP("movd %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   14018                                     nameIReg32(eregOfRexRM(pfx,modrm)));
   14019             } else {
   14020                putIReg64( eregOfRexRM(pfx,modrm),
   14021                           getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) );
   14022                DIP("movq %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   14023                                     nameIReg64(eregOfRexRM(pfx,modrm)));
   14024             }
   14025          } else {
   14026             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   14027             delta += alen;
   14028             storeLE( mkexpr(addr),
   14029                      sz == 4
   14030                         ? getXMMRegLane32(gregOfRexRM(pfx,modrm),0)
   14031                         : getXMMRegLane64(gregOfRexRM(pfx,modrm),0) );
   14032             DIP("mov%c %s, %s\n", sz == 4 ? 'd' : 'q',
   14033                                   nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
   14034          }
   14035          goto decode_success;
   14036       }
   14037       break;
   14038 
   case 0x7F:
      /* F3 0F 7F = MOVDQU -- move from G (xmm) to E (mem or xmm). */
      if (haveF3no66noF2(pfx) && sz == 4) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            /* Deliberate dead code below: the reg->reg form is left
               undecoded until a real-world test case exists. */
            goto decode_failure; /* awaiting test case */
            delta += 1;
            putXMMReg( eregOfRexRM(pfx,modrm),
                       getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movdqu %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                   nameXMMReg(eregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movdqu %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
         }
         goto decode_success;
      }
      /* 66 0F 7F = MOVDQA -- move from G (xmm) to E (mem or xmm). */
      if (have66noF2noF3(pfx) && sz == 2) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta += 1;
            putXMMReg( eregOfRexRM(pfx,modrm),
                       getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movdqa %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                   nameXMMReg(eregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            /* Aligned store form: fault if the address is not
               16-byte aligned. */
            gen_SEGV_if_not_16_aligned( addr );
            delta += alen;
            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movdqa %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
         }
         goto decode_success;
      }
      break;
   14077 
   /* 0F AE group: fences, MXCSR access, and extended state save/restore,
      all distinguished by the modrm reg field (/digit) and by whether
      the operand is a register (fences) or memory (everything else). */
   case 0xAE:
      /* 0F AE /7 = SFENCE -- flush pending operations to memory */
      if (haveNo66noF2noF3(pfx)
          && epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 7
          && sz == 4) {
         delta += 1;
         /* Insert a memory fence.  It's sometimes important that these
            are carried through to the generated code. */
         stmt( IRStmt_MBE(Imbe_Fence) );
         DIP("sfence\n");
         goto decode_success;
      }
      /* mindless duplication follows .. */
      /* 0F AE /5 = LFENCE -- flush pending operations to memory */
      /* 0F AE /6 = MFENCE -- flush pending operations to memory */
      if (haveNo66noF2noF3(pfx)
          && epartIsReg(getUChar(delta))
          && (gregLO3ofRM(getUChar(delta)) == 5
              || gregLO3ofRM(getUChar(delta)) == 6)
          && sz == 4) {
         delta += 1;
         /* Insert a memory fence.  It's sometimes important that these
            are carried through to the generated code. */
         stmt( IRStmt_MBE(Imbe_Fence) );
         /* delta has already moved past the modrm byte, so re-read it
            at delta-1 to pick the mnemonic. */
         DIP("%sfence\n", gregLO3ofRM(getUChar(delta-1))==5 ? "l" : "m");
         goto decode_success;
      }

      /* 0F AE /7 = CLFLUSH -- flush cache line */
      /* (also /7, but the memory form -- SFENCE above is the reg form) */
      if (haveNo66noF2noF3(pfx)
          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 7
          && sz == 4) {

         /* This is something of a hack.  We need to know the size of
            the cache line containing addr.  Since we don't (easily),
            assume 256 on the basis that no real cache would have a
            line that big.  It's safe to invalidate more stuff than we
            need, just inefficient. */
         ULong lineszB = 256ULL;

         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;

         /* Round addr down to the start of the containing block. */
         stmt( IRStmt_Put(
                  OFFB_CMSTART,
                  binop( Iop_And64,
                         mkexpr(addr),
                         mkU64( ~(lineszB-1) ))) );

         stmt( IRStmt_Put(OFFB_CMLEN, mkU64(lineszB) ) );

         /* End the block with an invalidate-icache jump so cached
            translations covering the flushed range are discarded. */
         jmp_lit(dres, Ijk_InvalICache, (Addr64)(guest_RIP_bbstart+delta));

         DIP("clflush %s\n", dis_buf);
         goto decode_success;
      }

      /* 0F AE /3 = STMXCSR m32 -- store %mxcsr */
      if (haveNo66noF2noF3(pfx)
          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 3
          && sz == 4) {
         delta = dis_STMXCSR(vbi, pfx, delta, False/*!isAvx*/);
         goto decode_success;
      }
      /* 0F AE /2 = LDMXCSR m32 -- load %mxcsr */
      if (haveNo66noF2noF3(pfx)
          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 2
          && sz == 4) {
         delta = dis_LDMXCSR(vbi, pfx, delta, False/*!isAvx*/);
         goto decode_success;
      }
      /* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory */
      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
          && !epartIsReg(getUChar(delta))
          && gregOfRexRM(pfx,getUChar(delta)) == 0) {
         delta = dis_FXSAVE(vbi, pfx, delta, sz);
         goto decode_success;
      }
      /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory */
      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
          && !epartIsReg(getUChar(delta))
          && gregOfRexRM(pfx,getUChar(delta)) == 1) {
         delta = dis_FXRSTOR(vbi, pfx, delta, sz);
         goto decode_success;
      }
      /* 0F AE /4 = XSAVE mem -- write x87, SSE, AVX state to memory */
      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
          && !epartIsReg(getUChar(delta))
          && gregOfRexRM(pfx,getUChar(delta)) == 4
          && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
         delta = dis_XSAVE(vbi, pfx, delta, sz);
         goto decode_success;
      }
      /* 0F AE /5 = XRSTOR mem -- read x87, SSE, AVX state from memory */
      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
          && !epartIsReg(getUChar(delta))
          && gregOfRexRM(pfx,getUChar(delta)) == 5
          && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
         delta = dis_XRSTOR(vbi, pfx, delta, sz);
         goto decode_success;
      }
      break;
   14181 
   case 0xC2:
      /* CMPxx take a trailing imm8 condition code; dis_SSE_cmp_E_to_G
         may refuse an unsupported condition, in which case delta is
         unchanged -- hence the delta > delta0 success test. */
      /* 0F C2 = CMPPS -- 32Fx4 comparison from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         Long delta0 = delta;
         delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpps", True, 4 );
         if (delta > delta0) goto decode_success;
      }
      /* F3 0F C2 = CMPSS -- 32F0x4 comparison from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         Long delta0 = delta;
         delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpss", False, 4 );
         if (delta > delta0) goto decode_success;
      }
      /* F2 0F C2 = CMPSD -- 64F0x2 comparison from R/M to R */
      if (haveF2no66noF3(pfx) && sz == 4) {
         Long delta0 = delta;
         delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpsd", False, 8 );
         if (delta > delta0) goto decode_success;
      }
      /* 66 0F C2 = CMPPD -- 64Fx2 comparison from R/M to R */
      if (have66noF2noF3(pfx) && sz == 2) {
         Long delta0 = delta;
         delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmppd", True, 8 );
         if (delta > delta0) goto decode_success;
      }
      break;

   case 0xC3:
      /* 0F C3 = MOVNTI -- for us, just a plain ireg store.  The
         non-temporal hint is not modelled.  Only the memory form
         exists; the reg form falls through to decode failure. */
      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)) {
         modrm = getUChar(delta);
         if (!epartIsReg(modrm)) {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            storeLE( mkexpr(addr), getIRegG(sz, pfx, modrm) );
            DIP("movnti %s,%s\n", dis_buf,
                                  nameIRegG(sz, pfx, modrm));
            delta += alen;
            goto decode_success;
         }
         /* else fall through */
      }
      break;
   14224 
   case 0xC4:
      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
      /* 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
         put it into the specified lane of mmx(G). */
      if (haveNo66noF2noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         /* Use t0 .. t3 to hold the 4 original 16-bit lanes of the
            mmx reg.  t4 is the new lane value.  t5 is the original
            mmx value. t6 is the new mmx value. */
         Int lane;
         t4 = newTemp(Ity_I16);
         t5 = newTemp(Ity_I64);
         t6 = newTemp(Ity_I64);
         modrm = getUChar(delta);
         do_MMX_preamble();

         assign(t5, getMMXReg(gregLO3ofRM(modrm)));
         breakup64to16s( t5, &t3, &t2, &t1, &t0 );

         if (epartIsReg(modrm)) {
            assign(t4, getIReg16(eregOfRexRM(pfx,modrm)));
            /* skip modrm byte plus the trailing imm8; the byte just
               consumed (delta-1) is the lane number */
            delta += 1+1;
            lane = getUChar(delta-1);
            DIP("pinsrw $%d,%s,%s\n", lane,
                                      nameIReg16(eregOfRexRM(pfx,modrm)),
                                      nameMMXReg(gregLO3ofRM(modrm)));
         } else {
            /* final '1' tells disAMode there is one imm byte after the
               amode, so it is not mistaken for a displacement */
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += 1+alen;
            lane = getUChar(delta-1);
            assign(t4, loadLE(Ity_I16, mkexpr(addr)));
            DIP("pinsrw $%d,%s,%s\n", lane,
                                      dis_buf,
                                      nameMMXReg(gregLO3ofRM(modrm)));
         }

         /* Only the low 2 bits of the imm8 matter: the MMX register
            has 4 x 16-bit lanes.  Rebuild the 64-bit value with t4
            substituted into the selected lane. */
         switch (lane & 3) {
            case 0:  assign(t6, mk64from16s(t3,t2,t1,t4)); break;
            case 1:  assign(t6, mk64from16s(t3,t2,t4,t0)); break;
            case 2:  assign(t6, mk64from16s(t3,t4,t1,t0)); break;
            case 3:  assign(t6, mk64from16s(t4,t2,t1,t0)); break;
            default: vassert(0);
         }
         putMMXReg(gregLO3ofRM(modrm), mkexpr(t6));
         goto decode_success;
      }
      /* 66 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
         put it into the specified lane of xmm(G). */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         Int lane;
         t4 = newTemp(Ity_I16);
         modrm = getUChar(delta);
         UInt rG = gregOfRexRM(pfx,modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign(t4, getIReg16(rE));
            delta += 1+1;
            lane = getUChar(delta-1);
            DIP("pinsrw $%d,%s,%s\n",
                lane, nameIReg16(rE), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
                              1/*byte after the amode*/ );
            delta += 1+alen;
            lane = getUChar(delta-1);
            assign(t4, loadLE(Ity_I16, mkexpr(addr)));
            DIP("pinsrw $%d,%s,%s\n",
                lane, dis_buf, nameXMMReg(rG));
         }
         /* XMM form: 8 x 16-bit lanes, so 3 bits of the imm8 select
            the lane (lane & 7). */
         IRTemp src_vec = newTemp(Ity_V128);
         assign(src_vec, getXMMReg(rG));
         IRTemp res_vec = math_PINSRW_128( src_vec, t4, lane & 7);
         putXMMReg(rG, mkexpr(res_vec));
         goto decode_success;
      }
      break;
   14302 
   case 0xC5:
      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
      /* 0F C5 = PEXTRW -- extract 16-bit field from mmx(E) and put
         zero-extend of it in ireg(G). */
      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            IRTemp sV = newTemp(Ity_I64);
            t5 = newTemp(Ity_I16);
            do_MMX_preamble();
            assign(sV, getMMXReg(eregLO3ofRM(modrm)));
            breakup64to16s( sV, &t3, &t2, &t1, &t0 );
            /* imm8 follows the modrm byte (delta+1); low 2 bits pick
               one of the 4 x 16-bit MMX lanes. */
            switch (getUChar(delta+1) & 3) {
               case 0:  assign(t5, mkexpr(t0)); break;
               case 1:  assign(t5, mkexpr(t1)); break;
               case 2:  assign(t5, mkexpr(t2)); break;
               case 3:  assign(t5, mkexpr(t3)); break;
               default: vassert(0);
            }
            /* Zero-extend the extracted lane into the 64- or 32-bit
               destination integer register, per operand size. */
            if (sz == 8)
               putIReg64(gregOfRexRM(pfx,modrm), unop(Iop_16Uto64, mkexpr(t5)));
            else
               putIReg32(gregOfRexRM(pfx,modrm), unop(Iop_16Uto32, mkexpr(t5)));
            DIP("pextrw $%d,%s,%s\n",
                (Int)getUChar(delta+1),
                nameMMXReg(eregLO3ofRM(modrm)),
                sz==8 ? nameIReg64(gregOfRexRM(pfx,modrm))
                      : nameIReg32(gregOfRexRM(pfx,modrm))
            );
            delta += 2;
            goto decode_success;
         }
         /* else fall through */
         /* note, for anyone filling in the mem case: this insn has one
            byte after the amode and therefore you must pass 1 as the
            last arg to disAMode */
      }
      /* 66 0F C5 = PEXTRW -- extract 16-bit field from xmm(E) and put
         zero-extend of it in ireg(G). */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         Long delta0 = delta;
         /* Helper only decodes the register-E form (name says
            "EregOnly"); failure leaves delta unchanged. */
         delta = dis_PEXTRW_128_EregOnly_toG( vbi, pfx, delta,
                                              False/*!isAvx*/ );
         if (delta > delta0) goto decode_success;
         /* else fall through -- decoding has failed */
      }
      break;
   14351 
   case 0xC6:
      /* 0F C6 /r ib = SHUFPS -- shuffle packed F32s */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         Int    imm8 = 0;
         IRTemp sV   = newTemp(Ity_V128);
         IRTemp dV   = newTemp(Ity_V128);
         modrm = getUChar(delta);
         UInt rG = gregOfRexRM(pfx,modrm);
         assign( dV, getXMMReg(rG) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( sV, getXMMReg(rE) );
            /* imm8 (shuffle control) follows the modrm byte */
            imm8 = (Int)getUChar(delta+1);
            delta += 1+1;
            DIP("shufps $%d,%s,%s\n", imm8, nameXMMReg(rE), nameXMMReg(rG));
         } else {
            /* final '1': one imm byte follows the amode */
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            imm8 = (Int)getUChar(delta+alen);
            delta += 1+alen;
            DIP("shufps $%d,%s,%s\n", imm8, dis_buf, nameXMMReg(rG));
         }
         IRTemp res = math_SHUFPS_128( sV, dV, imm8 );
         putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
         goto decode_success;
      }
      /* 66 0F C6 /r ib = SHUFPD -- shuffle packed F64s */
      if (have66noF2noF3(pfx) && sz == 2) {
         Int    select;
         IRTemp sV = newTemp(Ity_V128);
         IRTemp dV = newTemp(Ity_V128);

         modrm = getUChar(delta);
         assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

         if (epartIsReg(modrm)) {
            assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
            select = (Int)getUChar(delta+1);
            delta += 1+1;
            DIP("shufpd $%d,%s,%s\n", select,
                                      nameXMMReg(eregOfRexRM(pfx,modrm)),
                                      nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            select = getUChar(delta+alen);
            delta += 1+alen;
            DIP("shufpd $%d,%s,%s\n", select,
                                      dis_buf,
                                      nameXMMReg(gregOfRexRM(pfx,modrm)));
         }

         IRTemp res = math_SHUFPD_128( sV, dV, select );
         putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
         goto decode_success;
      }
      break;
   14409 
   case 0xD1:
      /* 66 0F D1 = PSRLW by E */
      /* Packed logical right shift; shift count taken from E (reg or
         mem), applied to all lanes of xmm(G) by the helper. */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrlw", Iop_ShrN16x8 );
         goto decode_success;
      }
      break;

   case 0xD2:
      /* 66 0F D2 = PSRLD by E */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrld", Iop_ShrN32x4 );
         goto decode_success;
      }
      break;

   case 0xD3:
      /* 66 0F D3 = PSRLQ by E */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrlq", Iop_ShrN64x2 );
         goto decode_success;
      }
      break;

   case 0xD4:
      /* 66 0F D4 = PADDQ */
      /* 66 prefix selects the SSE2 XMM form; no prefix (below) is the
         MMX form. */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "paddq", Iop_Add64x2, False );
         goto decode_success;
      }
      /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
      /* 0F D4 = PADDQ -- add 64x1 */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         do_MMX_preamble();
         delta = dis_MMXop_regmem_to_reg (
                   vbi, pfx, delta, opc, "paddq", False );
         goto decode_success;
      }
      break;

   case 0xD5:
      /* 66 0F D5 = PMULLW -- 16x8 multiply */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pmullw", Iop_Mul16x8, False );
         goto decode_success;
      }
      break;
   14459 
   case 0xD6:
      /* F3 0F D6 = MOVQ2DQ -- move from E (mmx) to G (lo half xmm, zero
         hi half). */
      if (haveF3no66noF2(pfx) && sz == 4) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            do_MMX_preamble();
            /* Iop_64UtoV128 places the 64-bit MMX value in the low
               half and zeroes the high half. */
            putXMMReg( gregOfRexRM(pfx,modrm),
                       unop(Iop_64UtoV128, getMMXReg( eregLO3ofRM(modrm) )) );
            DIP("movq2dq %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += 1;
            goto decode_success;
         }
         /* apparently no mem case for this insn */
      }
      /* 66 0F D6 = MOVQ -- move 64 bits from G (lo half xmm) to E (mem
         or lo half xmm).  */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            /* fall through, awaiting test case */
            /* dst: lo half copied, hi half zeroed */
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            storeLE( mkexpr(addr),
                     getXMMRegLane64( gregOfRexRM(pfx,modrm), 0 ));
            DIP("movq %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf );
            delta += alen;
            goto decode_success;
         }
      }
      /* F2 0F D6 = MOVDQ2Q -- move from E (lo half xmm, not mem) to G (mmx). */
      if (haveF2no66noF3(pfx) && sz == 4) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            do_MMX_preamble();
            putMMXReg( gregLO3ofRM(modrm),
                       getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
            DIP("movdq2q %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                   nameMMXReg(gregLO3ofRM(modrm)));
            delta += 1;
            goto decode_success;
         }
         /* apparently no mem case for this insn */
      }
      break;
   14508 
   case 0xD7:
      /* 66 0F D7 = PMOVMSKB -- extract sign bits from each of 16
         lanes in xmm(E), turn them into a byte, and put
         zero-extend of it in ireg(G).  Doing this directly is just
         too cumbersome; give up therefore and call a helper. */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
          && epartIsReg(getUChar(delta))) { /* no memory case, it seems */
         delta = dis_PMOVMSKB_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
      /* 0F D7 = PMOVMSKB -- extract sign bits from each of 8 lanes in
         mmx(E), turn them into a byte, and put zero-extend of it in
         ireg(G). */
      if (haveNo66noF2noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            do_MMX_preamble();
            t0 = newTemp(Ity_I64);
            t1 = newTemp(Ity_I32);
            assign(t0, getMMXReg(eregLO3ofRM(modrm)));
            /* GetMSBs8x8 gathers the 8 lane sign bits into one byte,
               which is then zero-extended into the 32-bit G reg. */
            assign(t1, unop(Iop_8Uto32, unop(Iop_GetMSBs8x8, mkexpr(t0))));
            putIReg32(gregOfRexRM(pfx,modrm), mkexpr(t1));
            DIP("pmovmskb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                    nameIReg32(gregOfRexRM(pfx,modrm)));
            delta += 1;
            goto decode_success;
         }
         /* else fall through */
      }
      break;
   14542 
   case 0xD8:
      /* 66 0F D8 = PSUBUSB */
      /* Cases 0xD8..0xDF: straightforward SSE2 packed integer ops,
         all delegated to dis_SSEint_E_to_G / dis_SSE_E_to_G_all with
         the matching IR op.  The no-prefix variants (where present)
         are the MMX forms of the same instruction. */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "psubusb", Iop_QSub8Ux16, False );
         goto decode_success;
      }
      break;

   case 0xD9:
      /* 66 0F D9 = PSUBUSW */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "psubusw", Iop_QSub16Ux8, False );
         goto decode_success;
      }
      break;

   case 0xDA:
      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
      /* 0F DA = PMINUB -- 8x8 unsigned min */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         do_MMX_preamble();
         delta = dis_MMXop_regmem_to_reg (
                    vbi, pfx, delta, opc, "pminub", False );
         goto decode_success;
      }
      /* 66 0F DA = PMINUB -- 8x16 unsigned min */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pminub", Iop_Min8Ux16, False );
         goto decode_success;
      }
      break;

   case 0xDB:
      /* 66 0F DB = PAND */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "pand", Iop_AndV128 );
         goto decode_success;
      }
      break;

   case 0xDC:
      /* 66 0F DC = PADDUSB */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "paddusb", Iop_QAdd8Ux16, False );
         goto decode_success;
      }
      break;

   case 0xDD:
      /* 66 0F DD = PADDUSW */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "paddusw", Iop_QAdd16Ux8, False );
         goto decode_success;
      }
      break;

   case 0xDE:
      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
      /* 0F DE = PMAXUB -- 8x8 unsigned max */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         do_MMX_preamble();
         delta = dis_MMXop_regmem_to_reg (
                    vbi, pfx, delta, opc, "pmaxub", False );
         goto decode_success;
      }
      /* 66 0F DE = PMAXUB -- 8x16 unsigned max */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pmaxub", Iop_Max8Ux16, False );
         goto decode_success;
      }
      break;

   case 0xDF:
      /* 66 0F DF = PANDN */
      /* PANDN = (~G) & E; the _invG helper variant inverts G before
         the AND. */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "pandn", Iop_AndV128 );
         goto decode_success;
      }
      break;
   14628 
   case 0xE0:
      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
      /* 0F E0 = PAVGB -- 8x8 unsigned Packed Average, with rounding */
      /* Cases 0xE0..0xE5: packed average, arithmetic shift and
         high-half multiply ops; MMX (no prefix) and SSE2 (66 prefix)
         forms dispatch to the respective helpers. */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         do_MMX_preamble();
         delta = dis_MMXop_regmem_to_reg (
                    vbi, pfx, delta, opc, "pavgb", False );
         goto decode_success;
      }
      /* 66 0F E0 = PAVGB */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pavgb", Iop_Avg8Ux16, False );
         goto decode_success;
      }
      break;

   case 0xE1:
      /* 66 0F E1 = PSRAW by E */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psraw", Iop_SarN16x8 );
         goto decode_success;
      }
      break;

   case 0xE2:
      /* 66 0F E2 = PSRAD by E */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrad", Iop_SarN32x4 );
         goto decode_success;
      }
      break;

   case 0xE3:
      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
      /* 0F E3 = PAVGW -- 16x4 unsigned Packed Average, with rounding */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         do_MMX_preamble();
         delta = dis_MMXop_regmem_to_reg (
                    vbi, pfx, delta, opc, "pavgw", False );
         goto decode_success;
      }
      /* 66 0F E3 = PAVGW */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pavgw", Iop_Avg16Ux8, False );
         goto decode_success;
      }
      break;

   case 0xE4:
      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
      /* 0F E4 = PMULUH -- 16x4 hi-half of unsigned widening multiply */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         do_MMX_preamble();
         delta = dis_MMXop_regmem_to_reg (
                    vbi, pfx, delta, opc, "pmuluh", False );
         goto decode_success;
      }
      /* 66 0F E4 = PMULHUW -- 16x8 hi-half of unsigned widening multiply */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pmulhuw", Iop_MulHi16Ux8, False );
         goto decode_success;
      }
      break;

   case 0xE5:
      /* 66 0F E5 = PMULHW -- 16x8 hi-half of signed widening multiply */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pmulhw", Iop_MulHi16Sx8, False );
         goto decode_success;
      }
      break;
   14704 
   case 0xE6:
      /* 66 0F E6 = CVTTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
         lo half xmm(G), and zero upper half, rounding towards zero */
      /* F2 0F E6 = CVTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
         lo half xmm(G), according to prevailing rounding mode, and zero
         upper half */
      if ( (haveF2no66noF3(pfx) && sz == 4)
           || (have66noF2noF3(pfx) && sz == 2) ) {
         /* Both variants share one helper; the 66-prefixed (sz == 2)
            form is the truncating (round-to-zero) one. */
         delta = dis_CVTxPD2DQ_128( vbi, pfx, delta, False/*!isAvx*/,
                                    toBool(sz == 2)/*r2zero*/);
         goto decode_success;
      }
      /* F3 0F E6 = CVTDQ2PD -- convert 2 x I32 in mem/lo half xmm to 2 x
         F64 in xmm(G) */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_CVTDQ2PD_128(vbi, pfx, delta, False/*!isAvx*/);
         goto decode_success;
      }
      break;
   14724 
   case 0xE7:
      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
      /* 0F E7 = MOVNTQ -- for us, just a plain MMX store.  Note, the
         Intel manual does not say anything about the usual business of
         the FP reg tags getting trashed whenever an MMX insn happens.
         So we just leave them alone.
      */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         modrm = getUChar(delta);
         /* Only the memory form is valid for non-temporal stores. */
         if (!epartIsReg(modrm)) {
            /* do_MMX_preamble(); Intel docs don't specify this */
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            storeLE( mkexpr(addr), getMMXReg(gregLO3ofRM(modrm)) );
            DIP("movntq %s,%s\n", dis_buf,
                                  nameMMXReg(gregLO3ofRM(modrm)));
            delta += alen;
            goto decode_success;
         }
         /* else fall through */
      }
      /* 66 0F E7 = MOVNTDQ -- for us, just a plain SSE store. */
      if (have66noF2noF3(pfx) && sz == 2) {
         modrm = getUChar(delta);
         if (!epartIsReg(modrm)) {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            /* MOVNTDQ requires a 16-aligned destination; generate a
               SIGSEGV check to match hardware behaviour. */
            gen_SEGV_if_not_16_aligned( addr );
            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movntdq %s,%s\n", dis_buf,
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += alen;
            goto decode_success;
         }
         /* else fall through */
      }
      break;
   14760 
   case 0xE8:
      /* 66 0F E8 = PSUBSB */
      /* Cases 0xE8..0xEF: signed saturating add/sub, signed min/max,
         and bitwise OR/XOR -- all delegated to the generic SSE integer
         helpers with the matching IR op. */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "psubsb", Iop_QSub8Sx16, False );
         goto decode_success;
      }
      break;

   case 0xE9:
      /* 66 0F E9 = PSUBSW */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "psubsw", Iop_QSub16Sx8, False );
         goto decode_success;
      }
      break;

   case 0xEA:
      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
      /* 0F EA = PMINSW -- 16x4 signed min */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         do_MMX_preamble();
         delta = dis_MMXop_regmem_to_reg (
                    vbi, pfx, delta, opc, "pminsw", False );
         goto decode_success;
      }
      /* 66 0F EA = PMINSW -- 16x8 signed min */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pminsw", Iop_Min16Sx8, False );
         goto decode_success;
      }
      break;

   case 0xEB:
      /* 66 0F EB = POR */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "por", Iop_OrV128 );
         goto decode_success;
      }
      break;

   case 0xEC:
      /* 66 0F EC = PADDSB */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "paddsb", Iop_QAdd8Sx16, False );
         goto decode_success;
      }
      break;

   case 0xED:
      /* 66 0F ED = PADDSW */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "paddsw", Iop_QAdd16Sx8, False );
         goto decode_success;
      }
      break;

   case 0xEE:
      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
      /* 0F EE = PMAXSW -- 16x4 signed max */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         do_MMX_preamble();
         delta = dis_MMXop_regmem_to_reg (
                    vbi, pfx, delta, opc, "pmaxsw", False );
         goto decode_success;
      }
      /* 66 0F EE = PMAXSW -- 16x8 signed max */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pmaxsw", Iop_Max16Sx8, False );
         goto decode_success;
      }
      break;

   case 0xEF:
      /* 66 0F EF = PXOR */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "pxor", Iop_XorV128 );
         goto decode_success;
      }
      break;
   14846 
   case 0xF1:
      /* 66 0F F1 = PSLLW by E */
      /* Cases 0xF1..0xF3: packed logical left shifts, count in E. */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psllw", Iop_ShlN16x8 );
         goto decode_success;
      }
      break;

   case 0xF2:
      /* 66 0F F2 = PSLLD by E */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "pslld", Iop_ShlN32x4 );
         goto decode_success;
      }
      break;

   case 0xF3:
      /* 66 0F F3 = PSLLQ by E */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psllq", Iop_ShlN64x2 );
         goto decode_success;
      }
      break;
   14870 
   case 0xF4:
      /* 66 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
         0 to form lower 64-bit half and lanes 2 x 2 to form upper 64-bit
         half */
      if (have66noF2noF3(pfx) && sz == 2) {
         IRTemp sV = newTemp(Ity_V128);
         IRTemp dV = newTemp(Ity_V128);
         modrm = getUChar(delta);
         UInt rG = gregOfRexRM(pfx,modrm);
         assign( dV, getXMMReg(rG) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( sV, getXMMReg(rE) );
            delta += 1;
            DIP("pmuludq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            delta += alen;
            DIP("pmuludq %s,%s\n", dis_buf, nameXMMReg(rG));
         }
         putXMMReg( rG, mkexpr(math_PMULUDQ_128( sV, dV )) );
         goto decode_success;
      }
      /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
      /* 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
         0 to form 64-bit result */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         IRTemp sV = newTemp(Ity_I64);
         IRTemp dV = newTemp(Ity_I64);
         t1 = newTemp(Ity_I32);
         t0 = newTemp(Ity_I32);
         modrm = getUChar(delta);

         do_MMX_preamble();
         assign( dV, getMMXReg(gregLO3ofRM(modrm)) );

         if (epartIsReg(modrm)) {
            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
            delta += 1;
            DIP("pmuludq %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                   nameMMXReg(gregLO3ofRM(modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
            delta += alen;
            DIP("pmuludq %s,%s\n", dis_buf,
                                   nameMMXReg(gregLO3ofRM(modrm)));
         }

         /* Take the low 32 bits of each 64-bit operand and do an
            unsigned widening 32x32->64 multiply. */
         assign( t0, unop(Iop_64to32, mkexpr(dV)) );
         assign( t1, unop(Iop_64to32, mkexpr(sV)) );
         putMMXReg( gregLO3ofRM(modrm),
                    binop( Iop_MullU32, mkexpr(t0), mkexpr(t1) ) );
         goto decode_success;
      }
      break;
   14928 
   14929    case 0xF5:
   14930       /* 66 0F F5 = PMADDWD -- Multiply and add packed integers from
   14931          E(xmm or mem) to G(xmm) */
   14932       if (have66noF2noF3(pfx) && sz == 2) {
   14933          IRTemp sV = newTemp(Ity_V128);
   14934          IRTemp dV = newTemp(Ity_V128);
   14935          modrm     = getUChar(delta);
   14936          UInt   rG = gregOfRexRM(pfx,modrm);
   14937          if (epartIsReg(modrm)) {
   14938             UInt rE = eregOfRexRM(pfx,modrm);
   14939             assign( sV, getXMMReg(rE) );
   14940             delta += 1;
   14941             DIP("pmaddwd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   14942          } else {
   14943             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14944             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   14945             delta += alen;
   14946             DIP("pmaddwd %s,%s\n", dis_buf, nameXMMReg(rG));
   14947          }
   14948          assign( dV, getXMMReg(rG) );
   14949          putXMMReg( rG, mkexpr(math_PMADDWD_128(dV, sV)) );
   14950          goto decode_success;
   14951       }
   14952       break;
   14953 
   14954    case 0xF6:
   14955       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14956       /* 0F F6 = PSADBW -- sum of 8Ux8 absolute differences */
   14957       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14958          do_MMX_preamble();
   14959          delta = dis_MMXop_regmem_to_reg (
   14960                     vbi, pfx, delta, opc, "psadbw", False );
   14961          goto decode_success;
   14962       }
   14963       /* 66 0F F6 = PSADBW -- 2 x (8x8 -> 48 zeroes ++ u16) Sum Abs Diffs
   14964          from E(xmm or mem) to G(xmm) */
   14965       if (have66noF2noF3(pfx) && sz == 2) {
   14966          IRTemp sV  = newTemp(Ity_V128);
   14967          IRTemp dV  = newTemp(Ity_V128);
   14968          modrm = getUChar(delta);
   14969          UInt   rG   = gregOfRexRM(pfx,modrm);
   14970          if (epartIsReg(modrm)) {
   14971             UInt rE = eregOfRexRM(pfx,modrm);
   14972             assign( sV, getXMMReg(rE) );
   14973             delta += 1;
   14974             DIP("psadbw %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   14975          } else {
   14976             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14977             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   14978             delta += alen;
   14979             DIP("psadbw %s,%s\n", dis_buf, nameXMMReg(rG));
   14980          }
   14981          assign( dV, getXMMReg(rG) );
   14982          putXMMReg( rG, mkexpr( math_PSADBW_128 ( dV, sV ) ) );
   14983 
   14984          goto decode_success;
   14985       }
   14986       break;
   14987 
   14988    case 0xF7:
   14989       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14990       /* 0F F7 = MASKMOVQ -- 8x8 masked store */
   14991       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14992          Bool ok = False;
   14993          delta = dis_MMX( &ok, vbi, pfx, sz, delta-1 );
   14994          if (ok) goto decode_success;
   14995       }
   14996       /* 66 0F F7 = MASKMOVDQU -- store selected bytes of double quadword */
   14997       if (have66noF2noF3(pfx) && sz == 2 && epartIsReg(getUChar(delta))) {
   14998          delta = dis_MASKMOVDQU( vbi, pfx, delta, False/*!isAvx*/ );
   14999          goto decode_success;
   15000       }
   15001       break;
   15002 
   15003    case 0xF8:
   15004       /* 66 0F F8 = PSUBB */
   15005       if (have66noF2noF3(pfx) && sz == 2) {
   15006          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   15007                                     "psubb", Iop_Sub8x16, False );
   15008          goto decode_success;
   15009       }
   15010       break;
   15011 
   15012    case 0xF9:
   15013       /* 66 0F F9 = PSUBW */
   15014       if (have66noF2noF3(pfx) && sz == 2) {
   15015          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   15016                                     "psubw", Iop_Sub16x8, False );
   15017          goto decode_success;
   15018       }
   15019       break;
   15020 
   15021    case 0xFA:
   15022       /* 66 0F FA = PSUBD */
   15023       if (have66noF2noF3(pfx) && sz == 2) {
   15024          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   15025                                     "psubd", Iop_Sub32x4, False );
   15026          goto decode_success;
   15027       }
   15028       break;
   15029 
   15030    case 0xFB:
   15031       /* 66 0F FB = PSUBQ */
   15032       if (have66noF2noF3(pfx) && sz == 2) {
   15033          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   15034                                     "psubq", Iop_Sub64x2, False );
   15035          goto decode_success;
   15036       }
   15037       /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   15038       /* 0F FB = PSUBQ -- sub 64x1 */
   15039       if (haveNo66noF2noF3(pfx) && sz == 4) {
   15040          do_MMX_preamble();
   15041          delta = dis_MMXop_regmem_to_reg (
   15042                    vbi, pfx, delta, opc, "psubq", False );
   15043          goto decode_success;
   15044       }
   15045       break;
   15046 
   15047    case 0xFC:
   15048       /* 66 0F FC = PADDB */
   15049       if (have66noF2noF3(pfx) && sz == 2) {
   15050          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   15051                                     "paddb", Iop_Add8x16, False );
   15052          goto decode_success;
   15053       }
   15054       break;
   15055 
   15056    case 0xFD:
   15057       /* 66 0F FD = PADDW */
   15058       if (have66noF2noF3(pfx) && sz == 2) {
   15059          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   15060                                     "paddw", Iop_Add16x8, False );
   15061          goto decode_success;
   15062       }
   15063       break;
   15064 
   15065    case 0xFE:
   15066       /* 66 0F FE = PADDD */
   15067       if (have66noF2noF3(pfx) && sz == 2) {
   15068          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   15069                                     "paddd", Iop_Add32x4, False );
   15070          goto decode_success;
   15071       }
   15072       break;
   15073 
   15074    default:
   15075       goto decode_failure;
   15076 
   15077    }
   15078 
   15079   decode_failure:
   15080    *decode_OK = False;
   15081    return deltaIN;
   15082 
   15083   decode_success:
   15084    *decode_OK = True;
   15085    return delta;
   15086 }
   15087 
   15088 
   15089 /*------------------------------------------------------------*/
   15090 /*---                                                      ---*/
   15091 /*--- Top-level SSE3 (not SupSSE3): dis_ESC_0F__SSE3       ---*/
   15092 /*---                                                      ---*/
   15093 /*------------------------------------------------------------*/
   15094 
   15095 static Long dis_MOVDDUP_128 ( const VexAbiInfo* vbi, Prefix pfx,
   15096                               Long delta, Bool isAvx )
   15097 {
   15098    IRTemp addr   = IRTemp_INVALID;
   15099    Int    alen   = 0;
   15100    HChar  dis_buf[50];
   15101    IRTemp sV    = newTemp(Ity_V128);
   15102    IRTemp d0    = newTemp(Ity_I64);
   15103    UChar  modrm = getUChar(delta);
   15104    UInt   rG    = gregOfRexRM(pfx,modrm);
   15105    if (epartIsReg(modrm)) {
   15106       UInt rE = eregOfRexRM(pfx,modrm);
   15107       assign( sV, getXMMReg(rE) );
   15108       DIP("%smovddup %s,%s\n",
   15109           isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
   15110       delta += 1;
   15111       assign ( d0, unop(Iop_V128to64, mkexpr(sV)) );
   15112    } else {
   15113       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15114       assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
   15115       DIP("%smovddup %s,%s\n",
   15116           isAvx ? "v" : "", dis_buf, nameXMMReg(rG));
   15117       delta += alen;
   15118    }
   15119    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   15120       ( rG, binop(Iop_64HLtoV128,mkexpr(d0),mkexpr(d0)) );
   15121    return delta;
   15122 }
   15123 
   15124 
   15125 static Long dis_MOVDDUP_256 ( const VexAbiInfo* vbi, Prefix pfx,
   15126                               Long delta )
   15127 {
   15128    IRTemp addr   = IRTemp_INVALID;
   15129    Int    alen   = 0;
   15130    HChar  dis_buf[50];
   15131    IRTemp d0    = newTemp(Ity_I64);
   15132    IRTemp d1    = newTemp(Ity_I64);
   15133    UChar  modrm = getUChar(delta);
   15134    UInt   rG    = gregOfRexRM(pfx,modrm);
   15135    if (epartIsReg(modrm)) {
   15136       UInt rE = eregOfRexRM(pfx,modrm);
   15137       DIP("vmovddup %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
   15138       delta += 1;
   15139       assign ( d0, getYMMRegLane64(rE, 0) );
   15140       assign ( d1, getYMMRegLane64(rE, 2) );
   15141    } else {
   15142       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15143       assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
   15144       assign( d1, loadLE(Ity_I64, binop(Iop_Add64,
   15145                                         mkexpr(addr), mkU64(16))) );
   15146       DIP("vmovddup %s,%s\n", dis_buf, nameYMMReg(rG));
   15147       delta += alen;
   15148    }
   15149    putYMMRegLane64( rG, 0, mkexpr(d0) );
   15150    putYMMRegLane64( rG, 1, mkexpr(d0) );
   15151    putYMMRegLane64( rG, 2, mkexpr(d1) );
   15152    putYMMRegLane64( rG, 3, mkexpr(d1) );
   15153    return delta;
   15154 }
   15155 
   15156 
   15157 static Long dis_MOVSxDUP_128 ( const VexAbiInfo* vbi, Prefix pfx,
   15158                                Long delta, Bool isAvx, Bool isL )
   15159 {
   15160    IRTemp addr  = IRTemp_INVALID;
   15161    Int    alen  = 0;
   15162    HChar  dis_buf[50];
   15163    IRTemp sV    = newTemp(Ity_V128);
   15164    UChar  modrm = getUChar(delta);
   15165    UInt   rG    = gregOfRexRM(pfx,modrm);
   15166    IRTemp s3, s2, s1, s0;
   15167    s3 = s2 = s1 = s0 = IRTemp_INVALID;
   15168    if (epartIsReg(modrm)) {
   15169       UInt rE = eregOfRexRM(pfx,modrm);
   15170       assign( sV, getXMMReg(rE) );
   15171       DIP("%smovs%cdup %s,%s\n",
   15172           isAvx ? "v" : "", isL ? 'l' : 'h', nameXMMReg(rE), nameXMMReg(rG));
   15173       delta += 1;
   15174    } else {
   15175       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15176       if (!isAvx)
   15177          gen_SEGV_if_not_16_aligned( addr );
   15178       assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   15179       DIP("%smovs%cdup %s,%s\n",
   15180           isAvx ? "v" : "", isL ? 'l' : 'h', dis_buf, nameXMMReg(rG));
   15181       delta += alen;
   15182    }
   15183    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   15184    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   15185       ( rG, isL ? mkV128from32s( s2, s2, s0, s0 )
   15186                 : mkV128from32s( s3, s3, s1, s1 ) );
   15187    return delta;
   15188 }
   15189 
   15190 
   15191 static Long dis_MOVSxDUP_256 ( const VexAbiInfo* vbi, Prefix pfx,
   15192                                Long delta, Bool isL )
   15193 {
   15194    IRTemp addr  = IRTemp_INVALID;
   15195    Int    alen  = 0;
   15196    HChar  dis_buf[50];
   15197    IRTemp sV    = newTemp(Ity_V256);
   15198    UChar  modrm = getUChar(delta);
   15199    UInt   rG    = gregOfRexRM(pfx,modrm);
   15200    IRTemp s7, s6, s5, s4, s3, s2, s1, s0;
   15201    s7 = s6 = s5 = s4 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   15202    if (epartIsReg(modrm)) {
   15203       UInt rE = eregOfRexRM(pfx,modrm);
   15204       assign( sV, getYMMReg(rE) );
   15205       DIP("vmovs%cdup %s,%s\n",
   15206           isL ? 'l' : 'h', nameYMMReg(rE), nameYMMReg(rG));
   15207       delta += 1;
   15208    } else {
   15209       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15210       assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
   15211       DIP("vmovs%cdup %s,%s\n",
   15212           isL ? 'l' : 'h', dis_buf, nameYMMReg(rG));
   15213       delta += alen;
   15214    }
   15215    breakupV256to32s( sV, &s7, &s6, &s5, &s4, &s3, &s2, &s1, &s0 );
   15216    putYMMRegLane128( rG, 1, isL ? mkV128from32s( s6, s6, s4, s4 )
   15217                                 : mkV128from32s( s7, s7, s5, s5 ) );
   15218    putYMMRegLane128( rG, 0, isL ? mkV128from32s( s2, s2, s0, s0 )
   15219                                 : mkV128from32s( s3, s3, s1, s1 ) );
   15220    return delta;
   15221 }
   15222 
   15223 
   15224 static IRTemp math_HADDPS_128 ( IRTemp dV, IRTemp sV, Bool isAdd )
   15225 {
   15226    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   15227    IRTemp leftV  = newTemp(Ity_V128);
   15228    IRTemp rightV = newTemp(Ity_V128);
   15229    IRTemp rm     = newTemp(Ity_I32);
   15230    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   15231 
   15232    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   15233    breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   15234 
   15235    assign( leftV,  mkV128from32s( s2, s0, d2, d0 ) );
   15236    assign( rightV, mkV128from32s( s3, s1, d3, d1 ) );
   15237 
   15238    IRTemp res = newTemp(Ity_V128);
   15239    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   15240    assign( res, triop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4,
   15241                       mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) );
   15242    return res;
   15243 }
   15244 
   15245 
   15246 static IRTemp math_HADDPD_128 ( IRTemp dV, IRTemp sV, Bool isAdd )
   15247 {
   15248    IRTemp s1, s0, d1, d0;
   15249    IRTemp leftV  = newTemp(Ity_V128);
   15250    IRTemp rightV = newTemp(Ity_V128);
   15251    IRTemp rm     = newTemp(Ity_I32);
   15252    s1 = s0 = d1 = d0 = IRTemp_INVALID;
   15253 
   15254    breakupV128to64s( sV, &s1, &s0 );
   15255    breakupV128to64s( dV, &d1, &d0 );
   15256 
   15257    assign( leftV,  binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)) );
   15258    assign( rightV, binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1)) );
   15259 
   15260    IRTemp res = newTemp(Ity_V128);
   15261    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   15262    assign( res, triop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2,
   15263                       mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) );
   15264    return res;
   15265 }
   15266 
   15267 
   15268 __attribute__((noinline))
   15269 static
   15270 Long dis_ESC_0F__SSE3 ( Bool* decode_OK,
   15271                         const VexAbiInfo* vbi,
   15272                         Prefix pfx, Int sz, Long deltaIN )
   15273 {
   15274    IRTemp addr  = IRTemp_INVALID;
   15275    UChar  modrm = 0;
   15276    Int    alen  = 0;
   15277    HChar  dis_buf[50];
   15278 
   15279    *decode_OK = False;
   15280 
   15281    Long   delta = deltaIN;
   15282    UChar  opc   = getUChar(delta);
   15283    delta++;
   15284    switch (opc) {
   15285 
   15286    case 0x12:
   15287       /* F3 0F 12 = MOVSLDUP -- move from E (mem or xmm) to G (xmm),
   15288          duplicating some lanes (2:2:0:0). */
   15289       if (haveF3no66noF2(pfx) && sz == 4) {
   15290          delta = dis_MOVSxDUP_128( vbi, pfx, delta, False/*!isAvx*/,
   15291                                    True/*isL*/ );
   15292          goto decode_success;
   15293       }
   15294       /* F2 0F 12 = MOVDDUP -- move from E (mem or xmm) to G (xmm),
   15295          duplicating some lanes (0:1:0:1). */
   15296       if (haveF2no66noF3(pfx)
   15297           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   15298          delta = dis_MOVDDUP_128( vbi, pfx, delta, False/*!isAvx*/ );
   15299          goto decode_success;
   15300       }
   15301       break;
   15302 
   15303    case 0x16:
   15304       /* F3 0F 16 = MOVSHDUP -- move from E (mem or xmm) to G (xmm),
   15305          duplicating some lanes (3:3:1:1). */
   15306       if (haveF3no66noF2(pfx) && sz == 4) {
   15307          delta = dis_MOVSxDUP_128( vbi, pfx, delta, False/*!isAvx*/,
   15308                                    False/*!isL*/ );
   15309          goto decode_success;
   15310       }
   15311       break;
   15312 
   15313    case 0x7C:
   15314    case 0x7D:
   15315       /* F2 0F 7C = HADDPS -- 32x4 add across from E (mem or xmm) to G (xmm). */
   15316       /* F2 0F 7D = HSUBPS -- 32x4 sub across from E (mem or xmm) to G (xmm). */
   15317       if (haveF2no66noF3(pfx) && sz == 4) {
   15318          IRTemp eV     = newTemp(Ity_V128);
   15319          IRTemp gV     = newTemp(Ity_V128);
   15320          Bool   isAdd  = opc == 0x7C;
   15321          const HChar* str = isAdd ? "add" : "sub";
   15322          modrm         = getUChar(delta);
   15323          UInt   rG     = gregOfRexRM(pfx,modrm);
   15324          if (epartIsReg(modrm)) {
   15325             UInt rE = eregOfRexRM(pfx,modrm);
   15326             assign( eV, getXMMReg(rE) );
   15327             DIP("h%sps %s,%s\n", str, nameXMMReg(rE), nameXMMReg(rG));
   15328             delta += 1;
   15329          } else {
   15330             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15331             assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   15332             DIP("h%sps %s,%s\n", str, dis_buf, nameXMMReg(rG));
   15333             delta += alen;
   15334          }
   15335 
   15336          assign( gV, getXMMReg(rG) );
   15337          putXMMReg( rG, mkexpr( math_HADDPS_128 ( gV, eV, isAdd ) ) );
   15338          goto decode_success;
   15339       }
   15340       /* 66 0F 7C = HADDPD -- 64x2 add across from E (mem or xmm) to G (xmm). */
   15341       /* 66 0F 7D = HSUBPD -- 64x2 sub across from E (mem or xmm) to G (xmm). */
   15342       if (have66noF2noF3(pfx) && sz == 2) {
   15343          IRTemp eV     = newTemp(Ity_V128);
   15344          IRTemp gV     = newTemp(Ity_V128);
   15345          Bool   isAdd  = opc == 0x7C;
   15346          const HChar* str = isAdd ? "add" : "sub";
   15347          modrm         = getUChar(delta);
   15348          UInt   rG     = gregOfRexRM(pfx,modrm);
   15349          if (epartIsReg(modrm)) {
   15350             UInt rE = eregOfRexRM(pfx,modrm);
   15351             assign( eV, getXMMReg(rE) );
   15352             DIP("h%spd %s,%s\n", str, nameXMMReg(rE), nameXMMReg(rG));
   15353             delta += 1;
   15354          } else {
   15355             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15356             assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   15357             DIP("h%spd %s,%s\n", str, dis_buf, nameXMMReg(rG));
   15358             delta += alen;
   15359          }
   15360 
   15361          assign( gV, getXMMReg(rG) );
   15362          putXMMReg( rG, mkexpr( math_HADDPD_128 ( gV, eV, isAdd ) ) );
   15363          goto decode_success;
   15364       }
   15365       break;
   15366 
   15367    case 0xD0:
   15368       /* 66 0F D0 = ADDSUBPD -- 64x4 +/- from E (mem or xmm) to G (xmm). */
   15369       if (have66noF2noF3(pfx) && sz == 2) {
   15370          IRTemp eV   = newTemp(Ity_V128);
   15371          IRTemp gV   = newTemp(Ity_V128);
   15372          modrm       = getUChar(delta);
   15373          UInt   rG   = gregOfRexRM(pfx,modrm);
   15374          if (epartIsReg(modrm)) {
   15375             UInt rE = eregOfRexRM(pfx,modrm);
   15376             assign( eV, getXMMReg(rE) );
   15377             DIP("addsubpd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   15378             delta += 1;
   15379          } else {
   15380             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15381             assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   15382             DIP("addsubpd %s,%s\n", dis_buf, nameXMMReg(rG));
   15383             delta += alen;
   15384          }
   15385 
   15386          assign( gV, getXMMReg(rG) );
   15387          putXMMReg( rG, mkexpr( math_ADDSUBPD_128 ( gV, eV ) ) );
   15388          goto decode_success;
   15389       }
   15390       /* F2 0F D0 = ADDSUBPS -- 32x4 +/-/+/- from E (mem or xmm) to G (xmm). */
   15391       if (haveF2no66noF3(pfx) && sz == 4) {
   15392          IRTemp eV   = newTemp(Ity_V128);
   15393          IRTemp gV   = newTemp(Ity_V128);
   15394          modrm       = getUChar(delta);
   15395          UInt   rG   = gregOfRexRM(pfx,modrm);
   15396 
   15397          modrm = getUChar(delta);
   15398          if (epartIsReg(modrm)) {
   15399             UInt rE = eregOfRexRM(pfx,modrm);
   15400             assign( eV, getXMMReg(rE) );
   15401             DIP("addsubps %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   15402             delta += 1;
   15403          } else {
   15404             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15405             assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   15406             DIP("addsubps %s,%s\n", dis_buf, nameXMMReg(rG));
   15407             delta += alen;
   15408          }
   15409 
   15410          assign( gV, getXMMReg(rG) );
   15411          putXMMReg( rG, mkexpr( math_ADDSUBPS_128 ( gV, eV ) ) );
   15412          goto decode_success;
   15413       }
   15414       break;
   15415 
   15416    case 0xF0:
   15417       /* F2 0F F0 = LDDQU -- move from E (mem or xmm) to G (xmm). */
   15418       if (haveF2no66noF3(pfx) && sz == 4) {
   15419          modrm = getUChar(delta);
   15420          if (epartIsReg(modrm)) {
   15421             goto decode_failure;
   15422          } else {
   15423             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15424             putXMMReg( gregOfRexRM(pfx,modrm),
   15425                        loadLE(Ity_V128, mkexpr(addr)) );
   15426             DIP("lddqu %s,%s\n", dis_buf,
   15427                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   15428             delta += alen;
   15429          }
   15430          goto decode_success;
   15431       }
   15432       break;
   15433 
   15434    default:
   15435       goto decode_failure;
   15436 
   15437    }
   15438 
   15439   decode_failure:
   15440    *decode_OK = False;
   15441    return deltaIN;
   15442 
   15443   decode_success:
   15444    *decode_OK = True;
   15445    return delta;
   15446 }
   15447 
   15448 
   15449 /*------------------------------------------------------------*/
   15450 /*---                                                      ---*/
   15451 /*--- Top-level SSSE3: dis_ESC_0F38__SupSSE3               ---*/
   15452 /*---                                                      ---*/
   15453 /*------------------------------------------------------------*/
   15454 
/* Build IR for a 128-bit PSHUFB.  For each result byte i: if bit 7 of
   sV's byte i is set, the result byte is zero; otherwise it is dV's
   byte selected by the low 4 bits of sV's byte i.  Since Iop_Perm8x8
   can only select among 8 source bytes, the work is done on the two
   64-bit halves separately: each half permutes from both dHi and dLo,
   and bit 3 of the selector chooses which permute result to keep. */
static
IRTemp math_PSHUFB_XMM ( IRTemp dV/*data to perm*/, IRTemp sV/*perm*/ )
{
   IRTemp sHi        = newTemp(Ity_I64);
   IRTemp sLo        = newTemp(Ity_I64);
   IRTemp dHi        = newTemp(Ity_I64);
   IRTemp dLo        = newTemp(Ity_I64);
   IRTemp rHi        = newTemp(Ity_I64);
   IRTemp rLo        = newTemp(Ity_I64);
   IRTemp sevens     = newTemp(Ity_I64);
   IRTemp mask0x80hi = newTemp(Ity_I64);
   IRTemp mask0x80lo = newTemp(Ity_I64);
   IRTemp maskBit3hi = newTemp(Ity_I64);
   IRTemp maskBit3lo = newTemp(Ity_I64);
   IRTemp sAnd7hi    = newTemp(Ity_I64);
   IRTemp sAnd7lo    = newTemp(Ity_I64);
   IRTemp permdHi    = newTemp(Ity_I64);
   IRTemp permdLo    = newTemp(Ity_I64);
   IRTemp res        = newTemp(Ity_V128);

   /* split both inputs into 64-bit halves */
   assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

   /* per-byte constant 7, for masking selectors down to 3 bits */
   assign( sevens, mkU64(0x0707070707070707ULL) );

   /* mask0x80hi = Not(SarN8x8(sHi,7))
      maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7)
      sAnd7hi    = And(sHi,sevens)
      permdHi    = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi),
      And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) )
      rHi        = And(permdHi,mask0x80hi)
   */
   /* mask0x80hi: 0x00 in bytes whose selector has bit 7 set (those
      result bytes become zero), 0xFF elsewhere */
   assign(
      mask0x80hi,
      unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7))));

   /* maskBit3hi: per-byte broadcast of selector bit 3 -- chooses
      between the dHi and dLo permute results */
   assign(
      maskBit3hi,
      binop(Iop_SarN8x8,
            binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)),
            mkU8(7)));

   assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens)));

   /* permute both data halves by the 3-bit selectors, then merge the
      two results byte-wise according to selector bit 3 */
   assign(
      permdHi,
      binop(
         Iop_Or64,
         binop(Iop_And64,
               binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)),
               mkexpr(maskBit3hi)),
         binop(Iop_And64,
               binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)),
               unop(Iop_Not64,mkexpr(maskBit3hi))) ));

   /* finally zero out the bytes whose selector had bit 7 set */
   assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) );

   /* And the same for the lower half of the result.  What fun. */

   assign(
      mask0x80lo,
      unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7))));

   assign(
      maskBit3lo,
      binop(Iop_SarN8x8,
            binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)),
            mkU8(7)));

   assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens)));

   assign(
      permdLo,
      binop(
         Iop_Or64,
         binop(Iop_And64,
               binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)),
               mkexpr(maskBit3lo)),
         binop(Iop_And64,
               binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)),
               unop(Iop_Not64,mkexpr(maskBit3lo))) ));

   assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) );

   /* glue the two halves back into a V128 */
   assign(res, binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo)));
   return res;
}
   15544 
   15545 
   15546 static
   15547 IRTemp math_PSHUFB_YMM ( IRTemp dV/*data to perm*/, IRTemp sV/*perm*/ )
   15548 {
   15549    IRTemp sHi, sLo, dHi, dLo;
   15550    sHi = sLo = dHi = dLo = IRTemp_INVALID;
   15551    breakupV256toV128s( dV, &dHi, &dLo);
   15552    breakupV256toV128s( sV, &sHi, &sLo);
   15553    IRTemp res = newTemp(Ity_V256);
   15554    assign(res, binop(Iop_V128HLtoV256,
   15555                      mkexpr(math_PSHUFB_XMM(dHi, sHi)),
   15556                      mkexpr(math_PSHUFB_XMM(dLo, sLo))));
   15557    return res;
   15558 }
   15559 
   15560 
   15561 static Long dis_PHADD_128 ( const VexAbiInfo* vbi, Prefix pfx, Long delta,
   15562                             Bool isAvx, UChar opc )
   15563 {
   15564    IRTemp addr   = IRTemp_INVALID;
   15565    Int    alen   = 0;
   15566    HChar  dis_buf[50];
   15567    const HChar* str = "???";
   15568    IROp   opV64  = Iop_INVALID;
   15569    IROp   opCatO = Iop_CatOddLanes16x4;
   15570    IROp   opCatE = Iop_CatEvenLanes16x4;
   15571    IRTemp sV     = newTemp(Ity_V128);
   15572    IRTemp dV     = newTemp(Ity_V128);
   15573    IRTemp sHi    = newTemp(Ity_I64);
   15574    IRTemp sLo    = newTemp(Ity_I64);
   15575    IRTemp dHi    = newTemp(Ity_I64);
   15576    IRTemp dLo    = newTemp(Ity_I64);
   15577    UChar  modrm  = getUChar(delta);
   15578    UInt   rG     = gregOfRexRM(pfx,modrm);
   15579    UInt   rV     = isAvx ? getVexNvvvv(pfx) : rG;
   15580 
   15581    switch (opc) {
   15582       case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
   15583       case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
   15584       case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
   15585       case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
   15586       case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
   15587       case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
   15588       default: vassert(0);
   15589    }
   15590    if (opc == 0x02 || opc == 0x06) {
   15591       opCatO = Iop_InterleaveHI32x2;
   15592       opCatE = Iop_InterleaveLO32x2;
   15593    }
   15594 
   15595    assign( dV, getXMMReg(rV) );
   15596 
   15597    if (epartIsReg(modrm)) {
   15598       UInt rE = eregOfRexRM(pfx,modrm);
   15599       assign( sV, getXMMReg(rE) );
   15600       DIP("%sph%s %s,%s\n", isAvx ? "v" : "", str,
   15601           nameXMMReg(rE), nameXMMReg(rG));
   15602       delta += 1;
   15603    } else {
   15604       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15605       if (!isAvx)
   15606          gen_SEGV_if_not_16_aligned( addr );
   15607       assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   15608       DIP("%sph%s %s,%s\n", isAvx ? "v" : "", str,
   15609           dis_buf, nameXMMReg(rG));
   15610       delta += alen;
   15611    }
   15612 
   15613    assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   15614    assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   15615    assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   15616    assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   15617 
   15618    /* This isn't a particularly efficient way to compute the
   15619       result, but at least it avoids a proliferation of IROps,
   15620       hence avoids complication all the backends. */
   15621 
   15622    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   15623       ( rG,
   15624         binop(Iop_64HLtoV128,
   15625               binop(opV64,
   15626                     binop(opCatE,mkexpr(sHi),mkexpr(sLo)),
   15627                     binop(opCatO,mkexpr(sHi),mkexpr(sLo)) ),
   15628               binop(opV64,
   15629                     binop(opCatE,mkexpr(dHi),mkexpr(dLo)),
   15630                     binop(opCatO,mkexpr(dHi),mkexpr(dLo)) ) ) );
   15631    return delta;
   15632 }
   15633 
   15634 
   15635 static Long dis_PHADD_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta,
   15636                             UChar opc )
   15637 {
   15638    IRTemp addr   = IRTemp_INVALID;
   15639    Int    alen   = 0;
   15640    HChar  dis_buf[50];
   15641    const HChar* str = "???";
   15642    IROp   opV64  = Iop_INVALID;
   15643    IROp   opCatO = Iop_CatOddLanes16x4;
   15644    IROp   opCatE = Iop_CatEvenLanes16x4;
   15645    IRTemp sV     = newTemp(Ity_V256);
   15646    IRTemp dV     = newTemp(Ity_V256);
   15647    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   15648    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   15649    UChar  modrm  = getUChar(delta);
   15650    UInt   rG     = gregOfRexRM(pfx,modrm);
   15651    UInt   rV     = getVexNvvvv(pfx);
   15652 
   15653    switch (opc) {
   15654       case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
   15655       case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
   15656       case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
   15657       case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
   15658       case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
   15659       case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
   15660       default: vassert(0);
   15661    }
   15662    if (opc == 0x02 || opc == 0x06) {
   15663       opCatO = Iop_InterleaveHI32x2;
   15664       opCatE = Iop_InterleaveLO32x2;
   15665    }
   15666 
   15667    assign( dV, getYMMReg(rV) );
   15668 
   15669    if (epartIsReg(modrm)) {
   15670       UInt rE = eregOfRexRM(pfx,modrm);
   15671       assign( sV, getYMMReg(rE) );
   15672       DIP("vph%s %s,%s\n", str, nameYMMReg(rE), nameYMMReg(rG));
   15673       delta += 1;
   15674    } else {
   15675       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15676       assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
   15677       DIP("vph%s %s,%s\n", str, dis_buf, nameYMMReg(rG));
   15678       delta += alen;
   15679    }
   15680 
   15681    breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
   15682    breakupV256to64s( sV, &s3, &s2, &s1, &s0 );
   15683 
   15684    /* This isn't a particularly efficient way to compute the
   15685       result, but at least it avoids a proliferation of IROps,
   15686       hence avoids complication all the backends. */
   15687 
   15688    putYMMReg( rG,
   15689               binop(Iop_V128HLtoV256,
   15690                     binop(Iop_64HLtoV128,
   15691                           binop(opV64,
   15692                                 binop(opCatE,mkexpr(s3),mkexpr(s2)),
   15693                                 binop(opCatO,mkexpr(s3),mkexpr(s2)) ),
   15694                           binop(opV64,
   15695                                 binop(opCatE,mkexpr(d3),mkexpr(d2)),
   15696                                 binop(opCatO,mkexpr(d3),mkexpr(d2)) ) ),
   15697                     binop(Iop_64HLtoV128,
   15698                           binop(opV64,
   15699                                 binop(opCatE,mkexpr(s1),mkexpr(s0)),
   15700                                 binop(opCatO,mkexpr(s1),mkexpr(s0)) ),
   15701                           binop(opV64,
   15702                                 binop(opCatE,mkexpr(d1),mkexpr(d0)),
   15703                                 binop(opCatO,mkexpr(d1),mkexpr(d0)) ) ) ) );
   15704    return delta;
   15705 }
   15706 
   15707 
   15708 static IRTemp math_PMADDUBSW_128 ( IRTemp dV, IRTemp sV )
   15709 {
   15710    IRTemp sVoddsSX  = newTemp(Ity_V128);
   15711    IRTemp sVevensSX = newTemp(Ity_V128);
   15712    IRTemp dVoddsZX  = newTemp(Ity_V128);
   15713    IRTemp dVevensZX = newTemp(Ity_V128);
   15714    /* compute dV unsigned x sV signed */
   15715    assign( sVoddsSX, binop(Iop_SarN16x8, mkexpr(sV), mkU8(8)) );
   15716    assign( sVevensSX, binop(Iop_SarN16x8,
   15717                             binop(Iop_ShlN16x8, mkexpr(sV), mkU8(8)),
   15718                             mkU8(8)) );
   15719    assign( dVoddsZX, binop(Iop_ShrN16x8, mkexpr(dV), mkU8(8)) );
   15720    assign( dVevensZX, binop(Iop_ShrN16x8,
   15721                             binop(Iop_ShlN16x8, mkexpr(dV), mkU8(8)),
   15722                             mkU8(8)) );
   15723 
   15724    IRTemp res = newTemp(Ity_V128);
   15725    assign( res, binop(Iop_QAdd16Sx8,
   15726                       binop(Iop_Mul16x8, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
   15727                       binop(Iop_Mul16x8, mkexpr(sVevensSX), mkexpr(dVevensZX))
   15728                      )
   15729          );
   15730    return res;
   15731 }
   15732 
   15733 
   15734 static
   15735 IRTemp math_PMADDUBSW_256 ( IRTemp dV, IRTemp sV )
   15736 {
   15737    IRTemp sHi, sLo, dHi, dLo;
   15738    sHi = sLo = dHi = dLo = IRTemp_INVALID;
   15739    breakupV256toV128s( dV, &dHi, &dLo);
   15740    breakupV256toV128s( sV, &sHi, &sLo);
   15741    IRTemp res = newTemp(Ity_V256);
   15742    assign(res, binop(Iop_V128HLtoV256,
   15743                      mkexpr(math_PMADDUBSW_128(dHi, sHi)),
   15744                      mkexpr(math_PMADDUBSW_128(dLo, sLo))));
   15745    return res;
   15746 }
   15747 
   15748 
/* Top-level decoder for the SupSSE3 (SSSE3) instructions reached via
   the 0F 38 escape: PSHUFB, PHADD{W,D,SW}, PHSUB{W,D,SW}, PMADDUBSW,
   PSIGN{B,W,D}, PMULHRSW and PABS{B,W,D}.  Each opcode has two forms:
   a 66-prefixed XMM form and a prefix-less MMX form (sz == 4), both
   handled here.  On a successful decode, sets *decode_OK to True and
   returns the offset just past the decoded instruction; otherwise
   leaves *decode_OK False and returns deltaIN unchanged. */
__attribute__((noinline))
static
Long dis_ESC_0F38__SupSSE3 ( Bool* decode_OK,
                             const VexAbiInfo* vbi,
                             Prefix pfx, Int sz, Long deltaIN )
{
   IRTemp addr  = IRTemp_INVALID;
   UChar  modrm = 0;
   Int    alen  = 0;
   HChar  dis_buf[50];

   *decode_OK = False;

   /* The opcode byte following the 0F 38 escape selects the insn. */
   Long   delta = deltaIN;
   UChar  opc   = getUChar(delta);
   delta++;
   switch (opc) {

   case 0x00:
      /* 66 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x16 (XMM) */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
         IRTemp sV = newTemp(Ity_V128);
         IRTemp dV = newTemp(Ity_V128);

         modrm = getUChar(delta);
         assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

         if (epartIsReg(modrm)) {
            assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
            delta += 1;
            DIP("pshufb %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            delta += alen;
            DIP("pshufb %s,%s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
         }

         IRTemp res = math_PSHUFB_XMM( dV, sV );
         putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(res));
         goto decode_success;
      }
      /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         IRTemp sV      = newTemp(Ity_I64);
         IRTemp dV      = newTemp(Ity_I64);

         modrm = getUChar(delta);
         do_MMX_preamble();
         assign( dV, getMMXReg(gregLO3ofRM(modrm)) );

         if (epartIsReg(modrm)) {
            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
            delta += 1;
            DIP("pshufb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                  nameMMXReg(gregLO3ofRM(modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
            delta += alen;
            DIP("pshufb %s,%s\n", dis_buf,
                                  nameMMXReg(gregLO3ofRM(modrm)));
         }

         /* Only bits 2:0 of each index byte select a lane (mask with
            0x07 per byte); bit 7 set forces the result lane to 0. */
         putMMXReg(
            gregLO3ofRM(modrm),
            binop(
               Iop_And64,
               /* permute the lanes */
               binop(
                  Iop_Perm8x8,
                  mkexpr(dV),
                  binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL))
               ),
               /* mask off lanes which have (index & 0x80) == 0x80 */
               unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7)))
            )
         );
         goto decode_success;
      }
      break;

   case 0x01:
   case 0x02:
   case 0x03:
   case 0x05:
   case 0x06:
   case 0x07:
      /* 66 0F 38 01 = PHADDW -- 16x8 add across from E (mem or xmm) and
         G to G (xmm). */
      /* 66 0F 38 02 = PHADDD -- 32x4 add across from E (mem or xmm) and
         G to G (xmm). */
      /* 66 0F 38 03 = PHADDSW -- 16x8 signed qadd across from E (mem or
         xmm) and G to G (xmm). */
      /* 66 0F 38 05 = PHSUBW -- 16x8 sub across from E (mem or xmm) and
         G to G (xmm). */
      /* 66 0F 38 06 = PHSUBD -- 32x4 sub across from E (mem or xmm) and
         G to G (xmm). */
      /* 66 0F 38 07 = PHSUBSW -- 16x8 signed qsub across from E (mem or
         xmm) and G to G (xmm). */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
         delta = dis_PHADD_128( vbi, pfx, delta, False/*isAvx*/, opc );
         goto decode_success;
      }
      /* ***--- these are MMX class insns introduced in SSSE3 ---*** */
      /* 0F 38 01 = PHADDW -- 16x4 add across from E (mem or mmx) and G
         to G (mmx). */
      /* 0F 38 02 = PHADDD -- 32x2 add across from E (mem or mmx) and G
         to G (mmx). */
      /* 0F 38 03 = PHADDSW -- 16x4 signed qadd across from E (mem or
         mmx) and G to G (mmx). */
      /* 0F 38 05 = PHSUBW -- 16x4 sub across from E (mem or mmx) and G
         to G (mmx). */
      /* 0F 38 06 = PHSUBD -- 32x2 sub across from E (mem or mmx) and G
         to G (mmx). */
      /* 0F 38 07 = PHSUBSW -- 16x4 signed qsub across from E (mem or
         mmx) and G to G (mmx). */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         const HChar* str = "???";
         IROp   opV64  = Iop_INVALID;
         IROp   opCatO = Iop_CatOddLanes16x4;
         IROp   opCatE = Iop_CatEvenLanes16x4;
         IRTemp sV     = newTemp(Ity_I64);
         IRTemp dV     = newTemp(Ity_I64);

         modrm = getUChar(delta);

         switch (opc) {
            case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
            case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
            case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
            case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
            case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
            case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
            default: vassert(0);
         }
         /* The doubleword forms need 32-bit lane (de)interleaving
            rather than the default 16-bit even/odd concatenation. */
         if (opc == 0x02 || opc == 0x06) {
            opCatO = Iop_InterleaveHI32x2;
            opCatE = Iop_InterleaveLO32x2;
         }

         do_MMX_preamble();
         assign( dV, getMMXReg(gregLO3ofRM(modrm)) );

         if (epartIsReg(modrm)) {
            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
            delta += 1;
            DIP("ph%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
                                     nameMMXReg(gregLO3ofRM(modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
            delta += alen;
            DIP("ph%s %s,%s\n", str, dis_buf,
                                     nameMMXReg(gregLO3ofRM(modrm)));
         }

         putMMXReg(
            gregLO3ofRM(modrm),
            binop(opV64,
                  binop(opCatE,mkexpr(sV),mkexpr(dV)),
                  binop(opCatO,mkexpr(sV),mkexpr(dV))
            )
         );
         goto decode_success;
      }
      break;

   case 0x04:
      /* 66 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
         Unsigned Bytes (XMM) */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
         IRTemp sV = newTemp(Ity_V128);
         IRTemp dV = newTemp(Ity_V128);
         modrm     = getUChar(delta);
         UInt   rG = gregOfRexRM(pfx,modrm);

         assign( dV, getXMMReg(rG) );

         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( sV, getXMMReg(rE) );
            delta += 1;
            DIP("pmaddubsw %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            delta += alen;
            DIP("pmaddubsw %s,%s\n", dis_buf, nameXMMReg(rG));
         }

         putXMMReg( rG, mkexpr( math_PMADDUBSW_128( dV, sV ) ) );
         goto decode_success;
      }
      /* 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
         Unsigned Bytes (MMX) */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         IRTemp sV        = newTemp(Ity_I64);
         IRTemp dV        = newTemp(Ity_I64);
         IRTemp sVoddsSX  = newTemp(Ity_I64);
         IRTemp sVevensSX = newTemp(Ity_I64);
         IRTemp dVoddsZX  = newTemp(Ity_I64);
         IRTemp dVevensZX = newTemp(Ity_I64);

         modrm = getUChar(delta);
         do_MMX_preamble();
         assign( dV, getMMXReg(gregLO3ofRM(modrm)) );

         if (epartIsReg(modrm)) {
            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
            delta += 1;
            DIP("pmaddubsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                     nameMMXReg(gregLO3ofRM(modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
            delta += alen;
            DIP("pmaddubsw %s,%s\n", dis_buf,
                                     nameMMXReg(gregLO3ofRM(modrm)));
         }

         /* compute dV unsigned x sV signed */
         /* Sar sign-extends the odd/even bytes of sV to 16 bits;
            Shr zero-extends those of dV. */
         assign( sVoddsSX,
                 binop(Iop_SarN16x4, mkexpr(sV), mkU8(8)) );
         assign( sVevensSX,
                 binop(Iop_SarN16x4,
                       binop(Iop_ShlN16x4, mkexpr(sV), mkU8(8)),
                       mkU8(8)) );
         assign( dVoddsZX,
                 binop(Iop_ShrN16x4, mkexpr(dV), mkU8(8)) );
         assign( dVevensZX,
                 binop(Iop_ShrN16x4,
                       binop(Iop_ShlN16x4, mkexpr(dV), mkU8(8)),
                       mkU8(8)) );

         putMMXReg(
            gregLO3ofRM(modrm),
            binop(Iop_QAdd16Sx4,
                  binop(Iop_Mul16x4, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
                  binop(Iop_Mul16x4, mkexpr(sVevensSX), mkexpr(dVevensZX))
            )
         );
         goto decode_success;
      }
      break;

   case 0x08:
   case 0x09:
   case 0x0A:
      /* 66 0F 38 08 = PSIGNB -- Packed Sign 8x16 (XMM) */
      /* 66 0F 38 09 = PSIGNW -- Packed Sign 16x8 (XMM) */
      /* 66 0F 38 0A = PSIGND -- Packed Sign 32x4 (XMM) */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
         IRTemp sV      = newTemp(Ity_V128);
         IRTemp dV      = newTemp(Ity_V128);
         IRTemp sHi     = newTemp(Ity_I64);
         IRTemp sLo     = newTemp(Ity_I64);
         IRTemp dHi     = newTemp(Ity_I64);
         IRTemp dLo     = newTemp(Ity_I64);
         const HChar* str = "???";
         Int    laneszB = 0;

         switch (opc) {
            case 0x08: laneszB = 1; str = "b"; break;
            case 0x09: laneszB = 2; str = "w"; break;
            case 0x0A: laneszB = 4; str = "d"; break;
            default: vassert(0);
         }

         modrm = getUChar(delta);
         assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

         if (epartIsReg(modrm)) {
            assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
            delta += 1;
            DIP("psign%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            delta += alen;
            DIP("psign%s %s,%s\n", str, dis_buf,
                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
         }

         /* Do the operation on each 64-bit half via the helper, then
            reassemble the V128 result. */
         assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
         assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
         assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
         assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

         putXMMReg(
            gregOfRexRM(pfx,modrm),
            binop(Iop_64HLtoV128,
                  dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
                  dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
            )
         );
         goto decode_success;
      }
      /* 0F 38 08 = PSIGNB -- Packed Sign 8x8  (MMX) */
      /* 0F 38 09 = PSIGNW -- Packed Sign 16x4 (MMX) */
      /* 0F 38 0A = PSIGND -- Packed Sign 32x2 (MMX) */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         IRTemp sV      = newTemp(Ity_I64);
         IRTemp dV      = newTemp(Ity_I64);
         const HChar* str = "???";
         Int    laneszB = 0;

         switch (opc) {
            case 0x08: laneszB = 1; str = "b"; break;
            case 0x09: laneszB = 2; str = "w"; break;
            case 0x0A: laneszB = 4; str = "d"; break;
            default: vassert(0);
         }

         modrm = getUChar(delta);
         do_MMX_preamble();
         assign( dV, getMMXReg(gregLO3ofRM(modrm)) );

         if (epartIsReg(modrm)) {
            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
            delta += 1;
            DIP("psign%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
                                        nameMMXReg(gregLO3ofRM(modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
            delta += alen;
            DIP("psign%s %s,%s\n", str, dis_buf,
                                        nameMMXReg(gregLO3ofRM(modrm)));
         }

         putMMXReg(
            gregLO3ofRM(modrm),
            dis_PSIGN_helper( mkexpr(sV), mkexpr(dV), laneszB )
         );
         goto decode_success;
      }
      break;

   case 0x0B:
      /* 66 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and
         Scale (XMM) */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
         IRTemp sV  = newTemp(Ity_V128);
         IRTemp dV  = newTemp(Ity_V128);
         IRTemp sHi = newTemp(Ity_I64);
         IRTemp sLo = newTemp(Ity_I64);
         IRTemp dHi = newTemp(Ity_I64);
         IRTemp dLo = newTemp(Ity_I64);

         modrm = getUChar(delta);
         assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

         if (epartIsReg(modrm)) {
            assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
            delta += 1;
            DIP("pmulhrsw %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            delta += alen;
            DIP("pmulhrsw %s,%s\n", dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         }

         /* As for PSIGN: split into 64-bit halves, run the helper on
            each, and glue the halves back together. */
         assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
         assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
         assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
         assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

         putXMMReg(
            gregOfRexRM(pfx,modrm),
            binop(Iop_64HLtoV128,
                  dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
                  dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
            )
         );
         goto decode_success;
      }
      /* 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and Scale
         (MMX) */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         IRTemp sV = newTemp(Ity_I64);
         IRTemp dV = newTemp(Ity_I64);

         modrm = getUChar(delta);
         do_MMX_preamble();
         assign( dV, getMMXReg(gregLO3ofRM(modrm)) );

         if (epartIsReg(modrm)) {
            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
            delta += 1;
            DIP("pmulhrsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                    nameMMXReg(gregLO3ofRM(modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
            delta += alen;
            DIP("pmulhrsw %s,%s\n", dis_buf,
                                    nameMMXReg(gregLO3ofRM(modrm)));
         }

         putMMXReg(
            gregLO3ofRM(modrm),
            dis_PMULHRSW_helper( mkexpr(sV), mkexpr(dV) )
         );
         goto decode_success;
      }
      break;

   case 0x1C:
   case 0x1D:
   case 0x1E:
      /* 66 0F 38 1C = PABSB -- Packed Absolute Value 8x16 (XMM) */
      /* 66 0F 38 1D = PABSW -- Packed Absolute Value 16x8 (XMM) */
      /* 66 0F 38 1E = PABSD -- Packed Absolute Value 32x4 (XMM) */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
         IRTemp sV  = newTemp(Ity_V128);
         const HChar* str = "???";
         Int    laneszB = 0;

         switch (opc) {
            case 0x1C: laneszB = 1; str = "b"; break;
            case 0x1D: laneszB = 2; str = "w"; break;
            case 0x1E: laneszB = 4; str = "d"; break;
            default: vassert(0);
         }

         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
            delta += 1;
            DIP("pabs%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
                                       nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            delta += alen;
            DIP("pabs%s %s,%s\n", str, dis_buf,
                                       nameXMMReg(gregOfRexRM(pfx,modrm)));
         }

         putXMMReg( gregOfRexRM(pfx,modrm),
                    mkexpr(math_PABS_XMM(sV, laneszB)) );
         goto decode_success;
      }
      /* 0F 38 1C = PABSB -- Packed Absolute Value 8x8  (MMX) */
      /* 0F 38 1D = PABSW -- Packed Absolute Value 16x4 (MMX) */
      /* 0F 38 1E = PABSD -- Packed Absolute Value 32x2 (MMX) */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         IRTemp sV      = newTemp(Ity_I64);
         const HChar* str = "???";
         Int    laneszB = 0;

         switch (opc) {
            case 0x1C: laneszB = 1; str = "b"; break;
            case 0x1D: laneszB = 2; str = "w"; break;
            case 0x1E: laneszB = 4; str = "d"; break;
            default: vassert(0);
         }

         modrm = getUChar(delta);
         do_MMX_preamble();

         if (epartIsReg(modrm)) {
            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
            delta += 1;
            DIP("pabs%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
                                       nameMMXReg(gregLO3ofRM(modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
            delta += alen;
            DIP("pabs%s %s,%s\n", str, dis_buf,
                                       nameMMXReg(gregLO3ofRM(modrm)));
         }

         putMMXReg( gregLO3ofRM(modrm),
                    mkexpr(math_PABS_MMX( sV, laneszB )) );
         goto decode_success;
      }
      break;

   default:
      break;

   }

  //decode_failure:
   *decode_OK = False;
   return deltaIN;

  decode_success:
   *decode_OK = True;
   return delta;
}
   16260 
   16261 
   16262 /*------------------------------------------------------------*/
   16263 /*---                                                      ---*/
   16264 /*--- Top-level SSSE3: dis_ESC_0F3A__SupSSE3               ---*/
   16265 /*---                                                      ---*/
   16266 /*------------------------------------------------------------*/
   16267 
   16268 __attribute__((noinline))
   16269 static
   16270 Long dis_ESC_0F3A__SupSSE3 ( Bool* decode_OK,
   16271                              const VexAbiInfo* vbi,
   16272                              Prefix pfx, Int sz, Long deltaIN )
   16273 {
   16274    Long   d64   = 0;
   16275    IRTemp addr  = IRTemp_INVALID;
   16276    UChar  modrm = 0;
   16277    Int    alen  = 0;
   16278    HChar  dis_buf[50];
   16279 
   16280    *decode_OK = False;
   16281 
   16282    Long   delta = deltaIN;
   16283    UChar  opc   = getUChar(delta);
   16284    delta++;
   16285    switch (opc) {
   16286 
   16287    case 0x0F:
   16288       /* 66 0F 3A 0F = PALIGNR -- Packed Align Right (XMM) */
   16289       if (have66noF2noF3(pfx)
   16290           && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
   16291          IRTemp sV  = newTemp(Ity_V128);
   16292          IRTemp dV  = newTemp(Ity_V128);
   16293 
   16294          modrm = getUChar(delta);
   16295          assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   16296 
   16297          if (epartIsReg(modrm)) {
   16298             assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   16299             d64 = (Long)getUChar(delta+1);
   16300             delta += 1+1;
   16301             DIP("palignr $%lld,%s,%s\n", d64,
   16302                                        nameXMMReg(eregOfRexRM(pfx,modrm)),
   16303                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
   16304          } else {
   16305             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
   16306             gen_SEGV_if_not_16_aligned( addr );
   16307             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   16308             d64 = (Long)getUChar(delta+alen);
   16309             delta += alen+1;
   16310             DIP("palignr $%lld,%s,%s\n", d64,
   16311                                        dis_buf,
   16312                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
   16313          }
   16314 
   16315          IRTemp res = math_PALIGNR_XMM( sV, dV, d64 );
   16316          putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
   16317          goto decode_success;
   16318       }
   16319       /* 0F 3A 0F = PALIGNR -- Packed Align Right (MMX) */
   16320       if (haveNo66noF2noF3(pfx) && sz == 4) {
   16321          IRTemp sV  = newTemp(Ity_I64);
   16322          IRTemp dV  = newTemp(Ity_I64);
   16323          IRTemp res = newTemp(Ity_I64);
   16324 
   16325          modrm = getUChar(delta);
   16326          do_MMX_preamble();
   16327          assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   16328 
   16329          if (epartIsReg(modrm)) {
   16330             assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   16331             d64 = (Long)getUChar(delta+1);
   16332             delta += 1+1;
   16333             DIP("palignr $%lld,%s,%s\n",  d64,
   16334                                         nameMMXReg(eregLO3ofRM(modrm)),
   16335                                         nameMMXReg(gregLO3ofRM(modrm)));
   16336          } else {
   16337             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
   16338             assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   16339             d64 = (Long)getUChar(delta+alen);
   16340             delta += alen+1;
   16341             DIP("palignr $%lld%s,%s\n", d64,
   16342                                       dis_buf,
   16343                                       nameMMXReg(gregLO3ofRM(modrm)));
   16344          }
   16345 
   16346          if (d64 == 0) {
   16347             assign( res, mkexpr(sV) );
   16348          }
   16349          else if (d64 >= 1 && d64 <= 7) {
   16350             assign(res,
   16351                    binop(Iop_Or64,
   16352                          binop(Iop_Shr64, mkexpr(sV), mkU8(8*d64)),
   16353                          binop(Iop_Shl64, mkexpr(dV), mkU8(8*(8-d64))
   16354                         )));
   16355          }
   16356          else if (d64 == 8) {
   16357            assign( res, mkexpr(dV) );
   16358          }
   16359          else if (d64 >= 9 && d64 <= 15) {
   16360             assign( res, binop(Iop_Shr64, mkexpr(dV), mkU8(8*(d64-8))) );
   16361          }
   16362          else if (d64 >= 16 && d64 <= 255) {
   16363             assign( res, mkU64(0) );
   16364          }
   16365          else
   16366             vassert(0);
   16367 
   16368          putMMXReg( gregLO3ofRM(modrm), mkexpr(res) );
   16369          goto decode_success;
   16370       }
   16371       break;
   16372 
   16373    default:
   16374       break;
   16375 
   16376    }
   16377 
   16378   //decode_failure:
   16379    *decode_OK = False;
   16380    return deltaIN;
   16381 
   16382   decode_success:
   16383    *decode_OK = True;
   16384    return delta;
   16385 }
   16386 
   16387 
   16388 /*------------------------------------------------------------*/
   16389 /*---                                                      ---*/
   16390 /*--- Top-level SSE4: dis_ESC_0F__SSE4                     ---*/
   16391 /*---                                                      ---*/
   16392 /*------------------------------------------------------------*/
   16393 
/* Decoder for the F3-prefixed, 0F-escape scalar bit-count instructions:
   POPCNT (0F B8), TZCNT (0F BC) and LZCNT (0F BD).  On a successful
   decode, *decode_OK is set to True and the updated code offset is
   returned; otherwise *decode_OK remains False and deltaIN is returned
   unchanged so the caller can try other decoders. */
__attribute__((noinline))
static
Long dis_ESC_0F__SSE4 ( Bool* decode_OK,
                        const VexArchInfo* archinfo,
                        const VexAbiInfo* vbi,
                        Prefix pfx, Int sz, Long deltaIN )
{
   IRTemp addr  = IRTemp_INVALID;
   IRType ty    = Ity_INVALID;
   UChar  modrm = 0;
   Int    alen  = 0;
   HChar  dis_buf[50];

   *decode_OK = False;

   /* delta tracks the current decode offset; advanced past each
      consumed byte (opcode, modrm, amode). */
   Long   delta = deltaIN;
   UChar  opc   = getUChar(delta);
   delta++;
   switch (opc) {

   case 0xB8:
      /* F3 0F B8  = POPCNT{W,L,Q}
         Count the number of 1 bits in a register
      */
      if (haveF3noF2(pfx) /* so both 66 and REX.W are possibilities */
          && (sz == 2 || sz == 4 || sz == 8)) {
         /*IRType*/ ty  = szToITy(sz);
         IRTemp     src = newTemp(ty);
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            /* Register source: E part of modrm. */
            assign(src, getIRegE(sz, pfx, modrm));
            delta += 1;
            DIP("popcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
                nameIRegG(sz, pfx, modrm));
         } else {
            /* Memory source. */
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0);
            assign(src, loadLE(ty, mkexpr(addr)));
            delta += alen;
            DIP("popcnt%c %s, %s\n", nameISize(sz), dis_buf,
                nameIRegG(sz, pfx, modrm));
         }

         IRTemp result = gen_POPCOUNT(ty, src);
         putIRegG(sz, pfx, modrm, mkexpr(result));

         // Update flags.  This is pretty lame .. perhaps can do better
         // if this turns out to be performance critical.
         // O S A C P are cleared.  Z is set if SRC == 0.
         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
         stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
         stmt( IRStmt_Put( OFFB_CC_DEP1,
               binop(Iop_Shl64,
                     unop(Iop_1Uto64,
                          binop(Iop_CmpEQ64,
                                widenUto64(mkexpr(src)),
                                mkU64(0))),
                     mkU8(AMD64G_CC_SHIFT_Z))));

         goto decode_success;
      }
      break;

   case 0xBC:
      /* F3 0F BC -- TZCNT (count trailing zeroes.  A BMI extension,
         which we can only decode if we're sure this is a BMI1 capable cpu
         that supports TZCNT, since otherwise it's BSF, which behaves
         differently on zero source.  */
      if (haveF3noF2(pfx) /* so both 66 and 48 are possibilities */
          && (sz == 2 || sz == 4 || sz == 8)
          && 0 != (archinfo->hwcaps & VEX_HWCAPS_AMD64_BMI)) {
         /*IRType*/ ty  = szToITy(sz);
         IRTemp     src = newTemp(ty);
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            assign(src, getIRegE(sz, pfx, modrm));
            delta += 1;
            DIP("tzcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
                nameIRegG(sz, pfx, modrm));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0);
            assign(src, loadLE(ty, mkexpr(addr)));
            delta += alen;
            DIP("tzcnt%c %s, %s\n", nameISize(sz), dis_buf,
                nameIRegG(sz, pfx, modrm));
         }

         IRTemp res = gen_TZCNT(ty, src);
         putIRegG(sz, pfx, modrm, mkexpr(res));

         // Update flags.  This is pretty lame .. perhaps can do better
         // if this turns out to be performance critical.
         // O S A P are cleared.  Z is set if RESULT == 0.
         // C is set if SRC is zero.
         IRTemp src64 = newTemp(Ity_I64);
         IRTemp res64 = newTemp(Ity_I64);
         assign(src64, widenUto64(mkexpr(src)));
         assign(res64, widenUto64(mkexpr(res)));

         /* Assemble the new Z and C bits into a flags-thunk DEP1
            value; all other flag bits are left at zero. */
         IRTemp oszacp = newTemp(Ity_I64);
         assign(
            oszacp,
            binop(Iop_Or64,
                  binop(Iop_Shl64,
                        unop(Iop_1Uto64,
                             binop(Iop_CmpEQ64, mkexpr(res64), mkU64(0))),
                        mkU8(AMD64G_CC_SHIFT_Z)),
                  binop(Iop_Shl64,
                        unop(Iop_1Uto64,
                             binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0))),
                        mkU8(AMD64G_CC_SHIFT_C))
            )
         );

         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
         stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
         stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));

         goto decode_success;
      }
      break;

   case 0xBD:
      /* F3 0F BD -- LZCNT (count leading zeroes.  An AMD extension,
         which we can only decode if we're sure this is an AMD cpu
         that supports LZCNT, since otherwise it's BSR, which behaves
         differently.  Bizarrely, my Sandy Bridge also accepts these
         instructions but produces different results. */
      if (haveF3noF2(pfx) /* so both 66 and 48 are possibilities */
          && (sz == 2 || sz == 4 || sz == 8)
          && 0 != (archinfo->hwcaps & VEX_HWCAPS_AMD64_LZCNT)) {
         /*IRType*/ ty  = szToITy(sz);
         IRTemp     src = newTemp(ty);
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            assign(src, getIRegE(sz, pfx, modrm));
            delta += 1;
            DIP("lzcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
                nameIRegG(sz, pfx, modrm));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0);
            assign(src, loadLE(ty, mkexpr(addr)));
            delta += alen;
            DIP("lzcnt%c %s, %s\n", nameISize(sz), dis_buf,
                nameIRegG(sz, pfx, modrm));
         }

         IRTemp res = gen_LZCNT(ty, src);
         putIRegG(sz, pfx, modrm, mkexpr(res));

         // Update flags.  This is pretty lame .. perhaps can do better
         // if this turns out to be performance critical.
         // O S A P are cleared.  Z is set if RESULT == 0.
         // C is set if SRC is zero.
         IRTemp src64 = newTemp(Ity_I64);
         IRTemp res64 = newTemp(Ity_I64);
         assign(src64, widenUto64(mkexpr(src)));
         assign(res64, widenUto64(mkexpr(res)));

         /* Same Z/C assembly as the TZCNT case above. */
         IRTemp oszacp = newTemp(Ity_I64);
         assign(
            oszacp,
            binop(Iop_Or64,
                  binop(Iop_Shl64,
                        unop(Iop_1Uto64,
                             binop(Iop_CmpEQ64, mkexpr(res64), mkU64(0))),
                        mkU8(AMD64G_CC_SHIFT_Z)),
                  binop(Iop_Shl64,
                        unop(Iop_1Uto64,
                             binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0))),
                        mkU8(AMD64G_CC_SHIFT_C))
            )
         );

         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
         stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
         stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));

         goto decode_success;
      }
      break;

   default:
      break;

   }

  //decode_failure:
   *decode_OK = False;
   return deltaIN;

  decode_success:
   *decode_OK = True;
   return delta;
}
   16591 
   16592 
   16593 /*------------------------------------------------------------*/
   16594 /*---                                                      ---*/
   16595 /*--- Top-level SSE4: dis_ESC_0F38__SSE4                   ---*/
   16596 /*---                                                      ---*/
   16597 /*------------------------------------------------------------*/
   16598 
   16599 static IRTemp math_PBLENDVB_128 ( IRTemp vecE, IRTemp vecG,
   16600                                   IRTemp vec0/*controlling mask*/,
   16601                                   UInt gran, IROp opSAR )
   16602 {
   16603    /* The tricky bit is to convert vec0 into a suitable mask, by
   16604       copying the most significant bit of each lane into all positions
   16605       in the lane. */
   16606    IRTemp sh = newTemp(Ity_I8);
   16607    assign(sh, mkU8(8 * gran - 1));
   16608 
   16609    IRTemp mask = newTemp(Ity_V128);
   16610    assign(mask, binop(opSAR, mkexpr(vec0), mkexpr(sh)));
   16611 
   16612    IRTemp notmask = newTemp(Ity_V128);
   16613    assign(notmask, unop(Iop_NotV128, mkexpr(mask)));
   16614 
   16615    IRTemp res = newTemp(Ity_V128);
   16616    assign(res,  binop(Iop_OrV128,
   16617                       binop(Iop_AndV128, mkexpr(vecE), mkexpr(mask)),
   16618                       binop(Iop_AndV128, mkexpr(vecG), mkexpr(notmask))));
   16619    return res;
   16620 }
   16621 
   16622 static IRTemp math_PBLENDVB_256 ( IRTemp vecE, IRTemp vecG,
   16623                                   IRTemp vec0/*controlling mask*/,
   16624                                   UInt gran, IROp opSAR128 )
   16625 {
   16626    /* The tricky bit is to convert vec0 into a suitable mask, by
   16627       copying the most significant bit of each lane into all positions
   16628       in the lane. */
   16629    IRTemp sh = newTemp(Ity_I8);
   16630    assign(sh, mkU8(8 * gran - 1));
   16631 
   16632    IRTemp vec0Hi = IRTemp_INVALID;
   16633    IRTemp vec0Lo = IRTemp_INVALID;
   16634    breakupV256toV128s( vec0, &vec0Hi, &vec0Lo );
   16635 
   16636    IRTemp mask = newTemp(Ity_V256);
   16637    assign(mask, binop(Iop_V128HLtoV256,
   16638                       binop(opSAR128, mkexpr(vec0Hi), mkexpr(sh)),
   16639                       binop(opSAR128, mkexpr(vec0Lo), mkexpr(sh))));
   16640 
   16641    IRTemp notmask = newTemp(Ity_V256);
   16642    assign(notmask, unop(Iop_NotV256, mkexpr(mask)));
   16643 
   16644    IRTemp res = newTemp(Ity_V256);
   16645    assign(res,  binop(Iop_OrV256,
   16646                       binop(Iop_AndV256, mkexpr(vecE), mkexpr(mask)),
   16647                       binop(Iop_AndV256, mkexpr(vecG), mkexpr(notmask))));
   16648    return res;
   16649 }
   16650 
   16651 static Long dis_VBLENDV_128 ( const VexAbiInfo* vbi, Prefix pfx, Long delta,
   16652                               const HChar *name, UInt gran, IROp opSAR )
   16653 {
   16654    IRTemp addr   = IRTemp_INVALID;
   16655    Int    alen   = 0;
   16656    HChar  dis_buf[50];
   16657    UChar  modrm  = getUChar(delta);
   16658    UInt   rG     = gregOfRexRM(pfx, modrm);
   16659    UInt   rV     = getVexNvvvv(pfx);
   16660    UInt   rIS4   = 0xFF; /* invalid */
   16661    IRTemp vecE   = newTemp(Ity_V128);
   16662    IRTemp vecV   = newTemp(Ity_V128);
   16663    IRTemp vecIS4 = newTemp(Ity_V128);
   16664    if (epartIsReg(modrm)) {
   16665       delta++;
   16666       UInt rE = eregOfRexRM(pfx, modrm);
   16667       assign(vecE, getXMMReg(rE));
   16668       UChar ib = getUChar(delta);
   16669       rIS4 = (ib >> 4) & 0xF;
   16670       DIP("%s %s,%s,%s,%s\n",
   16671           name, nameXMMReg(rIS4), nameXMMReg(rE),
   16672           nameXMMReg(rV), nameXMMReg(rG));
   16673    } else {
   16674       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   16675       delta += alen;
   16676       assign(vecE, loadLE(Ity_V128, mkexpr(addr)));
   16677       UChar ib = getUChar(delta);
   16678       rIS4 = (ib >> 4) & 0xF;
   16679       DIP("%s %s,%s,%s,%s\n",
   16680           name, nameXMMReg(rIS4), dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   16681    }
   16682    delta++;
   16683    assign(vecV,   getXMMReg(rV));
   16684    assign(vecIS4, getXMMReg(rIS4));
   16685    IRTemp res = math_PBLENDVB_128( vecE, vecV, vecIS4, gran, opSAR );
   16686    putYMMRegLoAndZU( rG, mkexpr(res) );
   16687    return delta;
   16688 }
   16689 
   16690 static Long dis_VBLENDV_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta,
   16691                               const HChar *name, UInt gran, IROp opSAR128 )
   16692 {
   16693    IRTemp addr   = IRTemp_INVALID;
   16694    Int    alen   = 0;
   16695    HChar  dis_buf[50];
   16696    UChar  modrm  = getUChar(delta);
   16697    UInt   rG     = gregOfRexRM(pfx, modrm);
   16698    UInt   rV     = getVexNvvvv(pfx);
   16699    UInt   rIS4   = 0xFF; /* invalid */
   16700    IRTemp vecE   = newTemp(Ity_V256);
   16701    IRTemp vecV   = newTemp(Ity_V256);
   16702    IRTemp vecIS4 = newTemp(Ity_V256);
   16703    if (epartIsReg(modrm)) {
   16704       delta++;
   16705       UInt rE = eregOfRexRM(pfx, modrm);
   16706       assign(vecE, getYMMReg(rE));
   16707       UChar ib = getUChar(delta);
   16708       rIS4 = (ib >> 4) & 0xF;
   16709       DIP("%s %s,%s,%s,%s\n",
   16710           name, nameYMMReg(rIS4), nameYMMReg(rE),
   16711           nameYMMReg(rV), nameYMMReg(rG));
   16712    } else {
   16713       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   16714       delta += alen;
   16715       assign(vecE, loadLE(Ity_V256, mkexpr(addr)));
   16716       UChar ib = getUChar(delta);
   16717       rIS4 = (ib >> 4) & 0xF;
   16718       DIP("%s %s,%s,%s,%s\n",
   16719           name, nameYMMReg(rIS4), dis_buf, nameYMMReg(rV), nameYMMReg(rG));
   16720    }
   16721    delta++;
   16722    assign(vecV,   getYMMReg(rV));
   16723    assign(vecIS4, getYMMReg(rIS4));
   16724    IRTemp res = math_PBLENDVB_256( vecE, vecV, vecIS4, gran, opSAR128 );
   16725    putYMMReg( rG, mkexpr(res) );
   16726    return delta;
   16727 }
   16728 
/* Shared flag-setting tail for PTEST / VTESTPS / VTESTPD.  andV holds
   vecE & vecG and andnV holds vecE & ~vecG (both 128 bits).  sign
   selects which bits are tested: 0 => all bits (PTEST), 32 => bits 31
   and 63 of each 64-bit half (VTESTPS), 64 => bit 63 only (VTESTPD).
   Emits IR that sets the rflags thunk to COPY with Z and C as
   described below, and O/S/A/P cleared. */
static void finish_xTESTy ( IRTemp andV, IRTemp andnV, Int sign )
{
   /* Set Z=1 iff (vecE & vecG) == 0
      Set C=1 iff (vecE & not vecG) == 0
   */

   /* andV, andnV:  vecE & vecG,  vecE and not(vecG) */

   /* andV resp. andnV, reduced to 64-bit values, by or-ing the top
      and bottom 64-bits together.  It relies on this trick:

      InterleaveLO64x2([a,b],[c,d]) == [b,d]    hence

      InterleaveLO64x2([a,b],[a,b]) == [b,b]    and similarly
      InterleaveHI64x2([a,b],[a,b]) == [a,a]

      and so the OR of the above 2 exprs produces
      [a OR b, a OR b], from which we simply take the lower half.
   */
   IRTemp and64  = newTemp(Ity_I64);
   IRTemp andn64 = newTemp(Ity_I64);

   assign(and64,
          unop(Iop_V128to64,
               binop(Iop_OrV128,
                     binop(Iop_InterleaveLO64x2,
                           mkexpr(andV), mkexpr(andV)),
                     binop(Iop_InterleaveHI64x2,
                           mkexpr(andV), mkexpr(andV)))));

   assign(andn64,
          unop(Iop_V128to64,
               binop(Iop_OrV128,
                     binop(Iop_InterleaveLO64x2,
                           mkexpr(andnV), mkexpr(andnV)),
                     binop(Iop_InterleaveHI64x2,
                           mkexpr(andnV), mkexpr(andnV)))));

   /* z64/c64 end up as all-ones when the corresponding flag should be
      set, all-zeroes otherwise; the flag bit is masked out at the end. */
   IRTemp z64 = newTemp(Ity_I64);
   IRTemp c64 = newTemp(Ity_I64);
   if (sign == 64) {
      /* When only interested in the most significant bit, just shift
         arithmetically right and negate.  */
      assign(z64,
             unop(Iop_Not64,
                  binop(Iop_Sar64, mkexpr(and64), mkU8(63))));

      assign(c64,
             unop(Iop_Not64,
                  binop(Iop_Sar64, mkexpr(andn64), mkU8(63))));
   } else {
      if (sign == 32) {
         /* When interested in bit 31 and bit 63, mask those bits and
            fallthrough into the PTEST handling.  */
         IRTemp t0 = newTemp(Ity_I64);
         IRTemp t1 = newTemp(Ity_I64);
         IRTemp t2 = newTemp(Ity_I64);
         assign(t0, mkU64(0x8000000080000000ULL));
         assign(t1, binop(Iop_And64, mkexpr(and64), mkexpr(t0)));
         assign(t2, binop(Iop_And64, mkexpr(andn64), mkexpr(t0)));
         /* Rebind the local names to the masked temps; the unmasked
            originals are not used again below. */
         and64 = t1;
         andn64 = t2;
      }
      /* Now convert and64, andn64 to all-zeroes or all-1s, so we can
         slice out the Z and C bits conveniently.  We use the standard
         trick all-zeroes -> all-zeroes, anything-else -> all-ones
         done by "(x | -x) >>s (word-size - 1)".
      */
      assign(z64,
             unop(Iop_Not64,
                  binop(Iop_Sar64,
                        binop(Iop_Or64,
                              binop(Iop_Sub64, mkU64(0), mkexpr(and64)),
                                    mkexpr(and64)), mkU8(63))));

      assign(c64,
             unop(Iop_Not64,
                  binop(Iop_Sar64,
                        binop(Iop_Or64,
                              binop(Iop_Sub64, mkU64(0), mkexpr(andn64)),
                                    mkexpr(andn64)), mkU8(63))));
   }

   /* And finally, slice out the Z and C flags and set the flags
      thunk to COPY for them.  OSAP are set to zero. */
   IRTemp newOSZACP = newTemp(Ity_I64);
   assign(newOSZACP,
          binop(Iop_Or64,
                binop(Iop_And64, mkexpr(z64), mkU64(AMD64G_CC_MASK_Z)),
                binop(Iop_And64, mkexpr(c64), mkU64(AMD64G_CC_MASK_C))));

   stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(newOSZACP)));
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
}
   16825 
   16826 
   16827 /* Handles 128 bit versions of PTEST, VTESTPS or VTESTPD.
   16828    sign is 0 for PTEST insn, 32 for VTESTPS and 64 for VTESTPD. */
   16829 static Long dis_xTESTy_128 ( const VexAbiInfo* vbi, Prefix pfx,
   16830                              Long delta, Bool isAvx, Int sign )
   16831 {
   16832    IRTemp addr   = IRTemp_INVALID;
   16833    Int    alen   = 0;
   16834    HChar  dis_buf[50];
   16835    UChar  modrm  = getUChar(delta);
   16836    UInt   rG     = gregOfRexRM(pfx, modrm);
   16837    IRTemp vecE = newTemp(Ity_V128);
   16838    IRTemp vecG = newTemp(Ity_V128);
   16839 
   16840    if ( epartIsReg(modrm) ) {
   16841       UInt rE = eregOfRexRM(pfx, modrm);
   16842       assign(vecE, getXMMReg(rE));
   16843       delta += 1;
   16844       DIP( "%s%stest%s %s,%s\n",
   16845            isAvx ? "v" : "", sign == 0 ? "p" : "",
   16846            sign == 0 ? "" : sign == 32 ? "ps" : "pd",
   16847            nameXMMReg(rE), nameXMMReg(rG) );
   16848    } else {
   16849       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   16850       if (!isAvx)
   16851          gen_SEGV_if_not_16_aligned( addr );
   16852       assign(vecE, loadLE( Ity_V128, mkexpr(addr) ));
   16853       delta += alen;
   16854       DIP( "%s%stest%s %s,%s\n",
   16855            isAvx ? "v" : "", sign == 0 ? "p" : "",
   16856            sign == 0 ? "" : sign == 32 ? "ps" : "pd",
   16857            dis_buf, nameXMMReg(rG) );
   16858    }
   16859 
   16860    assign(vecG, getXMMReg(rG));
   16861 
   16862    /* Set Z=1 iff (vecE & vecG) == 0
   16863       Set C=1 iff (vecE & not vecG) == 0
   16864    */
   16865 
   16866    /* andV, andnV:  vecE & vecG,  vecE and not(vecG) */
   16867    IRTemp andV  = newTemp(Ity_V128);
   16868    IRTemp andnV = newTemp(Ity_V128);
   16869    assign(andV,  binop(Iop_AndV128, mkexpr(vecE), mkexpr(vecG)));
   16870    assign(andnV, binop(Iop_AndV128,
   16871                        mkexpr(vecE),
   16872                        binop(Iop_XorV128, mkexpr(vecG),
   16873                                           mkV128(0xFFFF))));
   16874 
   16875    finish_xTESTy ( andV, andnV, sign );
   16876    return delta;
   16877 }
   16878 
   16879 
   16880 /* Handles 256 bit versions of PTEST, VTESTPS or VTESTPD.
   16881    sign is 0 for PTEST insn, 32 for VTESTPS and 64 for VTESTPD. */
   16882 static Long dis_xTESTy_256 ( const VexAbiInfo* vbi, Prefix pfx,
   16883                              Long delta, Int sign )
   16884 {
   16885    IRTemp addr   = IRTemp_INVALID;
   16886    Int    alen   = 0;
   16887    HChar  dis_buf[50];
   16888    UChar  modrm  = getUChar(delta);
   16889    UInt   rG     = gregOfRexRM(pfx, modrm);
   16890    IRTemp vecE   = newTemp(Ity_V256);
   16891    IRTemp vecG   = newTemp(Ity_V256);
   16892 
   16893    if ( epartIsReg(modrm) ) {
   16894       UInt rE = eregOfRexRM(pfx, modrm);
   16895       assign(vecE, getYMMReg(rE));
   16896       delta += 1;
   16897       DIP( "v%stest%s %s,%s\n", sign == 0 ? "p" : "",
   16898            sign == 0 ? "" : sign == 32 ? "ps" : "pd",
   16899            nameYMMReg(rE), nameYMMReg(rG) );
   16900    } else {
   16901       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   16902       assign(vecE, loadLE( Ity_V256, mkexpr(addr) ));
   16903       delta += alen;
   16904       DIP( "v%stest%s %s,%s\n", sign == 0 ? "p" : "",
   16905            sign == 0 ? "" : sign == 32 ? "ps" : "pd",
   16906            dis_buf, nameYMMReg(rG) );
   16907    }
   16908 
   16909    assign(vecG, getYMMReg(rG));
   16910 
   16911    /* Set Z=1 iff (vecE & vecG) == 0
   16912       Set C=1 iff (vecE & not vecG) == 0
   16913    */
   16914 
   16915    /* andV, andnV:  vecE & vecG,  vecE and not(vecG) */
   16916    IRTemp andV  = newTemp(Ity_V256);
   16917    IRTemp andnV = newTemp(Ity_V256);
   16918    assign(andV,  binop(Iop_AndV256, mkexpr(vecE), mkexpr(vecG)));
   16919    assign(andnV, binop(Iop_AndV256,
   16920                        mkexpr(vecE), unop(Iop_NotV256, mkexpr(vecG))));
   16921 
   16922    IRTemp andVhi  = IRTemp_INVALID;
   16923    IRTemp andVlo  = IRTemp_INVALID;
   16924    IRTemp andnVhi = IRTemp_INVALID;
   16925    IRTemp andnVlo = IRTemp_INVALID;
   16926    breakupV256toV128s( andV, &andVhi, &andVlo );
   16927    breakupV256toV128s( andnV, &andnVhi, &andnVlo );
   16928 
   16929    IRTemp andV128  = newTemp(Ity_V128);
   16930    IRTemp andnV128 = newTemp(Ity_V128);
   16931    assign( andV128, binop( Iop_OrV128, mkexpr(andVhi), mkexpr(andVlo) ) );
   16932    assign( andnV128, binop( Iop_OrV128, mkexpr(andnVhi), mkexpr(andnVlo) ) );
   16933 
   16934    finish_xTESTy ( andV128, andnV128, sign );
   16935    return delta;
   16936 }
   16937 
   16938 
   16939 /* Handles 128 bit versions of PMOVZXBW and PMOVSXBW. */
   16940 static Long dis_PMOVxXBW_128 ( const VexAbiInfo* vbi, Prefix pfx,
   16941                                Long delta, Bool isAvx, Bool xIsZ )
   16942 {
   16943    IRTemp addr   = IRTemp_INVALID;
   16944    Int    alen   = 0;
   16945    HChar  dis_buf[50];
   16946    IRTemp srcVec = newTemp(Ity_V128);
   16947    UChar  modrm  = getUChar(delta);
   16948    const HChar* mbV    = isAvx ? "v" : "";
   16949    const HChar  how    = xIsZ ? 'z' : 's';
   16950    UInt   rG     = gregOfRexRM(pfx, modrm);
   16951    if ( epartIsReg(modrm) ) {
   16952       UInt rE = eregOfRexRM(pfx, modrm);
   16953       assign( srcVec, getXMMReg(rE) );
   16954       delta += 1;
   16955       DIP( "%spmov%cxbw %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
   16956    } else {
   16957       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   16958       assign( srcVec,
   16959               unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
   16960       delta += alen;
   16961       DIP( "%spmov%cxbw %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
   16962    }
   16963 
   16964    IRExpr* res
   16965       = xIsZ /* do math for either zero or sign extend */
   16966         ? binop( Iop_InterleaveLO8x16,
   16967                  IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) )
   16968         : binop( Iop_SarN16x8,
   16969                  binop( Iop_ShlN16x8,
   16970                         binop( Iop_InterleaveLO8x16,
   16971                                IRExpr_Const( IRConst_V128(0) ),
   16972                                mkexpr(srcVec) ),
   16973                         mkU8(8) ),
   16974                  mkU8(8) );
   16975 
   16976    (isAvx ? putYMMRegLoAndZU : putXMMReg) ( rG, res );
   16977 
   16978    return delta;
   16979 }
   16980 
   16981 
   16982 /* Handles 256 bit versions of PMOVZXBW and PMOVSXBW. */
   16983 static Long dis_PMOVxXBW_256 ( const VexAbiInfo* vbi, Prefix pfx,
   16984                                Long delta, Bool xIsZ )
   16985 {
   16986    IRTemp addr   = IRTemp_INVALID;
   16987    Int    alen   = 0;
   16988    HChar  dis_buf[50];
   16989    IRTemp srcVec = newTemp(Ity_V128);
   16990    UChar  modrm  = getUChar(delta);
   16991    UChar  how    = xIsZ ? 'z' : 's';
   16992    UInt   rG     = gregOfRexRM(pfx, modrm);
   16993    if ( epartIsReg(modrm) ) {
   16994       UInt rE = eregOfRexRM(pfx, modrm);
   16995       assign( srcVec, getXMMReg(rE) );
   16996       delta += 1;
   16997       DIP( "vpmov%cxbw %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) );
   16998    } else {
   16999       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   17000       assign( srcVec, loadLE( Ity_V128, mkexpr(addr) ) );
   17001       delta += alen;
   17002       DIP( "vpmov%cxbw %s,%s\n", how, dis_buf, nameYMMReg(rG) );
   17003    }
   17004 
   17005    /* First do zero extend.  */
   17006    IRExpr* res
   17007       = binop( Iop_V128HLtoV256,
   17008                binop( Iop_InterleaveHI8x16,
   17009                       IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ),
   17010                binop( Iop_InterleaveLO8x16,
   17011                       IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ) );
   17012    /* And if needed sign extension as well.  */
   17013    if (!xIsZ)
   17014       res = binop( Iop_SarN16x16,
   17015                    binop( Iop_ShlN16x16, res, mkU8(8) ), mkU8(8) );
   17016 
   17017    putYMMReg ( rG, res );
   17018 
   17019    return delta;
   17020 }
   17021 
   17022 
   17023 static Long dis_PMOVxXWD_128 ( const VexAbiInfo* vbi, Prefix pfx,
   17024                                Long delta, Bool isAvx, Bool xIsZ )
   17025 {
   17026    IRTemp addr   = IRTemp_INVALID;
   17027    Int    alen   = 0;
   17028    HChar  dis_buf[50];
   17029    IRTemp srcVec = newTemp(Ity_V128);
   17030    UChar  modrm  = getUChar(delta);
   17031    const HChar* mbV    = isAvx ? "v" : "";
   17032    const HChar  how    = xIsZ ? 'z' : 's';
   17033    UInt   rG     = gregOfRexRM(pfx, modrm);
   17034 
   17035    if ( epartIsReg(modrm) ) {
   17036       UInt rE = eregOfRexRM(pfx, modrm);
   17037       assign( srcVec, getXMMReg(rE) );
   17038       delta += 1;
   17039       DIP( "%spmov%cxwd %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
   17040    } else {
   17041       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   17042       assign( srcVec,
   17043               unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
   17044       delta += alen;
   17045       DIP( "%spmov%cxwd %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
   17046    }
   17047 
   17048    IRExpr* res
   17049       = binop( Iop_InterleaveLO16x8,
   17050                IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) );
   17051    if (!xIsZ)
   17052       res = binop(Iop_SarN32x4,
   17053                   binop(Iop_ShlN32x4, res, mkU8(16)), mkU8(16));
   17054 
   17055    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   17056       ( gregOfRexRM(pfx, modrm), res );
   17057 
   17058    return delta;
   17059 }
   17060 
   17061 
   17062 static Long dis_PMOVxXWD_256 ( const VexAbiInfo* vbi, Prefix pfx,
   17063                                Long delta, Bool xIsZ )
   17064 {
   17065    IRTemp addr   = IRTemp_INVALID;
   17066    Int    alen   = 0;
   17067    HChar  dis_buf[50];
   17068    IRTemp srcVec = newTemp(Ity_V128);
   17069    UChar  modrm  = getUChar(delta);
   17070    UChar  how    = xIsZ ? 'z' : 's';
   17071    UInt   rG     = gregOfRexRM(pfx, modrm);
   17072 
   17073    if ( epartIsReg(modrm) ) {
   17074       UInt rE = eregOfRexRM(pfx, modrm);
   17075       assign( srcVec, getXMMReg(rE) );
   17076       delta += 1;
   17077       DIP( "vpmov%cxwd %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) );
   17078    } else {
   17079       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   17080       assign( srcVec, loadLE( Ity_V128, mkexpr(addr) ) );
   17081       delta += alen;
   17082       DIP( "vpmov%cxwd %s,%s\n", how, dis_buf, nameYMMReg(rG) );
   17083    }
   17084 
   17085    IRExpr* res
   17086       = binop( Iop_V128HLtoV256,
   17087                binop( Iop_InterleaveHI16x8,
   17088                       IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ),
   17089                binop( Iop_InterleaveLO16x8,
   17090                       IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ) );
   17091    if (!xIsZ)
   17092       res = binop(Iop_SarN32x8,
   17093                   binop(Iop_ShlN32x8, res, mkU8(16)), mkU8(16));
   17094 
   17095    putYMMReg ( rG, res );
   17096 
   17097    return delta;
   17098 }
   17099 
   17100 
   17101 static Long dis_PMOVSXWQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
   17102                                Long delta, Bool isAvx )
   17103 {
   17104    IRTemp addr     = IRTemp_INVALID;
   17105    Int    alen     = 0;
   17106    HChar  dis_buf[50];
   17107    IRTemp srcBytes = newTemp(Ity_I32);
   17108    UChar  modrm    = getUChar(delta);
   17109    const HChar* mbV = isAvx ? "v" : "";
   17110    UInt   rG       = gregOfRexRM(pfx, modrm);
   17111 
   17112    if ( epartIsReg( modrm ) ) {
   17113       UInt rE = eregOfRexRM(pfx, modrm);
   17114       assign( srcBytes, getXMMRegLane32( rE, 0 ) );
   17115       delta += 1;
   17116       DIP( "%spmovsxwq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
   17117    } else {
   17118       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   17119       assign( srcBytes, loadLE( Ity_I32, mkexpr(addr) ) );
   17120       delta += alen;
   17121       DIP( "%spmovsxwq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
   17122    }
   17123 
   17124    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   17125       ( rG, binop( Iop_64HLtoV128,
   17126                    unop( Iop_16Sto64,
   17127                          unop( Iop_32HIto16, mkexpr(srcBytes) ) ),
   17128                    unop( Iop_16Sto64,
   17129                          unop( Iop_32to16, mkexpr(srcBytes) ) ) ) );
   17130    return delta;
   17131 }
   17132 
   17133 
/* Handles the 256-bit form of PMOVSXWQ: sign-extend the four lowest
   16-bit lanes of the source to four 64-bit lanes. */
static Long dis_PMOVSXWQ_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta )
{
   IRTemp addr     = IRTemp_INVALID;
   Int    alen     = 0;
   HChar  dis_buf[50];
   IRTemp srcBytes = newTemp(Ity_I64);  /* the four source words */
   UChar  modrm    = getUChar(delta);
   UInt   rG       = gregOfRexRM(pfx, modrm);
   IRTemp s3, s2, s1, s0;
   s3 = s2 = s1 = s0 = IRTemp_INVALID;

   if ( epartIsReg( modrm ) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcBytes, getXMMRegLane64( rE, 0 ) );
      delta += 1;
      DIP( "vpmovsxwq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) );
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcBytes, loadLE( Ity_I64, mkexpr(addr) ) );
      delta += alen;
      DIP( "vpmovsxwq %s,%s\n", dis_buf, nameYMMReg(rG) );
   }

   /* Split the 64-bit source into its four 16-bit lanes (s3 is the
      most significant) and sign-extend each lane to 64 bits. */
   breakup64to16s( srcBytes, &s3, &s2, &s1, &s0 );
   putYMMReg( rG, binop( Iop_V128HLtoV256,
                         binop( Iop_64HLtoV128,
                                unop( Iop_16Sto64, mkexpr(s3) ),
                                unop( Iop_16Sto64, mkexpr(s2) ) ),
                         binop( Iop_64HLtoV128,
                                unop( Iop_16Sto64, mkexpr(s1) ),
                                unop( Iop_16Sto64, mkexpr(s0) ) ) ) );
   return delta;
}
   17167 
   17168 
   17169 static Long dis_PMOVZXWQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
   17170                                Long delta, Bool isAvx )
   17171 {
   17172    IRTemp addr     = IRTemp_INVALID;
   17173    Int    alen     = 0;
   17174    HChar  dis_buf[50];
   17175    IRTemp srcVec = newTemp(Ity_V128);
   17176    UChar  modrm    = getUChar(delta);
   17177    const HChar* mbV = isAvx ? "v" : "";
   17178    UInt   rG       = gregOfRexRM(pfx, modrm);
   17179 
   17180    if ( epartIsReg( modrm ) ) {
   17181       UInt rE = eregOfRexRM(pfx, modrm);
   17182       assign( srcVec, getXMMReg(rE) );
   17183       delta += 1;
   17184       DIP( "%spmovzxwq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
   17185    } else {
   17186       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   17187       assign( srcVec,
   17188               unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) ) ) );
   17189       delta += alen;
   17190       DIP( "%spmovzxwq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
   17191    }
   17192 
   17193    IRTemp zeroVec = newTemp( Ity_V128 );
   17194    assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );
   17195 
   17196    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   17197       ( rG, binop( Iop_InterleaveLO16x8,
   17198                    mkexpr(zeroVec),
   17199                    binop( Iop_InterleaveLO16x8,
   17200                           mkexpr(zeroVec), mkexpr(srcVec) ) ) );
   17201    return delta;
   17202 }
   17203 
   17204 
/* Handles the 256-bit form of PMOVZXWQ: zero-extend the four lowest
   16-bit lanes of the source to four 64-bit lanes. */
static Long dis_PMOVZXWQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta )
{
   IRTemp addr     = IRTemp_INVALID;
   Int    alen     = 0;
   HChar  dis_buf[50];
   IRTemp srcVec = newTemp(Ity_V128);
   UChar  modrm    = getUChar(delta);
   UInt   rG       = gregOfRexRM(pfx, modrm);

   if ( epartIsReg( modrm ) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcVec, getXMMReg(rE) );
      delta += 1;
      DIP( "vpmovzxwq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) );
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcVec,
              unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
      delta += alen;
      DIP( "vpmovzxwq %s,%s\n", dis_buf, nameYMMReg(rG) );
   }

   IRTemp zeroVec = newTemp( Ity_V128 );
   assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );

   /* Interleave with zero twice to take 16-bit lanes to zero-extended
      64-bit lanes; the outer HI variant selects the upper pair of the
      widened words for the upper 128 bits of the result. */
   putYMMReg( rG, binop( Iop_V128HLtoV256,
                         binop( Iop_InterleaveHI16x8,
                                mkexpr(zeroVec),
                                binop( Iop_InterleaveLO16x8,
                                       mkexpr(zeroVec), mkexpr(srcVec) ) ),
                         binop( Iop_InterleaveLO16x8,
                                mkexpr(zeroVec),
                                binop( Iop_InterleaveLO16x8,
                                       mkexpr(zeroVec), mkexpr(srcVec) ) ) ) );
   return delta;
}
   17242 
   17243 
/* Handles 128 bit versions of PMOVZXDQ and PMOVSXDQ: widen the two
   lowest 32-bit lanes of the source to two 64-bit lanes, zero- (xIsZ)
   or sign-extending.  For isAvx the upper YMM lane is zeroed. */
static Long dis_PMOVxXDQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx, Bool xIsZ )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   IRTemp srcI64 = newTemp(Ity_I64);
   IRTemp srcVec = newTemp(Ity_V128);
   UChar  modrm  = getUChar(delta);
   const HChar* mbV = isAvx ? "v" : "";
   const HChar  how = xIsZ ? 'z' : 's';
   UInt   rG     = gregOfRexRM(pfx, modrm);
   /* Compute both srcI64 -- the value to expand -- and srcVec -- same
      thing in a V128, with arbitrary junk in the top 64 bits.  Use
      one or both of them and let iropt clean up afterwards (as
      usual). */
   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcVec, getXMMReg(rE) );
      assign( srcI64, unop(Iop_V128to64, mkexpr(srcVec)) );
      delta += 1;
      DIP( "%spmov%cxdq %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcI64, loadLE(Ity_I64, mkexpr(addr)) );
      assign( srcVec, unop( Iop_64UtoV128, mkexpr(srcI64)) );
      delta += alen;
      DIP( "%spmov%cxdq %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
   }

   /* Zero-extend by interleaving the vector form with zero;
      sign-extend by splitting srcI64 and 32Sto64-ing each half. */
   IRExpr* res
      = xIsZ /* do math for either zero or sign extend */
        ? binop( Iop_InterleaveLO32x4,
                 IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) )
        : binop( Iop_64HLtoV128,
                 unop( Iop_32Sto64,
                       unop( Iop_64HIto32, mkexpr(srcI64) ) ),
                 unop( Iop_32Sto64,
                       unop( Iop_64to32, mkexpr(srcI64) ) ) );

   (isAvx ? putYMMRegLoAndZU : putXMMReg) ( rG, res );

   return delta;
}
   17289 
   17290 
/* Handles 256 bit versions of PMOVZXDQ and PMOVSXDQ: widen the four
   32-bit lanes of the source to four 64-bit lanes, zero- (xIsZ) or
   sign-extending. */
static Long dis_PMOVxXDQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool xIsZ )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   IRTemp srcVec = newTemp(Ity_V128);
   UChar  modrm  = getUChar(delta);
   UChar  how    = xIsZ ? 'z' : 's';
   UInt   rG     = gregOfRexRM(pfx, modrm);
   /* Get the whole 128-bit source into srcVec, either from a register
      or from memory.  (Unlike the 128-bit variant there is no srcI64
      here; the sign-extending case splits srcVec instead.) */
   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcVec, getXMMReg(rE) );
      delta += 1;
      DIP( "vpmov%cxdq %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) );
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcVec, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      DIP( "vpmov%cxdq %s,%s\n", how, dis_buf, nameYMMReg(rG) );
   }

   /* Zero-extend by interleaving with zero; sign-extend by splitting
      the source into its four 32-bit lanes (s3 most significant) and
      32Sto64-ing each. */
   IRExpr* res;
   if (xIsZ)
      res = binop( Iop_V128HLtoV256,
                   binop( Iop_InterleaveHI32x4,
                          IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ),
                   binop( Iop_InterleaveLO32x4,
                          IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ) );
   else {
      IRTemp s3, s2, s1, s0;
      s3 = s2 = s1 = s0 = IRTemp_INVALID;
      breakupV128to32s( srcVec, &s3, &s2, &s1, &s0 );
      res = binop( Iop_V128HLtoV256,
                   binop( Iop_64HLtoV128,
                          unop( Iop_32Sto64, mkexpr(s3) ),
                          unop( Iop_32Sto64, mkexpr(s2) ) ),
                   binop( Iop_64HLtoV128,
                          unop( Iop_32Sto64, mkexpr(s1) ),
                          unop( Iop_32Sto64, mkexpr(s0) ) ) );
   }

   putYMMReg ( rG, res );

   return delta;
}
   17342 
   17343 
   17344 /* Handles 128 bit versions of PMOVZXBD and PMOVSXBD. */
   17345 static Long dis_PMOVxXBD_128 ( const VexAbiInfo* vbi, Prefix pfx,
   17346                                Long delta, Bool isAvx, Bool xIsZ )
   17347 {
   17348    IRTemp addr   = IRTemp_INVALID;
   17349    Int    alen   = 0;
   17350    HChar  dis_buf[50];
   17351    IRTemp srcVec = newTemp(Ity_V128);
   17352    UChar  modrm  = getUChar(delta);
   17353    const HChar* mbV = isAvx ? "v" : "";
   17354    const HChar  how = xIsZ ? 'z' : 's';
   17355    UInt   rG     = gregOfRexRM(pfx, modrm);
   17356    if ( epartIsReg(modrm) ) {
   17357       UInt rE = eregOfRexRM(pfx, modrm);
   17358       assign( srcVec, getXMMReg(rE) );
   17359       delta += 1;
   17360       DIP( "%spmov%cxbd %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
   17361    } else {
   17362       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   17363       assign( srcVec,
   17364               unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) ) ) );
   17365       delta += alen;
   17366       DIP( "%spmov%cxbd %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
   17367    }
   17368 
   17369    IRTemp zeroVec = newTemp(Ity_V128);
   17370    assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );
   17371 
   17372    IRExpr* res
   17373       = binop(Iop_InterleaveLO8x16,
   17374               mkexpr(zeroVec),
   17375               binop(Iop_InterleaveLO8x16,
   17376                     mkexpr(zeroVec), mkexpr(srcVec)));
   17377    if (!xIsZ)
   17378       res = binop(Iop_SarN32x4,
   17379                   binop(Iop_ShlN32x4, res, mkU8(24)), mkU8(24));
   17380 
   17381    (isAvx ? putYMMRegLoAndZU : putXMMReg) ( rG, res );
   17382 
   17383    return delta;
   17384 }
   17385 
   17386 
/* Handles 256 bit versions of PMOVZXBD and PMOVSXBD: widen the eight
   lowest bytes of the source to eight 32-bit lanes, zero- (xIsZ) or
   sign-extending. */
static Long dis_PMOVxXBD_256 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool xIsZ )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   IRTemp srcVec = newTemp(Ity_V128);
   UChar  modrm  = getUChar(delta);
   UChar  how    = xIsZ ? 'z' : 's';
   UInt   rG     = gregOfRexRM(pfx, modrm);
   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcVec, getXMMReg(rE) );
      delta += 1;
      DIP( "vpmov%cxbd %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) );
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcVec,
              unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
      delta += alen;
      DIP( "vpmov%cxbd %s,%s\n", how, dis_buf, nameYMMReg(rG) );
   }

   IRTemp zeroVec = newTemp(Ity_V128);
   assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );

   /* Two interleaves with zero widen bytes into zero-filled 32-bit
      lanes; for the signed case, shift each lane's byte up and
      arithmetically back down so its sign bit fills the lane. */
   IRExpr* res
      = binop( Iop_V128HLtoV256,
               binop(Iop_InterleaveHI8x16,
                     mkexpr(zeroVec),
                     binop(Iop_InterleaveLO8x16,
                           mkexpr(zeroVec), mkexpr(srcVec)) ),
               binop(Iop_InterleaveLO8x16,
                     mkexpr(zeroVec),
                     binop(Iop_InterleaveLO8x16,
                           mkexpr(zeroVec), mkexpr(srcVec)) ) );
   if (!xIsZ)
      res = binop(Iop_SarN32x8,
                  binop(Iop_ShlN32x8, res, mkU8(24)), mkU8(24));

   putYMMReg ( rG, res );

   return delta;
}
   17432 
   17433 
   17434 /* Handles 128 bit versions of PMOVSXBQ. */
   17435 static Long dis_PMOVSXBQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
   17436                                Long delta, Bool isAvx )
   17437 {
   17438    IRTemp addr     = IRTemp_INVALID;
   17439    Int    alen     = 0;
   17440    HChar  dis_buf[50];
   17441    IRTemp srcBytes = newTemp(Ity_I16);
   17442    UChar  modrm    = getUChar(delta);
   17443    const HChar* mbV = isAvx ? "v" : "";
   17444    UInt   rG       = gregOfRexRM(pfx, modrm);
   17445    if ( epartIsReg(modrm) ) {
   17446       UInt rE = eregOfRexRM(pfx, modrm);
   17447       assign( srcBytes, getXMMRegLane16( rE, 0 ) );
   17448       delta += 1;
   17449       DIP( "%spmovsxbq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
   17450    } else {
   17451       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   17452       assign( srcBytes, loadLE( Ity_I16, mkexpr(addr) ) );
   17453       delta += alen;
   17454       DIP( "%spmovsxbq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
   17455    }
   17456 
   17457    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   17458       ( rG, binop( Iop_64HLtoV128,
   17459                    unop( Iop_8Sto64,
   17460                          unop( Iop_16HIto8, mkexpr(srcBytes) ) ),
   17461                    unop( Iop_8Sto64,
   17462                          unop( Iop_16to8, mkexpr(srcBytes) ) ) ) );
   17463    return delta;
   17464 }
   17465 
   17466 
/* Handles 256 bit versions of PMOVSXBQ: sign-extend the four lowest
   bytes of the source to four 64-bit lanes. */
static Long dis_PMOVSXBQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta )
{
   IRTemp addr     = IRTemp_INVALID;
   Int    alen     = 0;
   HChar  dis_buf[50];
   IRTemp srcBytes = newTemp(Ity_I32);  /* the four source bytes */
   UChar  modrm    = getUChar(delta);
   UInt   rG       = gregOfRexRM(pfx, modrm);
   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcBytes, getXMMRegLane32( rE, 0 ) );
      delta += 1;
      DIP( "vpmovsxbq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) );
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcBytes, loadLE( Ity_I32, mkexpr(addr) ) );
      delta += alen;
      DIP( "vpmovsxbq %s,%s\n", dis_buf, nameYMMReg(rG) );
   }

   /* Pick each of the four bytes out of srcBytes with 32->16->8
      narrowing chains, and sign-extend it to 64 bits. */
   putYMMReg
      ( rG, binop( Iop_V128HLtoV256,
                   binop( Iop_64HLtoV128,
                          unop( Iop_8Sto64,
                                unop( Iop_16HIto8,
                                      unop( Iop_32HIto16,
                                            mkexpr(srcBytes) ) ) ),
                          unop( Iop_8Sto64,
                                unop( Iop_16to8,
                                      unop( Iop_32HIto16,
                                            mkexpr(srcBytes) ) ) ) ),
                   binop( Iop_64HLtoV128,
                          unop( Iop_8Sto64,
                                unop( Iop_16HIto8,
                                      unop( Iop_32to16,
                                            mkexpr(srcBytes) ) ) ),
                          unop( Iop_8Sto64,
                                unop( Iop_16to8,
                                      unop( Iop_32to16,
                                            mkexpr(srcBytes) ) ) ) ) ) );
   return delta;
}
   17511 
   17512 
/* Handles 128 bit versions of PMOVZXBQ: zero-extend the two lowest
   bytes of the source to two 64-bit lanes.  For isAvx the upper YMM
   lane is zeroed. */
static Long dis_PMOVZXBQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx )
{
   IRTemp addr     = IRTemp_INVALID;
   Int    alen     = 0;
   HChar  dis_buf[50];
   IRTemp srcVec   = newTemp(Ity_V128);
   UChar  modrm    = getUChar(delta);
   const HChar* mbV = isAvx ? "v" : "";
   UInt   rG       = gregOfRexRM(pfx, modrm);
   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcVec, getXMMReg(rE) );
      delta += 1;
      DIP( "%spmovzxbq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcVec,
              unop( Iop_32UtoV128,
                    unop( Iop_16Uto32, loadLE( Ity_I16, mkexpr(addr) ))));
      delta += alen;
      DIP( "%spmovzxbq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
   }

   IRTemp zeroVec = newTemp(Ity_V128);
   assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );

   /* Three interleaves with zero take the byte lanes through 16- and
      32- to zero-filled 64-bit lanes. */
   (isAvx ? putYMMRegLoAndZU : putXMMReg)
      ( rG, binop( Iop_InterleaveLO8x16,
                   mkexpr(zeroVec),
                   binop( Iop_InterleaveLO8x16,
                          mkexpr(zeroVec),
                          binop( Iop_InterleaveLO8x16,
                                 mkexpr(zeroVec), mkexpr(srcVec) ) ) ) );
   return delta;
}
   17550 
   17551 
/* Handles 256 bit versions of PMOVZXBQ: zero-extend the four lowest
   bytes of the source to four 64-bit lanes. */
static Long dis_PMOVZXBQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta )
{
   IRTemp addr     = IRTemp_INVALID;
   Int    alen     = 0;
   HChar  dis_buf[50];
   IRTemp srcVec   = newTemp(Ity_V128);
   UChar  modrm    = getUChar(delta);
   UInt   rG       = gregOfRexRM(pfx, modrm);
   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcVec, getXMMReg(rE) );
      delta += 1;
      DIP( "vpmovzxbq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) );
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcVec,
              unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) )));
      delta += alen;
      DIP( "vpmovzxbq %s,%s\n", dis_buf, nameYMMReg(rG) );
   }

   IRTemp zeroVec = newTemp(Ity_V128);
   assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );

   /* Three interleaves with zero take byte lanes to zero-filled
      64-bit lanes; the single HI8x16 in the first operand selects the
      upper pair of widened bytes for the upper 128 bits. */
   putYMMReg
      ( rG, binop( Iop_V128HLtoV256,
                   binop( Iop_InterleaveHI8x16,
                          mkexpr(zeroVec),
                          binop( Iop_InterleaveLO8x16,
                                 mkexpr(zeroVec),
                                 binop( Iop_InterleaveLO8x16,
                                        mkexpr(zeroVec), mkexpr(srcVec) ) ) ),
                   binop( Iop_InterleaveLO8x16,
                          mkexpr(zeroVec),
                          binop( Iop_InterleaveLO8x16,
                                 mkexpr(zeroVec),
                                 binop( Iop_InterleaveLO8x16,
                                        mkexpr(zeroVec), mkexpr(srcVec) ) ) )
                 ) );
   return delta;
}
   17595 
   17596 
/* Handles PHMINPOSUW xmm (SSE4.1) and VPHMINPOSUW (VEX.128).  The
   result is computed by the clean helper
   amd64g_calculate_sse_phminposuw, which takes the source vector as
   two 64-bit halves and returns the low 64 bits of the result; the
   rest of the destination is zeroed. */
static Long dis_PHMINPOSUW_128 ( const VexAbiInfo* vbi, Prefix pfx,
                                 Long delta, Bool isAvx )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   UChar  modrm  = getUChar(delta);
   const HChar* mbV = isAvx ? "v" : "";
   IRTemp sV     = newTemp(Ity_V128);
   IRTemp sHi    = newTemp(Ity_I64);
   IRTemp sLo    = newTemp(Ity_I64);
   IRTemp dLo    = newTemp(Ity_I64);
   UInt   rG     = gregOfRexRM(pfx,modrm);
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( sV, getXMMReg(rE) );
      delta += 1;
      DIP("%sphminposuw %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      /* Only the legacy SSE encoding requires 16-alignment of the
         memory operand; the VEX form does not. */
      if (!isAvx)
         gen_SEGV_if_not_16_aligned(addr);
      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      DIP("%sphminposuw %s,%s\n", mbV, dis_buf, nameXMMReg(rG));
   }
   /* Split the source and hand both halves to the clean helper. */
   assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   assign( dLo, mkIRExprCCall(
                   Ity_I64, 0/*regparms*/,
                   "amd64g_calculate_sse_phminposuw",
                   &amd64g_calculate_sse_phminposuw,
                   mkIRExprVec_2( mkexpr(sLo), mkexpr(sHi) )
         ));
   (isAvx ? putYMMRegLoAndZU : putXMMReg)
      (rG, unop(Iop_64UtoV128, mkexpr(dLo)));
   return delta;
}
   17635 
   17636 
   17637 static Long dis_AESx ( const VexAbiInfo* vbi, Prefix pfx,
   17638                        Long delta, Bool isAvx, UChar opc )
   17639 {
   17640    IRTemp addr   = IRTemp_INVALID;
   17641    Int    alen   = 0;
   17642    HChar  dis_buf[50];
   17643    UChar  modrm  = getUChar(delta);
   17644    UInt   rG     = gregOfRexRM(pfx, modrm);
   17645    UInt   regNoL = 0;
   17646    UInt   regNoR = (isAvx && opc != 0xDB) ? getVexNvvvv(pfx) : rG;
   17647 
   17648    /* This is a nasty kludge.  We need to pass 2 x V128 to the
   17649       helper.  Since we can't do that, use a dirty
   17650       helper to compute the results directly from the XMM regs in
   17651       the guest state.  That means for the memory case, we need to
   17652       move the left operand into a pseudo-register (XMM16, let's
   17653       call it). */
   17654    if (epartIsReg(modrm)) {
   17655       regNoL = eregOfRexRM(pfx, modrm);
   17656       delta += 1;
   17657    } else {
   17658       regNoL = 16; /* use XMM16 as an intermediary */
   17659       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   17660       /* alignment check needed ???? */
   17661       stmt( IRStmt_Put( OFFB_YMM16, loadLE(Ity_V128, mkexpr(addr)) ));
   17662       delta += alen;
   17663    }
   17664 
   17665    void*  fn = &amd64g_dirtyhelper_AES;
   17666    const HChar* nm = "amd64g_dirtyhelper_AES";
   17667 
   17668    /* Round up the arguments.  Note that this is a kludge -- the
   17669       use of mkU64 rather than mkIRExpr_HWord implies the
   17670       assumption that the host's word size is 64-bit. */
   17671    UInt gstOffD = ymmGuestRegOffset(rG);
   17672    UInt gstOffL = regNoL == 16 ? OFFB_YMM16 : ymmGuestRegOffset(regNoL);
   17673    UInt gstOffR = ymmGuestRegOffset(regNoR);
   17674    IRExpr*  opc4         = mkU64(opc);
   17675    IRExpr*  gstOffDe     = mkU64(gstOffD);
   17676    IRExpr*  gstOffLe     = mkU64(gstOffL);
   17677    IRExpr*  gstOffRe     = mkU64(gstOffR);
   17678    IRExpr** args
   17679       = mkIRExprVec_5( IRExpr_BBPTR(), opc4, gstOffDe, gstOffLe, gstOffRe );
   17680 
   17681    IRDirty* d    = unsafeIRDirty_0_N( 0/*regparms*/, nm, fn, args );
   17682    /* It's not really a dirty call, but we can't use the clean helper
   17683       mechanism here for the very lame reason that we can't pass 2 x
   17684       V128s by value to a helper.  Hence this roundabout scheme. */
   17685    d->nFxState = 2;
   17686    vex_bzero(&d->fxState, sizeof(d->fxState));
   17687    /* AES{ENC,ENCLAST,DEC,DECLAST} read both registers, and writes
   17688       the second for !isAvx or the third for isAvx.
   17689       AESIMC (0xDB) reads the first register, and writes the second. */
   17690    d->fxState[0].fx     = Ifx_Read;
   17691    d->fxState[0].offset = gstOffL;
   17692    d->fxState[0].size   = sizeof(U128);
   17693    d->fxState[1].offset = gstOffR;
   17694    d->fxState[1].size   = sizeof(U128);
   17695    if (opc == 0xDB)
   17696       d->fxState[1].fx   = Ifx_Write;
   17697    else if (!isAvx || rG == regNoR)
   17698       d->fxState[1].fx   = Ifx_Modify;
   17699    else {
   17700       d->fxState[1].fx     = Ifx_Read;
   17701       d->nFxState++;
   17702       d->fxState[2].fx     = Ifx_Write;
   17703       d->fxState[2].offset = gstOffD;
   17704       d->fxState[2].size   = sizeof(U128);
   17705    }
   17706 
   17707    stmt( IRStmt_Dirty(d) );
   17708    {
   17709       const HChar* opsuf;
   17710       switch (opc) {
   17711          case 0xDC: opsuf = "enc"; break;
   17712          case 0XDD: opsuf = "enclast"; break;
   17713          case 0xDE: opsuf = "dec"; break;
   17714          case 0xDF: opsuf = "declast"; break;
   17715          case 0xDB: opsuf = "imc"; break;
   17716          default: vassert(0);
   17717       }
   17718       DIP("%saes%s %s,%s%s%s\n", isAvx ? "v" : "", opsuf,
   17719           (regNoL == 16 ? dis_buf : nameXMMReg(regNoL)),
   17720           nameXMMReg(regNoR),
   17721           (isAvx && opc != 0xDB) ? "," : "",
   17722           (isAvx && opc != 0xDB) ? nameXMMReg(rG) : "");
   17723    }
   17724    if (isAvx)
   17725       putYMMRegLane128( rG, 1, mkV128(0) );
   17726    return delta;
   17727 }
   17728 
   17729 static Long dis_AESKEYGENASSIST ( const VexAbiInfo* vbi, Prefix pfx,
   17730                                   Long delta, Bool isAvx )
   17731 {
   17732    IRTemp addr   = IRTemp_INVALID;
   17733    Int    alen   = 0;
   17734    HChar  dis_buf[50];
   17735    UChar  modrm  = getUChar(delta);
   17736    UInt   regNoL = 0;
   17737    UInt   regNoR = gregOfRexRM(pfx, modrm);
   17738    UChar  imm    = 0;
   17739 
   17740    /* This is a nasty kludge.  See AESENC et al. instructions. */
   17741    modrm = getUChar(delta);
   17742    if (epartIsReg(modrm)) {
   17743       regNoL = eregOfRexRM(pfx, modrm);
   17744       imm = getUChar(delta+1);
   17745       delta += 1+1;
   17746    } else {
   17747       regNoL = 16; /* use XMM16 as an intermediary */
   17748       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   17749       /* alignment check ???? . */
   17750       stmt( IRStmt_Put( OFFB_YMM16, loadLE(Ity_V128, mkexpr(addr)) ));
   17751       imm = getUChar(delta+alen);
   17752       delta += alen+1;
   17753    }
   17754 
   17755    /* Who ya gonna call?  Presumably not Ghostbusters. */
   17756    void*  fn = &amd64g_dirtyhelper_AESKEYGENASSIST;
   17757    const HChar* nm = "amd64g_dirtyhelper_AESKEYGENASSIST";
   17758 
   17759    /* Round up the arguments.  Note that this is a kludge -- the
   17760       use of mkU64 rather than mkIRExpr_HWord implies the
   17761       assumption that the host's word size is 64-bit. */
   17762    UInt gstOffL = regNoL == 16 ? OFFB_YMM16 : ymmGuestRegOffset(regNoL);
   17763    UInt gstOffR = ymmGuestRegOffset(regNoR);
   17764 
   17765    IRExpr*  imme          = mkU64(imm & 0xFF);
   17766    IRExpr*  gstOffLe     = mkU64(gstOffL);
   17767    IRExpr*  gstOffRe     = mkU64(gstOffR);
   17768    IRExpr** args
   17769       = mkIRExprVec_4( IRExpr_BBPTR(), imme, gstOffLe, gstOffRe );
   17770 
   17771    IRDirty* d    = unsafeIRDirty_0_N( 0/*regparms*/, nm, fn, args );
   17772    /* It's not really a dirty call, but we can't use the clean helper
   17773       mechanism here for the very lame reason that we can't pass 2 x
   17774       V128s by value to a helper.  Hence this roundabout scheme. */
   17775    d->nFxState = 2;
   17776    vex_bzero(&d->fxState, sizeof(d->fxState));
   17777    d->fxState[0].fx     = Ifx_Read;
   17778    d->fxState[0].offset = gstOffL;
   17779    d->fxState[0].size   = sizeof(U128);
   17780    d->fxState[1].fx     = Ifx_Write;
   17781    d->fxState[1].offset = gstOffR;
   17782    d->fxState[1].size   = sizeof(U128);
   17783    stmt( IRStmt_Dirty(d) );
   17784 
   17785    DIP("%saeskeygenassist $%x,%s,%s\n", isAvx ? "v" : "", (UInt)imm,
   17786        (regNoL == 16 ? dis_buf : nameXMMReg(regNoL)),
   17787        nameXMMReg(regNoR));
   17788    if (isAvx)
   17789       putYMMRegLane128( regNoR, 1, mkV128(0) );
   17790    return delta;
   17791 }
   17792 
   17793 
   17794 __attribute__((noinline))
   17795 static
   17796 Long dis_ESC_0F38__SSE4 ( Bool* decode_OK,
   17797                           const VexAbiInfo* vbi,
   17798                           Prefix pfx, Int sz, Long deltaIN )
   17799 {
   17800    IRTemp addr  = IRTemp_INVALID;
   17801    UChar  modrm = 0;
   17802    Int    alen  = 0;
   17803    HChar  dis_buf[50];
   17804 
   17805    *decode_OK = False;
   17806 
   17807    Long   delta = deltaIN;
   17808    UChar  opc   = getUChar(delta);
   17809    delta++;
   17810    switch (opc) {
   17811 
   17812    case 0x10:
   17813    case 0x14:
   17814    case 0x15:
   17815       /* 66 0F 38 10 /r = PBLENDVB xmm1, xmm2/m128  (byte gran)
   17816          66 0F 38 14 /r = BLENDVPS xmm1, xmm2/m128  (float gran)
   17817          66 0F 38 15 /r = BLENDVPD xmm1, xmm2/m128  (double gran)
   17818          Blend at various granularities, with XMM0 (implicit operand)
   17819          providing the controlling mask.
   17820       */
   17821       if (have66noF2noF3(pfx) && sz == 2) {
   17822          modrm = getUChar(delta);
   17823 
   17824          const HChar* nm    = NULL;
   17825          UInt   gran  = 0;
   17826          IROp   opSAR = Iop_INVALID;
   17827          switch (opc) {
   17828             case 0x10:
   17829                nm = "pblendvb"; gran = 1; opSAR = Iop_SarN8x16;
   17830                break;
   17831             case 0x14:
   17832                nm = "blendvps"; gran = 4; opSAR = Iop_SarN32x4;
   17833                break;
   17834             case 0x15:
   17835                nm = "blendvpd"; gran = 8; opSAR = Iop_SarN64x2;
   17836                break;
   17837          }
   17838          vassert(nm);
   17839 
   17840          IRTemp vecE = newTemp(Ity_V128);
   17841          IRTemp vecG = newTemp(Ity_V128);
   17842          IRTemp vec0 = newTemp(Ity_V128);
   17843 
   17844          if ( epartIsReg(modrm) ) {
   17845             assign(vecE, getXMMReg(eregOfRexRM(pfx, modrm)));
   17846             delta += 1;
   17847             DIP( "%s %s,%s\n", nm,
   17848                  nameXMMReg( eregOfRexRM(pfx, modrm) ),
   17849                  nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   17850          } else {
   17851             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   17852             gen_SEGV_if_not_16_aligned( addr );
   17853             assign(vecE, loadLE( Ity_V128, mkexpr(addr) ));
   17854             delta += alen;
   17855             DIP( "%s %s,%s\n", nm,
   17856                  dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   17857          }
   17858 
   17859          assign(vecG, getXMMReg(gregOfRexRM(pfx, modrm)));
   17860          assign(vec0, getXMMReg(0));
   17861 
   17862          IRTemp res = math_PBLENDVB_128( vecE, vecG, vec0, gran, opSAR );
   17863          putXMMReg(gregOfRexRM(pfx, modrm), mkexpr(res));
   17864 
   17865          goto decode_success;
   17866       }
   17867       break;
   17868 
   17869    case 0x17:
   17870       /* 66 0F 38 17 /r = PTEST xmm1, xmm2/m128
   17871          Logical compare (set ZF and CF from AND/ANDN of the operands) */
   17872       if (have66noF2noF3(pfx)
   17873           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   17874          delta = dis_xTESTy_128( vbi, pfx, delta, False/*!isAvx*/, 0 );
   17875          goto decode_success;
   17876       }
   17877       break;
   17878 
   17879    case 0x20:
   17880       /* 66 0F 38 20 /r = PMOVSXBW xmm1, xmm2/m64
   17881          Packed Move with Sign Extend from Byte to Word (XMM) */
   17882       if (have66noF2noF3(pfx) && sz == 2) {
   17883          delta = dis_PMOVxXBW_128( vbi, pfx, delta,
   17884                                    False/*!isAvx*/, False/*!xIsZ*/ );
   17885          goto decode_success;
   17886       }
   17887       break;
   17888 
   17889    case 0x21:
   17890       /* 66 0F 38 21 /r = PMOVSXBD xmm1, xmm2/m32
   17891          Packed Move with Sign Extend from Byte to DWord (XMM) */
   17892       if (have66noF2noF3(pfx) && sz == 2) {
   17893          delta = dis_PMOVxXBD_128( vbi, pfx, delta,
   17894                                    False/*!isAvx*/, False/*!xIsZ*/ );
   17895          goto decode_success;
   17896       }
   17897       break;
   17898 
   17899    case 0x22:
   17900       /* 66 0F 38 22 /r = PMOVSXBQ xmm1, xmm2/m16
   17901          Packed Move with Sign Extend from Byte to QWord (XMM) */
   17902       if (have66noF2noF3(pfx) && sz == 2) {
   17903          delta = dis_PMOVSXBQ_128( vbi, pfx, delta, False/*!isAvx*/ );
   17904          goto decode_success;
   17905       }
   17906       break;
   17907 
   17908    case 0x23:
   17909       /* 66 0F 38 23 /r = PMOVSXWD xmm1, xmm2/m64
   17910          Packed Move with Sign Extend from Word to DWord (XMM) */
   17911       if (have66noF2noF3(pfx) && sz == 2) {
   17912          delta = dis_PMOVxXWD_128(vbi, pfx, delta,
   17913                                   False/*!isAvx*/, False/*!xIsZ*/);
   17914          goto decode_success;
   17915       }
   17916       break;
   17917 
   17918    case 0x24:
   17919       /* 66 0F 38 24 /r = PMOVSXWQ xmm1, xmm2/m32
   17920          Packed Move with Sign Extend from Word to QWord (XMM) */
   17921       if (have66noF2noF3(pfx) && sz == 2) {
   17922          delta = dis_PMOVSXWQ_128( vbi, pfx, delta, False/*!isAvx*/ );
   17923          goto decode_success;
   17924       }
   17925       break;
   17926 
   17927    case 0x25:
   17928       /* 66 0F 38 25 /r = PMOVSXDQ xmm1, xmm2/m64
   17929          Packed Move with Sign Extend from Double Word to Quad Word (XMM) */
   17930       if (have66noF2noF3(pfx) && sz == 2) {
   17931          delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
   17932                                    False/*!isAvx*/, False/*!xIsZ*/ );
   17933          goto decode_success;
   17934       }
   17935       break;
   17936 
   17937    case 0x28:
   17938       /* 66 0F 38 28 = PMULDQ -- signed widening multiply of 32-lanes
   17939          0 x 0 to form lower 64-bit half and lanes 2 x 2 to form upper
   17940          64-bit half */
   17941       /* This is a really poor translation -- could be improved if
   17942          performance critical.  It's a copy-paste of PMULUDQ, too. */
   17943       if (have66noF2noF3(pfx) && sz == 2) {
   17944          IRTemp sV = newTemp(Ity_V128);
   17945          IRTemp dV = newTemp(Ity_V128);
   17946          modrm = getUChar(delta);
   17947          UInt rG = gregOfRexRM(pfx,modrm);
   17948          assign( dV, getXMMReg(rG) );
   17949          if (epartIsReg(modrm)) {
   17950             UInt rE = eregOfRexRM(pfx,modrm);
   17951             assign( sV, getXMMReg(rE) );
   17952             delta += 1;
   17953             DIP("pmuldq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   17954          } else {
   17955             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   17956             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   17957             delta += alen;
   17958             DIP("pmuldq %s,%s\n", dis_buf, nameXMMReg(rG));
   17959          }
   17960 
   17961          putXMMReg( rG, mkexpr(math_PMULDQ_128( dV, sV )) );
   17962          goto decode_success;
   17963       }
   17964       break;
   17965 
   17966    case 0x29:
   17967       /* 66 0F 38 29 = PCMPEQQ
   17968          64x2 equality comparison */
   17969       if (have66noF2noF3(pfx) && sz == 2) {
   17970          /* FIXME: this needs an alignment check */
   17971          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   17972                                     "pcmpeqq", Iop_CmpEQ64x2, False );
   17973          goto decode_success;
   17974       }
   17975       break;
   17976 
   17977    case 0x2A:
   17978       /* 66 0F 38 2A = MOVNTDQA
   17979          "non-temporal" "streaming" load
   17980          Handle like MOVDQA but only memory operand is allowed */
   17981       if (have66noF2noF3(pfx) && sz == 2) {
   17982          modrm = getUChar(delta);
   17983          if (!epartIsReg(modrm)) {
   17984             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   17985             gen_SEGV_if_not_16_aligned( addr );
   17986             putXMMReg( gregOfRexRM(pfx,modrm),
   17987                        loadLE(Ity_V128, mkexpr(addr)) );
   17988             DIP("movntdqa %s,%s\n", dis_buf,
   17989                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   17990             delta += alen;
   17991             goto decode_success;
   17992          }
   17993       }
   17994       break;
   17995 
   17996    case 0x2B:
   17997       /* 66 0f 38 2B /r = PACKUSDW xmm1, xmm2/m128
   17998          2x 32x4 S->U saturating narrow from xmm2/m128 to xmm1 */
   17999       if (have66noF2noF3(pfx) && sz == 2) {
   18000 
   18001          modrm = getUChar(delta);
   18002 
   18003          IRTemp argL = newTemp(Ity_V128);
   18004          IRTemp argR = newTemp(Ity_V128);
   18005 
   18006          if ( epartIsReg(modrm) ) {
   18007             assign( argL, getXMMReg( eregOfRexRM(pfx, modrm) ) );
   18008             delta += 1;
   18009             DIP( "packusdw %s,%s\n",
   18010                  nameXMMReg( eregOfRexRM(pfx, modrm) ),
   18011                  nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   18012          } else {
   18013             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   18014             gen_SEGV_if_not_16_aligned( addr );
   18015             assign( argL, loadLE( Ity_V128, mkexpr(addr) ));
   18016             delta += alen;
   18017             DIP( "packusdw %s,%s\n",
   18018                  dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   18019          }
   18020 
   18021          assign(argR, getXMMReg( gregOfRexRM(pfx, modrm) ));
   18022 
   18023          putXMMReg( gregOfRexRM(pfx, modrm),
   18024                     binop( Iop_QNarrowBin32Sto16Ux8,
   18025                            mkexpr(argL), mkexpr(argR)) );
   18026 
   18027          goto decode_success;
   18028       }
   18029       break;
   18030 
   18031    case 0x30:
   18032       /* 66 0F 38 30 /r = PMOVZXBW xmm1, xmm2/m64
   18033          Packed Move with Zero Extend from Byte to Word (XMM) */
   18034       if (have66noF2noF3(pfx) && sz == 2) {
   18035          delta = dis_PMOVxXBW_128( vbi, pfx, delta,
   18036                                    False/*!isAvx*/, True/*xIsZ*/ );
   18037          goto decode_success;
   18038       }
   18039       break;
   18040 
   18041    case 0x31:
   18042       /* 66 0F 38 31 /r = PMOVZXBD xmm1, xmm2/m32
   18043          Packed Move with Zero Extend from Byte to DWord (XMM) */
   18044       if (have66noF2noF3(pfx) && sz == 2) {
   18045          delta = dis_PMOVxXBD_128( vbi, pfx, delta,
   18046                                    False/*!isAvx*/, True/*xIsZ*/ );
   18047          goto decode_success;
   18048       }
   18049       break;
   18050 
   18051    case 0x32:
   18052       /* 66 0F 38 32 /r = PMOVZXBQ xmm1, xmm2/m16
   18053          Packed Move with Zero Extend from Byte to QWord (XMM) */
   18054       if (have66noF2noF3(pfx) && sz == 2) {
   18055          delta = dis_PMOVZXBQ_128( vbi, pfx, delta, False/*!isAvx*/ );
   18056          goto decode_success;
   18057       }
   18058       break;
   18059 
   18060    case 0x33:
   18061       /* 66 0F 38 33 /r = PMOVZXWD xmm1, xmm2/m64
   18062          Packed Move with Zero Extend from Word to DWord (XMM) */
   18063       if (have66noF2noF3(pfx) && sz == 2) {
   18064          delta = dis_PMOVxXWD_128( vbi, pfx, delta,
   18065                                    False/*!isAvx*/, True/*xIsZ*/ );
   18066          goto decode_success;
   18067       }
   18068       break;
   18069 
   18070    case 0x34:
   18071       /* 66 0F 38 34 /r = PMOVZXWQ xmm1, xmm2/m32
   18072          Packed Move with Zero Extend from Word to QWord (XMM) */
   18073       if (have66noF2noF3(pfx) && sz == 2) {
   18074          delta = dis_PMOVZXWQ_128( vbi, pfx, delta, False/*!isAvx*/ );
   18075          goto decode_success;
   18076       }
   18077       break;
   18078 
   18079    case 0x35:
   18080       /* 66 0F 38 35 /r = PMOVZXDQ xmm1, xmm2/m64
   18081          Packed Move with Zero Extend from DWord to QWord (XMM) */
   18082       if (have66noF2noF3(pfx) && sz == 2) {
   18083          delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
   18084                                    False/*!isAvx*/, True/*xIsZ*/ );
   18085          goto decode_success;
   18086       }
   18087       break;
   18088 
   18089    case 0x37:
   18090       /* 66 0F 38 37 = PCMPGTQ
   18091          64x2 comparison (signed, presumably; the Intel docs don't say :-)
   18092       */
   18093       if (have66noF2noF3(pfx) && sz == 2) {
   18094          /* FIXME: this needs an alignment check */
   18095          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   18096                                     "pcmpgtq", Iop_CmpGT64Sx2, False );
   18097          goto decode_success;
   18098       }
   18099       break;
   18100 
   18101    case 0x38:
   18102    case 0x3C:
   18103       /* 66 0F 38 38 /r = PMINSB xmm1, xmm2/m128    8Sx16 (signed) min
   18104          66 0F 38 3C /r = PMAXSB xmm1, xmm2/m128    8Sx16 (signed) max
   18105       */
   18106       if (have66noF2noF3(pfx) && sz == 2) {
   18107          /* FIXME: this needs an alignment check */
   18108          Bool isMAX = opc == 0x3C;
   18109          delta = dis_SSEint_E_to_G(
   18110                     vbi, pfx, delta,
   18111                     isMAX ? "pmaxsb" : "pminsb",
   18112                     isMAX ? Iop_Max8Sx16 : Iop_Min8Sx16,
   18113                     False
   18114                  );
   18115          goto decode_success;
   18116       }
   18117       break;
   18118 
   18119    case 0x39:
   18120    case 0x3D:
   18121       /* 66 0F 38 39 /r = PMINSD xmm1, xmm2/m128
   18122          Minimum of Packed Signed Double Word Integers (XMM)
   18123          66 0F 38 3D /r = PMAXSD xmm1, xmm2/m128
   18124          Maximum of Packed Signed Double Word Integers (XMM)
   18125       */
   18126       if (have66noF2noF3(pfx) && sz == 2) {
   18127          /* FIXME: this needs an alignment check */
   18128          Bool isMAX = opc == 0x3D;
   18129          delta = dis_SSEint_E_to_G(
   18130                     vbi, pfx, delta,
   18131                     isMAX ? "pmaxsd" : "pminsd",
   18132                     isMAX ? Iop_Max32Sx4 : Iop_Min32Sx4,
   18133                     False
   18134                  );
   18135          goto decode_success;
   18136       }
   18137       break;
   18138 
   18139    case 0x3A:
   18140    case 0x3E:
   18141       /* 66 0F 38 3A /r = PMINUW xmm1, xmm2/m128
   18142          Minimum of Packed Unsigned Word Integers (XMM)
   18143          66 0F 38 3E /r = PMAXUW xmm1, xmm2/m128
   18144          Maximum of Packed Unsigned Word Integers (XMM)
   18145       */
   18146       if (have66noF2noF3(pfx) && sz == 2) {
   18147          /* FIXME: this needs an alignment check */
   18148          Bool isMAX = opc == 0x3E;
   18149          delta = dis_SSEint_E_to_G(
   18150                     vbi, pfx, delta,
   18151                     isMAX ? "pmaxuw" : "pminuw",
   18152                     isMAX ? Iop_Max16Ux8 : Iop_Min16Ux8,
   18153                     False
   18154                  );
   18155          goto decode_success;
   18156       }
   18157       break;
   18158 
   18159    case 0x3B:
   18160    case 0x3F:
   18161       /* 66 0F 38 3B /r = PMINUD xmm1, xmm2/m128
   18162          Minimum of Packed Unsigned Doubleword Integers (XMM)
   18163          66 0F 38 3F /r = PMAXUD xmm1, xmm2/m128
   18164          Maximum of Packed Unsigned Doubleword Integers (XMM)
   18165       */
   18166       if (have66noF2noF3(pfx) && sz == 2) {
   18167          /* FIXME: this needs an alignment check */
   18168          Bool isMAX = opc == 0x3F;
   18169          delta = dis_SSEint_E_to_G(
   18170                     vbi, pfx, delta,
   18171                     isMAX ? "pmaxud" : "pminud",
   18172                     isMAX ? Iop_Max32Ux4 : Iop_Min32Ux4,
   18173                     False
   18174                  );
   18175          goto decode_success;
   18176       }
   18177       break;
   18178 
   18179    case 0x40:
   18180       /* 66 0F 38 40 /r = PMULLD xmm1, xmm2/m128
   18181          32x4 integer multiply from xmm2/m128 to xmm1 */
   18182       if (have66noF2noF3(pfx) && sz == 2) {
   18183 
   18184          modrm = getUChar(delta);
   18185 
   18186          IRTemp argL = newTemp(Ity_V128);
   18187          IRTemp argR = newTemp(Ity_V128);
   18188 
   18189          if ( epartIsReg(modrm) ) {
   18190             assign( argL, getXMMReg( eregOfRexRM(pfx, modrm) ) );
   18191             delta += 1;
   18192             DIP( "pmulld %s,%s\n",
   18193                  nameXMMReg( eregOfRexRM(pfx, modrm) ),
   18194                  nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   18195          } else {
   18196             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   18197             gen_SEGV_if_not_16_aligned( addr );
   18198             assign( argL, loadLE( Ity_V128, mkexpr(addr) ));
   18199             delta += alen;
   18200             DIP( "pmulld %s,%s\n",
   18201                  dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   18202          }
   18203 
   18204          assign(argR, getXMMReg( gregOfRexRM(pfx, modrm) ));
   18205 
   18206          putXMMReg( gregOfRexRM(pfx, modrm),
   18207                     binop( Iop_Mul32x4, mkexpr(argL), mkexpr(argR)) );
   18208 
   18209          goto decode_success;
   18210       }
   18211       break;
   18212 
   18213    case 0x41:
   18214       /* 66 0F 38 41 /r = PHMINPOSUW xmm1, xmm2/m128
   18215          Packed Horizontal Word Minimum from xmm2/m128 to xmm1 */
   18216       if (have66noF2noF3(pfx) && sz == 2) {
   18217          delta = dis_PHMINPOSUW_128( vbi, pfx, delta, False/*!isAvx*/ );
   18218          goto decode_success;
   18219       }
   18220       break;
   18221 
   18222    case 0xDC:
   18223    case 0xDD:
   18224    case 0xDE:
   18225    case 0xDF:
   18226    case 0xDB:
   18227       /* 66 0F 38 DC /r = AESENC xmm1, xmm2/m128
   18228                   DD /r = AESENCLAST xmm1, xmm2/m128
   18229                   DE /r = AESDEC xmm1, xmm2/m128
   18230                   DF /r = AESDECLAST xmm1, xmm2/m128
   18231 
   18232                   DB /r = AESIMC xmm1, xmm2/m128 */
   18233       if (have66noF2noF3(pfx) && sz == 2) {
   18234          delta = dis_AESx( vbi, pfx, delta, False/*!isAvx*/, opc );
   18235          goto decode_success;
   18236       }
   18237       break;
   18238 
   18239    case 0xF0:
   18240    case 0xF1:
   18241       /* F2 0F 38 F0 /r = CRC32 r/m8, r32 (REX.W ok, 66 not ok)
   18242          F2 0F 38 F1 /r = CRC32 r/m{16,32,64}, r32
   18243          The decoding on this is a bit unusual.
   18244       */
   18245       if (haveF2noF3(pfx)
   18246           && (opc == 0xF1 || (opc == 0xF0 && !have66(pfx)))) {
   18247          modrm = getUChar(delta);
   18248 
   18249          if (opc == 0xF0)
   18250             sz = 1;
   18251          else
   18252             vassert(sz == 2 || sz == 4 || sz == 8);
   18253 
   18254          IRType tyE = szToITy(sz);
   18255          IRTemp valE = newTemp(tyE);
   18256 
   18257          if (epartIsReg(modrm)) {
   18258             assign(valE, getIRegE(sz, pfx, modrm));
   18259             delta += 1;
   18260             DIP("crc32b %s,%s\n", nameIRegE(sz, pfx, modrm),
   18261                 nameIRegG(1==getRexW(pfx) ? 8 : 4, pfx, modrm));
   18262          } else {
   18263             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   18264             assign(valE, loadLE(tyE, mkexpr(addr)));
   18265             delta += alen;
   18266             DIP("crc32b %s,%s\n", dis_buf,
   18267                 nameIRegG(1==getRexW(pfx) ? 8 : 4, pfx, modrm));
   18268          }
   18269 
   18270          /* Somewhat funny getting/putting of the crc32 value, in order
   18271             to ensure that it turns into 64-bit gets and puts.  However,
   18272             mask off the upper 32 bits so as to not get memcheck false
   18273             +ves around the helper call. */
   18274          IRTemp valG0 = newTemp(Ity_I64);
   18275          assign(valG0, binop(Iop_And64, getIRegG(8, pfx, modrm),
   18276                              mkU64(0xFFFFFFFF)));
   18277 
   18278          const HChar* nm = NULL;
   18279          void*  fn = NULL;
   18280          switch (sz) {
   18281             case 1: nm = "amd64g_calc_crc32b";
   18282                     fn = &amd64g_calc_crc32b; break;
   18283             case 2: nm = "amd64g_calc_crc32w";
   18284                     fn = &amd64g_calc_crc32w; break;
   18285             case 4: nm = "amd64g_calc_crc32l";
   18286                     fn = &amd64g_calc_crc32l; break;
   18287             case 8: nm = "amd64g_calc_crc32q";
   18288                     fn = &amd64g_calc_crc32q; break;
   18289          }
   18290          vassert(nm && fn);
   18291          IRTemp valG1 = newTemp(Ity_I64);
   18292          assign(valG1,
   18293                 mkIRExprCCall(Ity_I64, 0/*regparm*/, nm, fn,
   18294                               mkIRExprVec_2(mkexpr(valG0),
   18295                                             widenUto64(mkexpr(valE)))));
   18296 
   18297          putIRegG(4, pfx, modrm, unop(Iop_64to32, mkexpr(valG1)));
   18298          goto decode_success;
   18299       }
   18300       break;
   18301 
   18302    default:
   18303       break;
   18304 
   18305    }
   18306 
   18307   //decode_failure:
   18308    *decode_OK = False;
   18309    return deltaIN;
   18310 
   18311   decode_success:
   18312    *decode_OK = True;
   18313    return delta;
   18314 }
   18315 
   18316 
   18317 /*------------------------------------------------------------*/
   18318 /*---                                                      ---*/
   18319 /*--- Top-level SSE4: dis_ESC_0F3A__SSE4                   ---*/
   18320 /*---                                                      ---*/
   18321 /*------------------------------------------------------------*/
   18322 
   18323 static Long dis_PEXTRW ( const VexAbiInfo* vbi, Prefix pfx,
   18324                          Long delta, Bool isAvx )
   18325 {
   18326    IRTemp addr  = IRTemp_INVALID;
   18327    IRTemp t0    = IRTemp_INVALID;
   18328    IRTemp t1    = IRTemp_INVALID;
   18329    IRTemp t2    = IRTemp_INVALID;
   18330    IRTemp t3    = IRTemp_INVALID;
   18331    UChar  modrm = getUChar(delta);
   18332    Int    alen  = 0;
   18333    HChar  dis_buf[50];
   18334    UInt   rG    = gregOfRexRM(pfx,modrm);
   18335    Int    imm8_20;
   18336    IRTemp xmm_vec = newTemp(Ity_V128);
   18337    IRTemp d16   = newTemp(Ity_I16);
   18338    const HChar* mbV = isAvx ? "v" : "";
   18339 
   18340    vassert(0==getRexW(pfx)); /* ensured by caller */
   18341    assign( xmm_vec, getXMMReg(rG) );
   18342    breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
   18343 
   18344    if ( epartIsReg( modrm ) ) {
   18345       imm8_20 = (Int)(getUChar(delta+1) & 7);
   18346    } else {
   18347       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   18348       imm8_20 = (Int)(getUChar(delta+alen) & 7);
   18349    }
   18350 
   18351    switch (imm8_20) {
   18352       case 0:  assign(d16, unop(Iop_32to16,   mkexpr(t0))); break;
   18353       case 1:  assign(d16, unop(Iop_32HIto16, mkexpr(t0))); break;
   18354       case 2:  assign(d16, unop(Iop_32to16,   mkexpr(t1))); break;
   18355       case 3:  assign(d16, unop(Iop_32HIto16, mkexpr(t1))); break;
   18356       case 4:  assign(d16, unop(Iop_32to16,   mkexpr(t2))); break;
   18357       case 5:  assign(d16, unop(Iop_32HIto16, mkexpr(t2))); break;
   18358       case 6:  assign(d16, unop(Iop_32to16,   mkexpr(t3))); break;
   18359       case 7:  assign(d16, unop(Iop_32HIto16, mkexpr(t3))); break;
   18360       default: vassert(0);
   18361    }
   18362 
   18363    if ( epartIsReg( modrm ) ) {
   18364       UInt rE = eregOfRexRM(pfx,modrm);
   18365       putIReg32( rE, unop(Iop_16Uto32, mkexpr(d16)) );
   18366       delta += 1+1;
   18367       DIP( "%spextrw $%d, %s,%s\n", mbV, imm8_20,
   18368            nameXMMReg( rG ), nameIReg32( rE ) );
   18369    } else {
   18370       storeLE( mkexpr(addr), mkexpr(d16) );
   18371       delta += alen+1;
   18372       DIP( "%spextrw $%d, %s,%s\n", mbV, imm8_20, nameXMMReg( rG ), dis_buf );
   18373    }
   18374    return delta;
   18375 }
   18376 
   18377 
   18378 static Long dis_PEXTRD ( const VexAbiInfo* vbi, Prefix pfx,
   18379                          Long delta, Bool isAvx )
   18380 {
   18381    IRTemp addr  = IRTemp_INVALID;
   18382    IRTemp t0    = IRTemp_INVALID;
   18383    IRTemp t1    = IRTemp_INVALID;
   18384    IRTemp t2    = IRTemp_INVALID;
   18385    IRTemp t3    = IRTemp_INVALID;
   18386    UChar  modrm = 0;
   18387    Int    alen  = 0;
   18388    HChar  dis_buf[50];
   18389 
   18390    Int    imm8_10;
   18391    IRTemp xmm_vec   = newTemp(Ity_V128);
   18392    IRTemp src_dword = newTemp(Ity_I32);
   18393    const HChar* mbV = isAvx ? "v" : "";
   18394 
   18395    vassert(0==getRexW(pfx)); /* ensured by caller */
   18396    modrm = getUChar(delta);
   18397    assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
   18398    breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
   18399 
   18400    if ( epartIsReg( modrm ) ) {
   18401       imm8_10 = (Int)(getUChar(delta+1) & 3);
   18402    } else {
   18403       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   18404       imm8_10 = (Int)(getUChar(delta+alen) & 3);
   18405    }
   18406 
   18407    switch ( imm8_10 ) {
   18408       case 0:  assign( src_dword, mkexpr(t0) ); break;
   18409       case 1:  assign( src_dword, mkexpr(t1) ); break;
   18410       case 2:  assign( src_dword, mkexpr(t2) ); break;
   18411       case 3:  assign( src_dword, mkexpr(t3) ); break;
   18412       default: vassert(0);
   18413    }
   18414 
   18415    if ( epartIsReg( modrm ) ) {
   18416       putIReg32( eregOfRexRM(pfx,modrm), mkexpr(src_dword) );
   18417       delta += 1+1;
   18418       DIP( "%spextrd $%d, %s,%s\n", mbV, imm8_10,
   18419            nameXMMReg( gregOfRexRM(pfx, modrm) ),
   18420            nameIReg32( eregOfRexRM(pfx, modrm) ) );
   18421    } else {
   18422       storeLE( mkexpr(addr), mkexpr(src_dword) );
   18423       delta += alen+1;
   18424       DIP( "%spextrd $%d, %s,%s\n", mbV,
   18425            imm8_10, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
   18426    }
   18427    return delta;
   18428 }
   18429 
   18430 
   18431 static Long dis_PEXTRQ ( const VexAbiInfo* vbi, Prefix pfx,
   18432                          Long delta, Bool isAvx )
   18433 {
   18434    IRTemp addr  = IRTemp_INVALID;
   18435    UChar  modrm = 0;
   18436    Int    alen  = 0;
   18437    HChar  dis_buf[50];
   18438 
   18439    Int imm8_0;
   18440    IRTemp xmm_vec   = newTemp(Ity_V128);
   18441    IRTemp src_qword = newTemp(Ity_I64);
   18442    const HChar* mbV = isAvx ? "v" : "";
   18443 
   18444    vassert(1==getRexW(pfx)); /* ensured by caller */
   18445    modrm = getUChar(delta);
   18446    assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
   18447 
   18448    if ( epartIsReg( modrm ) ) {
   18449       imm8_0 = (Int)(getUChar(delta+1) & 1);
   18450    } else {
   18451       addr   = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   18452       imm8_0 = (Int)(getUChar(delta+alen) & 1);
   18453    }
   18454 
   18455    switch ( imm8_0 ) {
   18456       case 0:  assign( src_qword, unop(Iop_V128to64,   mkexpr(xmm_vec)) );
   18457                break;
   18458       case 1:  assign( src_qword, unop(Iop_V128HIto64, mkexpr(xmm_vec)) );
   18459                break;
   18460       default: vassert(0);
   18461    }
   18462 
   18463    if ( epartIsReg( modrm ) ) {
   18464       putIReg64( eregOfRexRM(pfx,modrm), mkexpr(src_qword) );
   18465       delta += 1+1;
   18466       DIP( "%spextrq $%d, %s,%s\n", mbV, imm8_0,
   18467            nameXMMReg( gregOfRexRM(pfx, modrm) ),
   18468            nameIReg64( eregOfRexRM(pfx, modrm) ) );
   18469    } else {
   18470       storeLE( mkexpr(addr), mkexpr(src_qword) );
   18471       delta += alen+1;
   18472       DIP( "%spextrq $%d, %s,%s\n", mbV,
   18473            imm8_0, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
   18474    }
   18475    return delta;
   18476 }
   18477 
   18478 static IRExpr* math_CTZ32(IRExpr *exp)
   18479 {
   18480    /* Iop_Ctz32 isn't implemented by the amd64 back end, so use Iop_Ctz64. */
   18481    return unop(Iop_64to32, unop(Iop_Ctz64, unop(Iop_32Uto64, exp)));
   18482 }
   18483 
   18484 static Long dis_PCMPISTRI_3A ( UChar modrm, UInt regNoL, UInt regNoR,
   18485                                Long delta, UChar opc, UChar imm,
   18486                                HChar dis_buf[])
   18487 {
   18488    /* We only handle PCMPISTRI for now */
   18489    vassert((opc & 0x03) == 0x03);
   18490    /* And only an immediate byte of 0x38 or 0x3A */
   18491    vassert((imm & ~0x02) == 0x38);
   18492 
   18493    /* FIXME: Is this correct when RegNoL == 16 ? */
   18494    IRTemp argL = newTemp(Ity_V128);
   18495    assign(argL, getXMMReg(regNoL));
   18496    IRTemp argR = newTemp(Ity_V128);
   18497    assign(argR, getXMMReg(regNoR));
   18498 
   18499    IRTemp zmaskL = newTemp(Ity_I32);
   18500    assign(zmaskL, unop(Iop_16Uto32,
   18501                        unop(Iop_GetMSBs8x16,
   18502                             binop(Iop_CmpEQ8x16, mkexpr(argL), mkV128(0)))));
   18503    IRTemp zmaskR = newTemp(Ity_I32);
   18504    assign(zmaskR, unop(Iop_16Uto32,
   18505                        unop(Iop_GetMSBs8x16,
   18506                             binop(Iop_CmpEQ8x16, mkexpr(argR), mkV128(0)))));
   18507 
   18508    /* We want validL = ~(zmaskL | -zmaskL)
   18509 
   18510       But this formulation kills memcheck's validity tracking when any
   18511       bits above the first "1" are invalid.  So reformulate as:
   18512 
   18513       validL = (zmaskL ? (1 << ctz(zmaskL)) : 0) - 1
   18514    */
   18515 
   18516    IRExpr *ctzL = unop(Iop_32to8, math_CTZ32(mkexpr(zmaskL)));
   18517 
   18518    /* Generate a bool expression which is zero iff the original is
   18519       zero.  Do this carefully so memcheck can propagate validity bits
   18520       correctly.
   18521     */
   18522    IRTemp zmaskL_zero = newTemp(Ity_I1);
   18523    assign(zmaskL_zero, binop(Iop_ExpCmpNE32, mkexpr(zmaskL), mkU32(0)));
   18524 
   18525    IRTemp validL = newTemp(Ity_I32);
   18526    assign(validL, binop(Iop_Sub32,
   18527                         IRExpr_ITE(mkexpr(zmaskL_zero),
   18528                                    binop(Iop_Shl32, mkU32(1), ctzL),
   18529                                    mkU32(0)),
   18530                         mkU32(1)));
   18531 
   18532    /* And similarly for validR. */
   18533    IRExpr *ctzR = unop(Iop_32to8, math_CTZ32(mkexpr(zmaskR)));
   18534    IRTemp zmaskR_zero = newTemp(Ity_I1);
   18535    assign(zmaskR_zero, binop(Iop_ExpCmpNE32, mkexpr(zmaskR), mkU32(0)));
   18536    IRTemp validR = newTemp(Ity_I32);
   18537    assign(validR, binop(Iop_Sub32,
   18538                         IRExpr_ITE(mkexpr(zmaskR_zero),
   18539                                    binop(Iop_Shl32, mkU32(1), ctzR),
   18540                                    mkU32(0)),
   18541                         mkU32(1)));
   18542 
   18543    /* Do the actual comparison. */
   18544    IRExpr *boolResII = unop(Iop_16Uto32,
   18545                             unop(Iop_GetMSBs8x16,
   18546                                  binop(Iop_CmpEQ8x16, mkexpr(argL),
   18547                                                       mkexpr(argR))));
   18548 
   18549    /* Compute boolresII & validL & validR (i.e., if both valid, use
   18550       comparison result) */
   18551    IRExpr *intRes1_a = binop(Iop_And32, boolResII,
   18552                              binop(Iop_And32,
   18553                                    mkexpr(validL), mkexpr(validR)));
   18554 
   18555    /* Compute ~(validL | validR); i.e., if both invalid, force 1. */
   18556    IRExpr *intRes1_b = unop(Iop_Not32, binop(Iop_Or32,
   18557                                              mkexpr(validL), mkexpr(validR)));
   18558    /* Otherwise, zero. */
   18559    IRExpr *intRes1 = binop(Iop_And32, mkU32(0xFFFF),
   18560                            binop(Iop_Or32, intRes1_a, intRes1_b));
   18561 
   18562    /* The "0x30" in imm=0x3A means "polarity=3" means XOR validL with
   18563       result. */
   18564    IRTemp intRes2 = newTemp(Ity_I32);
   18565    assign(intRes2, binop(Iop_And32, mkU32(0xFFFF),
   18566                          binop(Iop_Xor32, intRes1, mkexpr(validL))));
   18567 
   18568    /* If the 0x40 bit were set in imm=0x3A, we would return the index
   18569       of the msb.  Since it is clear, we return the index of the
   18570       lsb. */
   18571    IRExpr *newECX = math_CTZ32(binop(Iop_Or32,
   18572                                      mkexpr(intRes2), mkU32(0x10000)));
   18573 
   18574    /* And thats our rcx. */
   18575    putIReg32(R_RCX, newECX);
   18576 
   18577    /* Now for the condition codes... */
   18578 
   18579    /* C == 0 iff intRes2 == 0 */
   18580    IRExpr *c_bit = IRExpr_ITE( binop(Iop_ExpCmpNE32, mkexpr(intRes2),
   18581                                      mkU32(0)),
   18582                                mkU32(1 << AMD64G_CC_SHIFT_C),
   18583                                mkU32(0));
   18584    /* Z == 1 iff any in argL is 0 */
   18585    IRExpr *z_bit = IRExpr_ITE( mkexpr(zmaskL_zero),
   18586                                mkU32(1 << AMD64G_CC_SHIFT_Z),
   18587                                mkU32(0));
   18588    /* S == 1 iff any in argR is 0 */
   18589    IRExpr *s_bit = IRExpr_ITE( mkexpr(zmaskR_zero),
   18590                                mkU32(1 << AMD64G_CC_SHIFT_S),
   18591                                mkU32(0));
   18592    /* O == IntRes2[0] */
   18593    IRExpr *o_bit = binop(Iop_Shl32, binop(Iop_And32, mkexpr(intRes2),
   18594                                           mkU32(0x01)),
   18595                          mkU8(AMD64G_CC_SHIFT_O));
   18596 
   18597    /* Put them all together */
   18598    IRTemp cc = newTemp(Ity_I64);
   18599    assign(cc, widenUto64(binop(Iop_Or32,
   18600                                binop(Iop_Or32, c_bit, z_bit),
   18601                                binop(Iop_Or32, s_bit, o_bit))));
   18602    stmt(IRStmt_Put(OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY)));
   18603    stmt(IRStmt_Put(OFFB_CC_DEP1, mkexpr(cc)));
   18604    stmt(IRStmt_Put(OFFB_CC_DEP2, mkU64(0)));
   18605    stmt(IRStmt_Put(OFFB_CC_NDEP, mkU64(0)));
   18606 
   18607    return delta;
   18608 }
   18609 
/* This can fail, in which case it returns the original (unchanged)
   delta.  Common decoder for the PCMPxSTRx family: bit 1 of 'opc'
   distinguishes the I (pcmpIstrx) from the E (pcmpEstrx) forms, and
   bit 0 clear selects the M (pcmpxstrM) rather than the I (pcmpxstrI)
   forms, as reflected in the DIP printing below.  The actual
   computation is done by a dirty helper reading the XMM registers
   directly from the guest state. */
static Long dis_PCMPxSTRx ( const VexAbiInfo* vbi, Prefix pfx,
                            Long delta, Bool isAvx, UChar opc )
{
   Long   delta0  = delta;          /* saved so we can fail cleanly */
   UInt   isISTRx = opc & 2;        /* nonzero => 'i' (pcmpistr*) form */
   UInt   isxSTRM = (opc & 1) ^ 1;  /* nonzero => 'm' (pcmp?strm) form */
   UInt   regNoL  = 0;              /* left operand XMM reg number */
   UInt   regNoR  = 0;              /* right operand XMM reg number */
   UChar  imm     = 0;              /* control immediate byte */
   IRTemp addr    = IRTemp_INVALID;
   Int    alen    = 0;
   HChar  dis_buf[50];

   /* This is a nasty kludge.  We need to pass 2 x V128 to the helper
      (which is clean).  Since we can't do that, use a dirty helper to
      compute the results directly from the XMM regs in the guest
      state.  That means for the memory case, we need to move the left
      operand into a pseudo-register (XMM16, let's call it). */
   UChar modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      regNoL = eregOfRexRM(pfx, modrm);
      regNoR = gregOfRexRM(pfx, modrm);
      imm = getUChar(delta+1);
      delta += 1+1;
   } else {
      regNoL = 16; /* use XMM16 as an intermediary */
      regNoR = gregOfRexRM(pfx, modrm);
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
      /* No alignment check; I guess that makes sense, given that
         these insns are for dealing with C style strings. */
      stmt( IRStmt_Put( OFFB_YMM16, loadLE(Ity_V128, mkexpr(addr)) ));
      imm = getUChar(delta+alen);
      delta += alen+1;
   }

   /* Print the insn here, since dis_PCMPISTRI_3A doesn't do so
      itself. */
   if (regNoL == 16) {
      DIP("%spcmp%cstr%c $%x,%s,%s\n",
          isAvx ? "v" : "", isISTRx ? 'i' : 'e', isxSTRM ? 'm' : 'i',
          (UInt)imm, dis_buf, nameXMMReg(regNoR));
   } else {
      DIP("%spcmp%cstr%c $%x,%s,%s\n",
          isAvx ? "v" : "", isISTRx ? 'i' : 'e', isxSTRM ? 'm' : 'i',
          (UInt)imm, nameXMMReg(regNoL), nameXMMReg(regNoR));
   }

   /* Handle special case(s): imm 0x3A PCMPISTRI has a dedicated
      (faster, in-line IR) implementation. */
   if (imm == 0x3A && isISTRx && !isxSTRM) {
      return dis_PCMPISTRI_3A ( modrm, regNoL, regNoR, delta,
                                opc, imm, dis_buf);
   }

   /* Now we know the XMM reg numbers for the operands, and the
      immediate byte.  Is it one we can actually handle? Throw out any
      cases for which the helper function has not been verified. */
   switch (imm) {
      case 0x00: case 0x02: case 0x08: case 0x0A: case 0x0C: case 0x0E:
      case 0x12: case 0x14: case 0x18: case 0x1A:
      case 0x30: case 0x34: case 0x38: case 0x3A:
      case 0x40: case 0x42: case 0x44: case 0x46: case 0x4A:
         break;
      // the 16-bit character versions of the above
      case 0x01: case 0x03: case 0x09: case 0x0B: case 0x0D:
      case 0x13:                       case 0x1B:
                            case 0x39: case 0x3B:
                            case 0x45:            case 0x4B:
         break;
      default:
         return delta0; /*FAIL*/
   }

   /* Who ya gonna call?  Presumably not Ghostbusters. */
   void*  fn = &amd64g_dirtyhelper_PCMPxSTRx;
   const HChar* nm = "amd64g_dirtyhelper_PCMPxSTRx";

   /* Round up the arguments.  Note that this is a kludge -- the use
      of mkU64 rather than mkIRExpr_HWord implies the assumption that
      the host's word size is 64-bit. */
   UInt gstOffL = regNoL == 16 ? OFFB_YMM16 : ymmGuestRegOffset(regNoL);
   UInt gstOffR = ymmGuestRegOffset(regNoR);

   /* Pack opcode and immediate into one argument word. */
   IRExpr*  opc4_and_imm = mkU64((opc << 8) | (imm & 0xFF));
   IRExpr*  gstOffLe     = mkU64(gstOffL);
   IRExpr*  gstOffRe     = mkU64(gstOffR);
   /* Only the E (explicit length) forms consume RDX:RAX. */
   IRExpr*  edxIN        = isISTRx ? mkU64(0) : getIRegRDX(8);
   IRExpr*  eaxIN        = isISTRx ? mkU64(0) : getIRegRAX(8);
   IRExpr** args
      = mkIRExprVec_6( IRExpr_BBPTR(),
                       opc4_and_imm, gstOffLe, gstOffRe, edxIN, eaxIN );

   IRTemp   resT = newTemp(Ity_I64);
   IRDirty* d    = unsafeIRDirty_1_N( resT, 0/*regparms*/, nm, fn, args );
   /* It's not really a dirty call, but we can't use the clean helper
      mechanism here for the very lame reason that we can't pass 2 x
      V128s by value to a helper.  Hence this roundabout scheme. */
   d->nFxState = 2;
   vex_bzero(&d->fxState, sizeof(d->fxState));
   d->fxState[0].fx     = Ifx_Read;
   d->fxState[0].offset = gstOffL;
   d->fxState[0].size   = sizeof(U128);
   d->fxState[1].fx     = Ifx_Read;
   d->fxState[1].offset = gstOffR;
   d->fxState[1].size   = sizeof(U128);
   if (isxSTRM) {
      /* Declare that the helper writes XMM0. */
      d->nFxState = 3;
      d->fxState[2].fx     = Ifx_Write;
      d->fxState[2].offset = ymmGuestRegOffset(0);
      d->fxState[2].size   = sizeof(U128);
   }

   stmt( IRStmt_Dirty(d) );

   /* Now resT[15:0] holds the new OSZACP values, so the condition
      codes must be updated. And for a xSTRI case, resT[31:16] holds
      the new ECX value, so stash that too. */
   if (!isxSTRM) {
      putIReg64(R_RCX, binop(Iop_And64,
                             binop(Iop_Shr64, mkexpr(resT), mkU8(16)),
                             mkU64(0xFFFF)));
   }

   /* Zap the upper half of the dest reg as per AVX conventions. */
   if (isxSTRM && isAvx)
      putYMMRegLane128(/*YMM*/0, 1, mkV128(0));

   /* Copy the helper-computed OSZACP bits into the flags thunk. */
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop(Iop_And64, mkexpr(resT), mkU64(0xFFFF))
   ));
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

   return delta;
}
   18749 
   18750 
   18751 static IRTemp math_PINSRB_128 ( IRTemp v128, IRTemp u8, UInt imm8 )
   18752 {
   18753    vassert(imm8 >= 0 && imm8 <= 15);
   18754 
   18755    // Create a V128 value which has the selected byte in the
   18756    // specified lane, and zeroes everywhere else.
   18757    IRTemp tmp128    = newTemp(Ity_V128);
   18758    IRTemp halfshift = newTemp(Ity_I64);
   18759    assign(halfshift, binop(Iop_Shl64,
   18760                            unop(Iop_8Uto64, mkexpr(u8)),
   18761                            mkU8(8 * (imm8 & 7))));
   18762    if (imm8 < 8) {
   18763       assign(tmp128, binop(Iop_64HLtoV128, mkU64(0), mkexpr(halfshift)));
   18764    } else {
   18765       assign(tmp128, binop(Iop_64HLtoV128, mkexpr(halfshift), mkU64(0)));
   18766    }
   18767 
   18768    UShort mask = ~(1 << imm8);
   18769    IRTemp res  = newTemp(Ity_V128);
   18770    assign( res, binop(Iop_OrV128,
   18771                       mkexpr(tmp128),
   18772                       binop(Iop_AndV128, mkexpr(v128), mkV128(mask))) );
   18773    return res;
   18774 }
   18775 
   18776 
   18777 static IRTemp math_PINSRD_128 ( IRTemp v128, IRTemp u32, UInt imm8 )
   18778 {
   18779    IRTemp z32 = newTemp(Ity_I32);
   18780    assign(z32, mkU32(0));
   18781 
   18782    /* Surround u32 with zeroes as per imm, giving us something we can
   18783       OR into a suitably masked-out v128.*/
   18784    IRTemp withZs = newTemp(Ity_V128);
   18785    UShort mask = 0;
   18786    switch (imm8) {
   18787       case 3:  mask = 0x0FFF;
   18788                assign(withZs, mkV128from32s(u32, z32, z32, z32));
   18789                break;
   18790       case 2:  mask = 0xF0FF;
   18791                assign(withZs, mkV128from32s(z32, u32, z32, z32));
   18792                break;
   18793       case 1:  mask = 0xFF0F;
   18794                assign(withZs, mkV128from32s(z32, z32, u32, z32));
   18795                break;
   18796       case 0:  mask = 0xFFF0;
   18797                assign(withZs, mkV128from32s(z32, z32, z32, u32));
   18798                break;
   18799       default: vassert(0);
   18800    }
   18801 
   18802    IRTemp res = newTemp(Ity_V128);
   18803    assign(res, binop( Iop_OrV128,
   18804                       mkexpr(withZs),
   18805                       binop( Iop_AndV128, mkexpr(v128), mkV128(mask) ) ) );
   18806    return res;
   18807 }
   18808 
   18809 
   18810 static IRTemp math_PINSRQ_128 ( IRTemp v128, IRTemp u64, UInt imm8 )
   18811 {
   18812    /* Surround u64 with zeroes as per imm, giving us something we can
   18813       OR into a suitably masked-out v128.*/
   18814    IRTemp withZs = newTemp(Ity_V128);
   18815    UShort mask = 0;
   18816    if (imm8 == 0) {
   18817       mask = 0xFF00;
   18818       assign(withZs, binop(Iop_64HLtoV128, mkU64(0), mkexpr(u64)));
   18819    } else {
   18820       vassert(imm8 == 1);
   18821       mask = 0x00FF;
   18822       assign( withZs, binop(Iop_64HLtoV128, mkexpr(u64), mkU64(0)));
   18823    }
   18824 
   18825    IRTemp res = newTemp(Ity_V128);
   18826    assign( res, binop( Iop_OrV128,
   18827                        mkexpr(withZs),
   18828                        binop( Iop_AndV128, mkexpr(v128), mkV128(mask) ) ) );
   18829    return res;
   18830 }
   18831 
   18832 
   18833 static IRTemp math_INSERTPS ( IRTemp dstV, IRTemp toInsertD, UInt imm8 )
   18834 {
   18835    const IRTemp inval = IRTemp_INVALID;
   18836    IRTemp dstDs[4] = { inval, inval, inval, inval };
   18837    breakupV128to32s( dstV, &dstDs[3], &dstDs[2], &dstDs[1], &dstDs[0] );
   18838 
   18839    vassert(imm8 <= 255);
   18840    dstDs[(imm8 >> 4) & 3] = toInsertD; /* "imm8_count_d" */
   18841 
   18842    UInt imm8_zmask = (imm8 & 15);
   18843    IRTemp zero_32 = newTemp(Ity_I32);
   18844    assign( zero_32, mkU32(0) );
   18845    IRTemp resV = newTemp(Ity_V128);
   18846    assign( resV, mkV128from32s(
   18847                     ((imm8_zmask & 8) == 8) ? zero_32 : dstDs[3],
   18848                     ((imm8_zmask & 4) == 4) ? zero_32 : dstDs[2],
   18849                     ((imm8_zmask & 2) == 2) ? zero_32 : dstDs[1],
   18850                     ((imm8_zmask & 1) == 1) ? zero_32 : dstDs[0]) );
   18851    return resV;
   18852 }
   18853 
   18854 
/* PEXTRB: extract the byte of XMM reg G selected by imm8[3:0], and
   either zero-extend it into the 64-bit integer register E, or store
   the single byte to memory at E.  Higher bits of imm8 are ignored.
   Returns the updated delta. */
static Long dis_PEXTRB_128_GtoE ( const VexAbiInfo* vbi, Prefix pfx,
                                  Long delta, Bool isAvx )
{
   IRTemp addr     = IRTemp_INVALID;
   Int    alen     = 0;
   HChar  dis_buf[50];
   IRTemp xmm_vec  = newTemp(Ity_V128);
   IRTemp sel_lane = newTemp(Ity_I32);  /* the selected 32-bit lane */
   IRTemp shr_lane = newTemp(Ity_I32);  /* lane with wanted byte at bits 7:0 */
   const HChar* mbV = isAvx ? "v" : "";
   UChar  modrm    = getUChar(delta);
   IRTemp t3, t2, t1, t0;
   Int    imm8;
   assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
   t3 = t2 = t1 = t0 = IRTemp_INVALID;
   breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );

   /* The immediate follows the modrm byte (reg case) or the address
      mode bytes (mem case). */
   if ( epartIsReg( modrm ) ) {
      imm8 = (Int)getUChar(delta+1);
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
      imm8 = (Int)getUChar(delta+alen);
   }
   /* imm8[3:2] picks the 32-bit lane ... */
   switch ( (imm8 >> 2) & 3 ) {
      case 0:  assign( sel_lane, mkexpr(t0) ); break;
      case 1:  assign( sel_lane, mkexpr(t1) ); break;
      case 2:  assign( sel_lane, mkexpr(t2) ); break;
      case 3:  assign( sel_lane, mkexpr(t3) ); break;
      default: vassert(0);
   }
   /* ... and imm8[1:0] picks the byte within that lane. */
   assign( shr_lane,
           binop( Iop_Shr32, mkexpr(sel_lane), mkU8(((imm8 & 3)*8)) ) );

   if ( epartIsReg( modrm ) ) {
      /* Register destination: zero-extend the byte to 64 bits. */
      putIReg64( eregOfRexRM(pfx,modrm),
                 unop( Iop_32Uto64,
                       binop(Iop_And32, mkexpr(shr_lane), mkU32(255)) ) );
      delta += 1+1;
      DIP( "%spextrb $%d, %s,%s\n", mbV, imm8,
           nameXMMReg( gregOfRexRM(pfx, modrm) ),
           nameIReg64( eregOfRexRM(pfx, modrm) ) );
   } else {
      /* Memory destination: store just the one byte. */
      storeLE( mkexpr(addr), unop(Iop_32to8, mkexpr(shr_lane) ) );
      delta += alen+1;
      DIP( "%spextrb $%d,%s,%s\n", mbV,
           imm8, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
   }

   return delta;
}
   18905 
   18906 
   18907 static IRTemp math_DPPD_128 ( IRTemp src_vec, IRTemp dst_vec, UInt imm8 )
   18908 {
   18909    vassert(imm8 < 256);
   18910    UShort imm8_perms[4] = { 0x0000, 0x00FF, 0xFF00, 0xFFFF };
   18911    IRTemp and_vec = newTemp(Ity_V128);
   18912    IRTemp sum_vec = newTemp(Ity_V128);
   18913    IRTemp rm      = newTemp(Ity_I32);
   18914    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   18915    assign( and_vec, binop( Iop_AndV128,
   18916                            triop( Iop_Mul64Fx2,
   18917                                   mkexpr(rm),
   18918                                   mkexpr(dst_vec), mkexpr(src_vec) ),
   18919                            mkV128( imm8_perms[ ((imm8 >> 4) & 3) ] ) ) );
   18920 
   18921    assign( sum_vec, binop( Iop_Add64F0x2,
   18922                            binop( Iop_InterleaveHI64x2,
   18923                                   mkexpr(and_vec), mkexpr(and_vec) ),
   18924                            binop( Iop_InterleaveLO64x2,
   18925                                   mkexpr(and_vec), mkexpr(and_vec) ) ) );
   18926    IRTemp res = newTemp(Ity_V128);
   18927    assign(res, binop( Iop_AndV128,
   18928                       binop( Iop_InterleaveLO64x2,
   18929                              mkexpr(sum_vec), mkexpr(sum_vec) ),
   18930                       mkV128( imm8_perms[ (imm8 & 3) ] ) ) );
   18931    return res;
   18932 }
   18933 
   18934 
   18935 static IRTemp math_DPPS_128 ( IRTemp src_vec, IRTemp dst_vec, UInt imm8 )
   18936 {
   18937    vassert(imm8 < 256);
   18938    IRTemp tmp_prod_vec = newTemp(Ity_V128);
   18939    IRTemp prod_vec     = newTemp(Ity_V128);
   18940    IRTemp sum_vec      = newTemp(Ity_V128);
   18941    IRTemp rm           = newTemp(Ity_I32);
   18942    IRTemp v3, v2, v1, v0;
   18943    v3 = v2 = v1 = v0   = IRTemp_INVALID;
   18944    UShort imm8_perms[16] = { 0x0000, 0x000F, 0x00F0, 0x00FF, 0x0F00,
   18945                              0x0F0F, 0x0FF0, 0x0FFF, 0xF000, 0xF00F,
   18946                              0xF0F0, 0xF0FF, 0xFF00, 0xFF0F, 0xFFF0,
   18947                              0xFFFF };
   18948 
   18949    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   18950    assign( tmp_prod_vec,
   18951            binop( Iop_AndV128,
   18952                   triop( Iop_Mul32Fx4,
   18953                          mkexpr(rm), mkexpr(dst_vec), mkexpr(src_vec) ),
   18954                   mkV128( imm8_perms[((imm8 >> 4)& 15)] ) ) );
   18955    breakupV128to32s( tmp_prod_vec, &v3, &v2, &v1, &v0 );
   18956    assign( prod_vec, mkV128from32s( v3, v1, v2, v0 ) );
   18957 
   18958    assign( sum_vec, triop( Iop_Add32Fx4,
   18959                            mkexpr(rm),
   18960                            binop( Iop_InterleaveHI32x4,
   18961                                   mkexpr(prod_vec), mkexpr(prod_vec) ),
   18962                            binop( Iop_InterleaveLO32x4,
   18963                                   mkexpr(prod_vec), mkexpr(prod_vec) ) ) );
   18964 
   18965    IRTemp res = newTemp(Ity_V128);
   18966    assign( res, binop( Iop_AndV128,
   18967                        triop( Iop_Add32Fx4,
   18968                               mkexpr(rm),
   18969                               binop( Iop_InterleaveHI32x4,
   18970                                      mkexpr(sum_vec), mkexpr(sum_vec) ),
   18971                               binop( Iop_InterleaveLO32x4,
   18972                                      mkexpr(sum_vec), mkexpr(sum_vec) ) ),
   18973                        mkV128( imm8_perms[ (imm8 & 15) ] ) ) );
   18974    return res;
   18975 }
   18976 
   18977 
   18978 static IRTemp math_MPSADBW_128 ( IRTemp dst_vec, IRTemp src_vec, UInt imm8 )
   18979 {
   18980    /* Mask out bits of the operands we don't need.  This isn't
   18981       strictly necessary, but it does ensure Memcheck doesn't
   18982       give us any false uninitialised value errors as a
   18983       result. */
   18984    UShort src_mask[4] = { 0x000F, 0x00F0, 0x0F00, 0xF000 };
   18985    UShort dst_mask[2] = { 0x07FF, 0x7FF0 };
   18986 
   18987    IRTemp src_maskV = newTemp(Ity_V128);
   18988    IRTemp dst_maskV = newTemp(Ity_V128);
   18989    assign(src_maskV, mkV128( src_mask[ imm8 & 3 ] ));
   18990    assign(dst_maskV, mkV128( dst_mask[ (imm8 >> 2) & 1 ] ));
   18991 
   18992    IRTemp src_masked = newTemp(Ity_V128);
   18993    IRTemp dst_masked = newTemp(Ity_V128);
   18994    assign(src_masked, binop(Iop_AndV128, mkexpr(src_vec), mkexpr(src_maskV)));
   18995    assign(dst_masked, binop(Iop_AndV128, mkexpr(dst_vec), mkexpr(dst_maskV)));
   18996 
   18997    /* Generate 4 64 bit values that we can hand to a clean helper */
   18998    IRTemp sHi = newTemp(Ity_I64);
   18999    IRTemp sLo = newTemp(Ity_I64);
   19000    assign( sHi, unop(Iop_V128HIto64, mkexpr(src_masked)) );
   19001    assign( sLo, unop(Iop_V128to64,   mkexpr(src_masked)) );
   19002 
   19003    IRTemp dHi = newTemp(Ity_I64);
   19004    IRTemp dLo = newTemp(Ity_I64);
   19005    assign( dHi, unop(Iop_V128HIto64, mkexpr(dst_masked)) );
   19006    assign( dLo, unop(Iop_V128to64,   mkexpr(dst_masked)) );
   19007 
   19008    /* Compute halves of the result separately */
   19009    IRTemp resHi = newTemp(Ity_I64);
   19010    IRTemp resLo = newTemp(Ity_I64);
   19011 
   19012    IRExpr** argsHi
   19013       = mkIRExprVec_5( mkexpr(sHi), mkexpr(sLo), mkexpr(dHi), mkexpr(dLo),
   19014                        mkU64( 0x80 | (imm8 & 7) ));
   19015    IRExpr** argsLo
   19016       = mkIRExprVec_5( mkexpr(sHi), mkexpr(sLo), mkexpr(dHi), mkexpr(dLo),
   19017                        mkU64( 0x00 | (imm8 & 7) ));
   19018 
   19019    assign(resHi, mkIRExprCCall( Ity_I64, 0/*regparm*/,
   19020                                 "amd64g_calc_mpsadbw",
   19021                                 &amd64g_calc_mpsadbw, argsHi ));
   19022    assign(resLo, mkIRExprCCall( Ity_I64, 0/*regparm*/,
   19023                                 "amd64g_calc_mpsadbw",
   19024                                 &amd64g_calc_mpsadbw, argsLo ));
   19025 
   19026    IRTemp res = newTemp(Ity_V128);
   19027    assign(res, binop(Iop_64HLtoV128, mkexpr(resHi), mkexpr(resLo)));
   19028    return res;
   19029 }
   19030 
/* EXTRACTPS: copy the 32-bit lane of XMM reg G selected by imm8[1:0]
   into the 32-bit integer register E, or store it to memory at E.
   Returns the updated delta. */
static Long dis_EXTRACTPS ( const VexAbiInfo* vbi, Prefix pfx,
                            Long delta, Bool isAvx )
{
   IRTemp addr       = IRTemp_INVALID;
   Int    alen       = 0;
   HChar  dis_buf[50];
   UChar  modrm      = getUChar(delta);
   Int imm8_10;                        /* imm8 bits [1:0]: lane select */
   IRTemp xmm_vec    = newTemp(Ity_V128);
   IRTemp src_dword  = newTemp(Ity_I32);
   UInt   rG         = gregOfRexRM(pfx,modrm);
   IRTemp t3, t2, t1, t0;
   t3 = t2 = t1 = t0 = IRTemp_INVALID;

   assign( xmm_vec, getXMMReg( rG ) );
   breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );

   /* The immediate follows the modrm byte (reg case) or the address
      mode bytes (mem case); only its low two bits are used. */
   if ( epartIsReg( modrm ) ) {
      imm8_10 = (Int)(getUChar(delta+1) & 3);
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
      imm8_10 = (Int)(getUChar(delta+alen) & 3);
   }

   switch ( imm8_10 ) {
      case 0:  assign( src_dword, mkexpr(t0) ); break;
      case 1:  assign( src_dword, mkexpr(t1) ); break;
      case 2:  assign( src_dword, mkexpr(t2) ); break;
      case 3:  assign( src_dword, mkexpr(t3) ); break;
      default: vassert(0);
   }

   if ( epartIsReg( modrm ) ) {
      UInt rE = eregOfRexRM(pfx,modrm);
      putIReg32( rE, mkexpr(src_dword) );
      delta += 1+1;
      DIP( "%sextractps $%d, %s,%s\n", isAvx ? "v" : "", imm8_10,
           nameXMMReg( rG ), nameIReg32( rE ) );
   } else {
      storeLE( mkexpr(addr), mkexpr(src_dword) );
      delta += alen+1;
      DIP( "%sextractps $%d, %s,%s\n", isAvx ? "v" : "", imm8_10,
           nameXMMReg( rG ), dis_buf );
   }

   return delta;
}
   19078 
   19079 
   19080 static IRTemp math_PCLMULQDQ( IRTemp dV, IRTemp sV, UInt imm8 )
   19081 {
   19082    IRTemp t0 = newTemp(Ity_I64);
   19083    IRTemp t1 = newTemp(Ity_I64);
   19084    assign(t0, unop((imm8&1)? Iop_V128HIto64 : Iop_V128to64,
   19085               mkexpr(dV)));
   19086    assign(t1, unop((imm8&16) ? Iop_V128HIto64 : Iop_V128to64,
   19087               mkexpr(sV)));
   19088 
   19089    IRTemp t2 = newTemp(Ity_I64);
   19090    IRTemp t3 = newTemp(Ity_I64);
   19091 
   19092    IRExpr** args;
   19093 
   19094    args = mkIRExprVec_3(mkexpr(t0), mkexpr(t1), mkU64(0));
   19095    assign(t2, mkIRExprCCall(Ity_I64,0, "amd64g_calculate_pclmul",
   19096                             &amd64g_calculate_pclmul, args));
   19097    args = mkIRExprVec_3(mkexpr(t0), mkexpr(t1), mkU64(1));
   19098    assign(t3, mkIRExprCCall(Ity_I64,0, "amd64g_calculate_pclmul",
   19099                             &amd64g_calculate_pclmul, args));
   19100 
   19101    IRTemp res     = newTemp(Ity_V128);
   19102    assign(res, binop(Iop_64HLtoV128, mkexpr(t3), mkexpr(t2)));
   19103    return res;
   19104 }
   19105 
   19106 
   19107 __attribute__((noinline))
   19108 static
   19109 Long dis_ESC_0F3A__SSE4 ( Bool* decode_OK,
   19110                           const VexAbiInfo* vbi,
   19111                           Prefix pfx, Int sz, Long deltaIN )
   19112 {
   19113    IRTemp addr  = IRTemp_INVALID;
   19114    UChar  modrm = 0;
   19115    Int    alen  = 0;
   19116    HChar  dis_buf[50];
   19117 
   19118    *decode_OK = False;
   19119 
   19120    Long   delta = deltaIN;
   19121    UChar  opc   = getUChar(delta);
   19122    delta++;
   19123    switch (opc) {
   19124 
   19125    case 0x08:
   19126       /* 66 0F 3A 08 /r ib = ROUNDPS imm8, xmm2/m128, xmm1 */
   19127       if (have66noF2noF3(pfx) && sz == 2) {
   19128 
   19129          IRTemp src0 = newTemp(Ity_F32);
   19130          IRTemp src1 = newTemp(Ity_F32);
   19131          IRTemp src2 = newTemp(Ity_F32);
   19132          IRTemp src3 = newTemp(Ity_F32);
   19133          IRTemp res0 = newTemp(Ity_F32);
   19134          IRTemp res1 = newTemp(Ity_F32);
   19135          IRTemp res2 = newTemp(Ity_F32);
   19136          IRTemp res3 = newTemp(Ity_F32);
   19137          IRTemp rm   = newTemp(Ity_I32);
   19138          Int    imm  = 0;
   19139 
   19140          modrm = getUChar(delta);
   19141 
   19142          if (epartIsReg(modrm)) {
   19143             assign( src0,
   19144                     getXMMRegLane32F( eregOfRexRM(pfx, modrm), 0 ) );
   19145             assign( src1,
   19146                     getXMMRegLane32F( eregOfRexRM(pfx, modrm), 1 ) );
   19147             assign( src2,
   19148                     getXMMRegLane32F( eregOfRexRM(pfx, modrm), 2 ) );
   19149             assign( src3,
   19150                     getXMMRegLane32F( eregOfRexRM(pfx, modrm), 3 ) );
   19151             imm = getUChar(delta+1);
   19152             if (imm & ~15) goto decode_failure;
   19153             delta += 1+1;
   19154             DIP( "roundps $%d,%s,%s\n",
   19155                  imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
   19156                       nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   19157          } else {
   19158             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   19159             gen_SEGV_if_not_16_aligned(addr);
   19160             assign( src0, loadLE(Ity_F32,
   19161                                  binop(Iop_Add64, mkexpr(addr), mkU64(0) )));
   19162             assign( src1, loadLE(Ity_F32,
   19163                                  binop(Iop_Add64, mkexpr(addr), mkU64(4) )));
   19164             assign( src2, loadLE(Ity_F32,
   19165                                  binop(Iop_Add64, mkexpr(addr), mkU64(8) )));
   19166             assign( src3, loadLE(Ity_F32,
   19167                                  binop(Iop_Add64, mkexpr(addr), mkU64(12) )));
   19168             imm = getUChar(delta+alen);
   19169             if (imm & ~15) goto decode_failure;
   19170             delta += alen+1;
   19171             DIP( "roundps $%d,%s,%s\n",
   19172                  imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   19173          }
   19174 
   19175          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   19176             that encoding is the same as the encoding for IRRoundingMode,
   19177             we can use that value directly in the IR as a rounding
   19178             mode. */
   19179          assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
   19180 
   19181          assign(res0, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src0)) );
   19182          assign(res1, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src1)) );
   19183          assign(res2, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src2)) );
   19184          assign(res3, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src3)) );
   19185 
   19186          putXMMRegLane32F( gregOfRexRM(pfx, modrm), 0, mkexpr(res0) );
   19187          putXMMRegLane32F( gregOfRexRM(pfx, modrm), 1, mkexpr(res1) );
   19188          putXMMRegLane32F( gregOfRexRM(pfx, modrm), 2, mkexpr(res2) );
   19189          putXMMRegLane32F( gregOfRexRM(pfx, modrm), 3, mkexpr(res3) );
   19190 
   19191          goto decode_success;
   19192       }
   19193       break;
   19194 
   19195    case 0x09:
   19196       /* 66 0F 3A 09 /r ib = ROUNDPD imm8, xmm2/m128, xmm1 */
   19197       if (have66noF2noF3(pfx) && sz == 2) {
   19198 
   19199          IRTemp src0 = newTemp(Ity_F64);
   19200          IRTemp src1 = newTemp(Ity_F64);
   19201          IRTemp res0 = newTemp(Ity_F64);
   19202          IRTemp res1 = newTemp(Ity_F64);
   19203          IRTemp rm   = newTemp(Ity_I32);
   19204          Int    imm  = 0;
   19205 
   19206          modrm = getUChar(delta);
   19207 
   19208          if (epartIsReg(modrm)) {
   19209             assign( src0,
   19210                     getXMMRegLane64F( eregOfRexRM(pfx, modrm), 0 ) );
   19211             assign( src1,
   19212                     getXMMRegLane64F( eregOfRexRM(pfx, modrm), 1 ) );
   19213             imm = getUChar(delta+1);
   19214             if (imm & ~15) goto decode_failure;
   19215             delta += 1+1;
   19216             DIP( "roundpd $%d,%s,%s\n",
   19217                  imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
   19218                       nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   19219          } else {
   19220             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   19221             gen_SEGV_if_not_16_aligned(addr);
   19222             assign( src0, loadLE(Ity_F64,
   19223                                  binop(Iop_Add64, mkexpr(addr), mkU64(0) )));
   19224             assign( src1, loadLE(Ity_F64,
   19225                                  binop(Iop_Add64, mkexpr(addr), mkU64(8) )));
   19226             imm = getUChar(delta+alen);
   19227             if (imm & ~15) goto decode_failure;
   19228             delta += alen+1;
   19229             DIP( "roundpd $%d,%s,%s\n",
   19230                  imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   19231          }
   19232 
   19233          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   19234             that encoding is the same as the encoding for IRRoundingMode,
   19235             we can use that value directly in the IR as a rounding
   19236             mode. */
   19237          assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
   19238 
   19239          assign(res0, binop(Iop_RoundF64toInt, mkexpr(rm), mkexpr(src0)) );
   19240          assign(res1, binop(Iop_RoundF64toInt, mkexpr(rm), mkexpr(src1)) );
   19241 
   19242          putXMMRegLane64F( gregOfRexRM(pfx, modrm), 0, mkexpr(res0) );
   19243          putXMMRegLane64F( gregOfRexRM(pfx, modrm), 1, mkexpr(res1) );
   19244 
   19245          goto decode_success;
   19246       }
   19247       break;
   19248 
   case 0x0A:
   case 0x0B:
      /* 66 0F 3A 0A /r ib = ROUNDSS imm8, xmm2/m32, xmm1
         66 0F 3A 0B /r ib = ROUNDSD imm8, xmm2/m64, xmm1
      */
      if (have66noF2noF3(pfx) && sz == 2) {

         /* opc selects the scalar width: 0x0B is the double variant. */
         Bool   isD = opc == 0x0B;
         IRTemp src = newTemp(isD ? Ity_F64 : Ity_F32);
         IRTemp res = newTemp(isD ? Ity_F64 : Ity_F32);
         Int    imm = 0;

         modrm = getUChar(delta);

         if (epartIsReg(modrm)) {
            /* Source is the low lane of the E register. */
            assign( src,
                    isD ? getXMMRegLane64F( eregOfRexRM(pfx, modrm), 0 )
                        : getXMMRegLane32F( eregOfRexRM(pfx, modrm), 0 ) );
            imm = getUChar(delta+1);
            /* imm8 bits 7:4 must be zero; otherwise refuse the decode. */
            if (imm & ~15) goto decode_failure;
            delta += 1+1;
            DIP( "rounds%c $%d,%s,%s\n",
                 isD ? 'd' : 's',
                 imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
                      nameXMMReg( gregOfRexRM(pfx, modrm) ) );
         } else {
            /* Memory source.  Note: unlike the packed (roundpd) case
               above, no 16-alignment check is generated for this
               scalar access. */
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
            imm = getUChar(delta+alen);
            if (imm & ~15) goto decode_failure;
            delta += alen+1;
            DIP( "rounds%c $%d,%s,%s\n",
                 isD ? 'd' : 's',
                 imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
         }

         /* (imm & 3) contains an Intel-encoded rounding mode.  Because
            that encoding is the same as the encoding for IRRoundingMode,
            we can use that value directly in the IR as a rounding
            mode.  If imm bit 2 is set, the current MXCSR rounding mode
            is used instead. */
         assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
                           (imm & 4) ? get_sse_roundingmode()
                                     : mkU32(imm & 3),
                           mkexpr(src)) );

         /* Only the low lane of the G register is updated. */
         if (isD)
            putXMMRegLane64F( gregOfRexRM(pfx, modrm), 0, mkexpr(res) );
         else
            putXMMRegLane32F( gregOfRexRM(pfx, modrm), 0, mkexpr(res) );

         goto decode_success;
      }
      break;
   19302 
   19303    case 0x0C:
   19304       /* 66 0F 3A 0C /r ib = BLENDPS xmm1, xmm2/m128, imm8
   19305          Blend Packed Single Precision Floating-Point Values (XMM) */
   19306       if (have66noF2noF3(pfx) && sz == 2) {
   19307 
   19308          Int imm8;
   19309          IRTemp dst_vec = newTemp(Ity_V128);
   19310          IRTemp src_vec = newTemp(Ity_V128);
   19311 
   19312          modrm = getUChar(delta);
   19313 
   19314          assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
   19315 
   19316          if ( epartIsReg( modrm ) ) {
   19317             imm8 = (Int)getUChar(delta+1);
   19318             assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
   19319             delta += 1+1;
   19320             DIP( "blendps $%d, %s,%s\n", imm8,
   19321                  nameXMMReg( eregOfRexRM(pfx, modrm) ),
   19322                  nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   19323          } else {
   19324             addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   19325                              1/* imm8 is 1 byte after the amode */ );
   19326             gen_SEGV_if_not_16_aligned( addr );
   19327             assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
   19328             imm8 = (Int)getUChar(delta+alen);
   19329             delta += alen+1;
   19330             DIP( "blendpd $%d, %s,%s\n",
   19331                  imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   19332          }
   19333 
   19334          putXMMReg( gregOfRexRM(pfx, modrm),
   19335                     mkexpr( math_BLENDPS_128( src_vec, dst_vec, imm8) ) );
   19336          goto decode_success;
   19337       }
   19338       break;
   19339 
   case 0x0D:
      /* 66 0F 3A 0D /r ib = BLENDPD xmm1, xmm2/m128, imm8
         Blend Packed Double Precision Floating-Point Values (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {

         Int imm8;
         IRTemp dst_vec = newTemp(Ity_V128);
         IRTemp src_vec = newTemp(Ity_V128);

         modrm = getUChar(delta);
         /* dst_vec = current value of the destination (G) register;
            imm8 selects per-64-bit-lane between src_vec and dst_vec. */
         assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );

         if ( epartIsReg( modrm ) ) {
            imm8 = (Int)getUChar(delta+1);
            assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
            delta += 1+1;
            DIP( "blendpd $%d, %s,%s\n", imm8,
                 nameXMMReg( eregOfRexRM(pfx, modrm) ),
                 nameXMMReg( gregOfRexRM(pfx, modrm) ) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
                             1/* imm8 is 1 byte after the amode */ );
            /* 128-bit memory operand must be 16-aligned. */
            gen_SEGV_if_not_16_aligned( addr );
            assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
            imm8 = (Int)getUChar(delta+alen);
            delta += alen+1;
            DIP( "blendpd $%d, %s,%s\n",
                 imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
         }

         putXMMReg( gregOfRexRM(pfx, modrm),
                    mkexpr( math_BLENDPD_128( src_vec, dst_vec, imm8) ) );
         goto decode_success;
      }
      break;

   case 0x0E:
      /* 66 0F 3A 0E /r ib = PBLENDW xmm1, xmm2/m128, imm8
         Blend Packed Words (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {

         Int imm8;
         IRTemp dst_vec = newTemp(Ity_V128);
         IRTemp src_vec = newTemp(Ity_V128);

         modrm = getUChar(delta);

         /* Same shape as BLENDPD above, but imm8 selects per 16-bit
            lane (one bit per word). */
         assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );

         if ( epartIsReg( modrm ) ) {
            imm8 = (Int)getUChar(delta+1);
            assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
            delta += 1+1;
            DIP( "pblendw $%d, %s,%s\n", imm8,
                 nameXMMReg( eregOfRexRM(pfx, modrm) ),
                 nameXMMReg( gregOfRexRM(pfx, modrm) ) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
                             1/* imm8 is 1 byte after the amode */ );
            gen_SEGV_if_not_16_aligned( addr );
            assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
            imm8 = (Int)getUChar(delta+alen);
            delta += alen+1;
            DIP( "pblendw $%d, %s,%s\n",
                 imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
         }

         putXMMReg( gregOfRexRM(pfx, modrm),
                    mkexpr( math_PBLENDW_128( src_vec, dst_vec, imm8) ) );
         goto decode_success;
      }
      break;
   19412 
   case 0x14:
      /* 66 0F 3A 14 /r ib = PEXTRB r/m16, xmm, imm8
         Extract Byte from xmm, store in mem or zero-extend + store in gen.reg.
         (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         /* Fully delegated; the helper consumes modrm + imm8 and
            returns the updated instruction offset. */
         delta = dis_PEXTRB_128_GtoE( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;

   case 0x15:
      /* 66 0F 3A 15 /r ib = PEXTRW r/m16, xmm, imm8
         Extract Word from xmm, store in mem or zero-extend + store in gen.reg.
         (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PEXTRW( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;

   case 0x16:
      /* 66 no-REX.W 0F 3A 16 /r ib = PEXTRD reg/mem32, xmm2, imm8
         Extract Doubleword int from xmm reg and store in gen.reg or mem. (XMM)
         Note that this insn has the same opcodes as PEXTRQ, but
         here the REX.W bit is _not_ present */
      if (have66noF2noF3(pfx)
          && sz == 2 /* REX.W is _not_ present */) {
         delta = dis_PEXTRD( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      /* 66 REX.W 0F 3A 16 /r ib = PEXTRQ reg/mem64, xmm2, imm8
         Extract Quadword int from xmm reg and store in gen.reg or mem. (XMM)
         Note that this insn has the same opcodes as PEXTRD, but
         here the REX.W bit is present */
      if (have66noF2noF3(pfx)
          && sz == 8 /* REX.W is present */) {
         delta = dis_PEXTRQ( vbi, pfx, delta, False/*!isAvx*/);
         goto decode_success;
      }
      break;

   case 0x17:
      /* 66 0F 3A 17 /r ib = EXTRACTPS reg/mem32, xmm2, imm8 Extract
         float from xmm reg and store in gen.reg or mem.  This is
         identical to PEXTRD, except that REX.W appears to be ignored.
      */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         delta = dis_EXTRACTPS( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;
   19465 
   case 0x20:
      /* 66 0F 3A 20 /r ib = PINSRB xmm1, r32/m8, imm8
         Extract byte from r32/m8 and insert into xmm1 */
      if (have66noF2noF3(pfx) && sz == 2) {
         Int    imm8;
         IRTemp new8 = newTemp(Ity_I8);
         modrm = getUChar(delta);
         UInt rG = gregOfRexRM(pfx, modrm);
         if ( epartIsReg( modrm ) ) {
            UInt rE = eregOfRexRM(pfx,modrm);
            /* Only imm8 bits 3:0 matter (byte lane index 0..15). */
            imm8 = (Int)(getUChar(delta+1) & 0xF);
            assign( new8, unop(Iop_32to8, getIReg32(rE)) );
            delta += 1+1;
            DIP( "pinsrb $%d,%s,%s\n", imm8,
                 nameIReg32(rE), nameXMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            imm8 = (Int)(getUChar(delta+alen) & 0xF);
            assign( new8, loadLE( Ity_I8, mkexpr(addr) ) );
            delta += alen+1;
            DIP( "pinsrb $%d,%s,%s\n",
                 imm8, dis_buf, nameXMMReg(rG) );
         }
         /* src_vec is the destination register's current value; the
            helper replaces byte lane imm8 with new8. */
         IRTemp src_vec = newTemp(Ity_V128);
         assign(src_vec, getXMMReg( gregOfRexRM(pfx, modrm) ));
         IRTemp res = math_PINSRB_128( src_vec, new8, imm8 );
         putXMMReg( rG, mkexpr(res) );
         goto decode_success;
      }
      break;
   19496 
   case 0x21:
      /* 66 0F 3A 21 /r ib = INSERTPS imm8, xmm2/m32, xmm1
         Insert Packed Single Precision Floating-Point Value (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         UInt   imm8;
         IRTemp d2ins = newTemp(Ity_I32); /* comes from the E part */
         const IRTemp inval = IRTemp_INVALID;

         modrm = getUChar(delta);
         UInt rG = gregOfRexRM(pfx, modrm);

         if ( epartIsReg( modrm ) ) {
            UInt   rE = eregOfRexRM(pfx, modrm);
            IRTemp vE = newTemp(Ity_V128);
            assign( vE, getXMMReg(rE) );
            IRTemp dsE[4] = { inval, inval, inval, inval };
            breakupV128to32s( vE, &dsE[3], &dsE[2], &dsE[1], &dsE[0] );
            imm8 = getUChar(delta+1);
            /* Note: d2ins (the IRTemp variable, not its contents) is
               overwritten here with the lane temp selected by imm8
               bits 7:6; the newTemp'd I32 above goes unused on this
               path. */
            d2ins = dsE[(imm8 >> 6) & 3]; /* "imm8_count_s" */
            delta += 1+1;
            DIP( "insertps $%u, %s,%s\n",
                 imm8, nameXMMReg(rE), nameXMMReg(rG) );
         } else {
            /* Memory source: always a 32-bit load; imm8 bits 7:6 are
               not used to select a source lane on this path. */
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( d2ins, loadLE( Ity_I32, mkexpr(addr) ) );
            imm8 = getUChar(delta+alen);
            delta += alen+1;
            DIP( "insertps $%u, %s,%s\n",
                 imm8, dis_buf, nameXMMReg(rG) );
         }

         IRTemp vG = newTemp(Ity_V128);
         assign( vG, getXMMReg(rG) );

         /* math_INSERTPS uses the remaining imm8 fields to place
            d2ins into vG and zero selected lanes. */
         putXMMReg( rG, mkexpr(math_INSERTPS( vG, d2ins, imm8 )) );
         goto decode_success;
      }
      break;
   19535 
   case 0x22:
      /* 66 no-REX.W 0F 3A 22 /r ib = PINSRD xmm1, r/m32, imm8
         Extract Doubleword int from gen.reg/mem32 and insert into xmm1 */
      if (have66noF2noF3(pfx)
          && sz == 2 /* REX.W is NOT present */) {
         /* Only imm8 bits 1:0 matter (dword lane index 0..3), hence
            the name imm8_10. */
         Int    imm8_10;
         IRTemp src_u32 = newTemp(Ity_I32);
         modrm = getUChar(delta);
         UInt rG = gregOfRexRM(pfx, modrm);

         if ( epartIsReg( modrm ) ) {
            UInt rE = eregOfRexRM(pfx,modrm);
            imm8_10 = (Int)(getUChar(delta+1) & 3);
            assign( src_u32, getIReg32( rE ) );
            delta += 1+1;
            DIP( "pinsrd $%d, %s,%s\n",
                 imm8_10, nameIReg32(rE), nameXMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            imm8_10 = (Int)(getUChar(delta+alen) & 3);
            assign( src_u32, loadLE( Ity_I32, mkexpr(addr) ) );
            delta += alen+1;
            DIP( "pinsrd $%d, %s,%s\n",
                 imm8_10, dis_buf, nameXMMReg(rG) );
         }

         /* Insert src_u32 into lane imm8_10 of the current G value. */
         IRTemp src_vec = newTemp(Ity_V128);
         assign(src_vec, getXMMReg( rG ));
         IRTemp res_vec = math_PINSRD_128( src_vec, src_u32, imm8_10 );
         putXMMReg( rG, mkexpr(res_vec) );
         goto decode_success;
      }
      /* 66 REX.W 0F 3A 22 /r ib = PINSRQ xmm1, r/m64, imm8
         Extract Quadword int from gen.reg/mem64 and insert into xmm1 */
      if (have66noF2noF3(pfx)
          && sz == 8 /* REX.W is present */) {
         /* Only imm8 bit 0 matters (qword lane index 0..1). */
         Int imm8_0;
         IRTemp src_u64 = newTemp(Ity_I64);
         modrm = getUChar(delta);
         UInt rG = gregOfRexRM(pfx, modrm);

         if ( epartIsReg( modrm ) ) {
            UInt rE = eregOfRexRM(pfx,modrm);
            imm8_0 = (Int)(getUChar(delta+1) & 1);
            assign( src_u64, getIReg64( rE ) );
            delta += 1+1;
            DIP( "pinsrq $%d, %s,%s\n",
                 imm8_0, nameIReg64(rE), nameXMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            imm8_0 = (Int)(getUChar(delta+alen) & 1);
            assign( src_u64, loadLE( Ity_I64, mkexpr(addr) ) );
            delta += alen+1;
            DIP( "pinsrq $%d, %s,%s\n",
                 imm8_0, dis_buf, nameXMMReg(rG) );
         }

         IRTemp src_vec = newTemp(Ity_V128);
         assign(src_vec, getXMMReg( rG ));
         IRTemp res_vec = math_PINSRQ_128( src_vec, src_u64, imm8_0 );
         putXMMReg( rG, mkexpr(res_vec) );
         goto decode_success;
      }
      break;
   19600 
   case 0x40:
      /* 66 0F 3A 40 /r ib = DPPS xmm1, xmm2/m128, imm8
         Dot Product of Packed Single Precision Floating-Point Values (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         modrm = getUChar(delta);
         Int    imm8;
         IRTemp src_vec = newTemp(Ity_V128);
         IRTemp dst_vec = newTemp(Ity_V128);
         UInt   rG      = gregOfRexRM(pfx, modrm);
         assign( dst_vec, getXMMReg( rG ) );
         if ( epartIsReg( modrm ) ) {
            UInt rE = eregOfRexRM(pfx, modrm);
            imm8 = (Int)getUChar(delta+1);
            assign( src_vec, getXMMReg(rE) );
            delta += 1+1;
            DIP( "dpps $%d, %s,%s\n",
                 imm8, nameXMMReg(rE), nameXMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
                             1/* imm8 is 1 byte after the amode */ );
            /* 128-bit memory operand must be 16-aligned. */
            gen_SEGV_if_not_16_aligned( addr );
            assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
            imm8 = (Int)getUChar(delta+alen);
            delta += alen+1;
            DIP( "dpps $%d, %s,%s\n",
                 imm8, dis_buf, nameXMMReg(rG) );
         }
         /* imm8 mask/broadcast semantics live in math_DPPS_128. */
         IRTemp res = math_DPPS_128( src_vec, dst_vec, imm8 );
         putXMMReg( rG, mkexpr(res) );
         goto decode_success;
      }
      break;

   case 0x41:
      /* 66 0F 3A 41 /r ib = DPPD xmm1, xmm2/m128, imm8
         Dot Product of Packed Double Precision Floating-Point Values (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         modrm = getUChar(delta);
         Int    imm8;
         IRTemp src_vec = newTemp(Ity_V128);
         IRTemp dst_vec = newTemp(Ity_V128);
         UInt   rG      = gregOfRexRM(pfx, modrm);
         assign( dst_vec, getXMMReg( rG ) );
         if ( epartIsReg( modrm ) ) {
            UInt rE = eregOfRexRM(pfx, modrm);
            imm8 = (Int)getUChar(delta+1);
            assign( src_vec, getXMMReg(rE) );
            delta += 1+1;
            DIP( "dppd $%d, %s,%s\n",
                 imm8, nameXMMReg(rE), nameXMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
                             1/* imm8 is 1 byte after the amode */ );
            gen_SEGV_if_not_16_aligned( addr );
            assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
            imm8 = (Int)getUChar(delta+alen);
            delta += alen+1;
            DIP( "dppd $%d, %s,%s\n",
                 imm8, dis_buf, nameXMMReg(rG) );
         }
         IRTemp res = math_DPPD_128( src_vec, dst_vec, imm8 );
         putXMMReg( rG, mkexpr(res) );
         goto decode_success;
      }
      break;
   19666 
   case 0x42:
      /* 66 0F 3A 42 /r ib = MPSADBW xmm1, xmm2/m128, imm8
         Multiple Packed Sums of Absolute Difference (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         Int    imm8;
         IRTemp src_vec = newTemp(Ity_V128);
         IRTemp dst_vec = newTemp(Ity_V128);
         modrm          = getUChar(delta);
         UInt   rG      = gregOfRexRM(pfx, modrm);

         assign( dst_vec, getXMMReg(rG) );

         if ( epartIsReg( modrm ) ) {
            UInt rE = eregOfRexRM(pfx, modrm);

            imm8 = (Int)getUChar(delta+1);
            assign( src_vec, getXMMReg(rE) );
            delta += 1+1;
            DIP( "mpsadbw $%d, %s,%s\n", imm8,
                 nameXMMReg(rE), nameXMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
                             1/* imm8 is 1 byte after the amode */ );
            /* 128-bit memory operand must be 16-aligned. */
            gen_SEGV_if_not_16_aligned( addr );
            assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
            imm8 = (Int)getUChar(delta+alen);
            delta += alen+1;
            DIP( "mpsadbw $%d, %s,%s\n", imm8, dis_buf, nameXMMReg(rG) );
         }

         /* Note the argument order: (dst, src), unlike the blend
            helpers above which take (src, dst). */
         putXMMReg( rG, mkexpr( math_MPSADBW_128(dst_vec, src_vec, imm8) ) );
         goto decode_success;
      }
      break;

   case 0x44:
      /* 66 0F 3A 44 /r ib = PCLMULQDQ xmm1, xmm2/m128, imm8
       * Carry-less multiplication of selected XMM quadwords into XMM
       * registers (a.k.a multiplication of polynomials over GF(2))
       */
      if (have66noF2noF3(pfx) && sz == 2) {

         Int imm8;
         IRTemp svec = newTemp(Ity_V128);
         IRTemp dvec = newTemp(Ity_V128);
         modrm       = getUChar(delta);
         UInt   rG   = gregOfRexRM(pfx, modrm);

         assign( dvec, getXMMReg(rG) );

         if ( epartIsReg( modrm ) ) {
            UInt rE = eregOfRexRM(pfx, modrm);
            imm8 = (Int)getUChar(delta+1);
            assign( svec, getXMMReg(rE) );
            delta += 1+1;
            DIP( "pclmulqdq $%d, %s,%s\n", imm8,
                 nameXMMReg(rE), nameXMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
                             1/* imm8 is 1 byte after the amode */ );
            gen_SEGV_if_not_16_aligned( addr );
            assign( svec, loadLE( Ity_V128, mkexpr(addr) ) );
            imm8 = (Int)getUChar(delta+alen);
            delta += alen+1;
            DIP( "pclmulqdq $%d, %s,%s\n",
                 imm8, dis_buf, nameXMMReg(rG) );
         }

         putXMMReg( rG, mkexpr( math_PCLMULQDQ(dvec, svec, imm8) ) );
         goto decode_success;
      }
      break;
   19739 
   case 0x60:
   case 0x61:
   case 0x62:
   case 0x63:
      /* 66 0F 3A 63 /r ib = PCMPISTRI imm8, xmm2/m128, xmm1
         66 0F 3A 62 /r ib = PCMPISTRM imm8, xmm2/m128, xmm1
         66 0F 3A 61 /r ib = PCMPESTRI imm8, xmm2/m128, xmm1
         66 0F 3A 60 /r ib = PCMPESTRM imm8, xmm2/m128, xmm1
         (selected special cases that actually occur in glibc,
          not by any means a complete implementation.)
      */
      if (have66noF2noF3(pfx) && sz == 2) {
         Long delta0 = delta;
         delta = dis_PCMPxSTRx( vbi, pfx, delta, False/*!isAvx*/, opc );
         /* A helper that advances delta signals success; an unchanged
            delta means it declined to decode. */
         if (delta > delta0) goto decode_success;
         /* else fall through; dis_PCMPxSTRx failed to decode it */
      }
      break;

   case 0xDF:
      /* 66 0F 3A DF /r ib = AESKEYGENASSIST imm8, xmm2/m128, xmm1 */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_AESKEYGENASSIST( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;

   default:
      break;

   }

  decode_failure:
   /* Decode failed: report failure and leave delta where it was. */
   *decode_OK = False;
   return deltaIN;

  decode_success:
   /* Decode succeeded: return the offset just past the insn. */
   *decode_OK = True;
   return delta;
}
   19780 
   19781 
   19782 /*------------------------------------------------------------*/
   19783 /*---                                                      ---*/
   19784 /*--- Top-level post-escape decoders: dis_ESC_NONE         ---*/
   19785 /*---                                                      ---*/
   19786 /*------------------------------------------------------------*/
   19787 
__attribute__((noinline))
static
Long dis_ESC_NONE (
        /*MB_OUT*/DisResult* dres,
        /*MB_OUT*/Bool*      expect_CAS,
        Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
        Bool         resteerCisOk,
        void*        callback_opaque,
        const VexArchInfo* archinfo,
        const VexAbiInfo*  vbi,
        Prefix pfx, Int sz, Long deltaIN
     )
{
   /* Scratch state shared by the case arms below. */
   Long   d64   = 0;
   UChar  abyte = 0;
   IRTemp addr  = IRTemp_INVALID;
   IRTemp t1    = IRTemp_INVALID;
   IRTemp t2    = IRTemp_INVALID;
   IRTemp t3    = IRTemp_INVALID;
   IRTemp t4    = IRTemp_INVALID;
   IRTemp t5    = IRTemp_INVALID;
   IRType ty    = Ity_INVALID;
   UChar  modrm = 0;
   Int    am_sz = 0;
   Int    d_sz  = 0;
   Int    alen  = 0;
   HChar  dis_buf[50];

   Long   delta = deltaIN;
   UChar  opc   = getUChar(delta); delta++;

   /* delta now points at the modrm byte.  In most of the cases that
      follow, neither the F2 nor F3 prefixes are allowed.  However,
      for some basic arithmetic operations we have to allow F2/XACQ or
      F3/XREL in the case where the destination is memory and the LOCK
      prefix is also present.  Do this check by looking at the modrm
      byte but not advancing delta over it. */
   /* By default, F2 and F3 are not allowed, so let's start off with
      that setting. */
   Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
   { UChar tmp_modrm = getUChar(delta);
     switch (opc) {
        case 0x00: /* ADD Gb,Eb */  case 0x01: /* ADD Gv,Ev */
        case 0x08: /* OR  Gb,Eb */  case 0x09: /* OR  Gv,Ev */
        case 0x10: /* ADC Gb,Eb */  case 0x11: /* ADC Gv,Ev */
        case 0x18: /* SBB Gb,Eb */  case 0x19: /* SBB Gv,Ev */
        case 0x20: /* AND Gb,Eb */  case 0x21: /* AND Gv,Ev */
        case 0x28: /* SUB Gb,Eb */  case 0x29: /* SUB Gv,Ev */
        case 0x30: /* XOR Gb,Eb */  case 0x31: /* XOR Gv,Ev */
           if (!epartIsReg(tmp_modrm)
               && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
              /* dst is mem, and we have F2 or F3 but not both */
              validF2orF3 = True;
           }
           break;
        default:
           break;
     }
   }
   19847 
   19848    /* Now, in the switch below, for the opc values examined by the
   19849       switch above, use validF2orF3 rather than looking at pfx
   19850       directly. */
   switch (opc) {

   /* Each arm delegates to a dis_op* helper which builds the IR and
      returns the updated instruction offset.  NOTE(review): the 8-bit
      IROp (e.g. Iop_Add8) passed alongside `sz` appears to serve as
      the base operation which the helper widens to the operand size —
      confirm in dis_op2_G_E/dis_op2_E_G/dis_op_imm_A. */

   case 0x00: /* ADD Gb,Eb */
      if (!validF2orF3) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Add8, True, 1, delta, "add" );
      return delta;
   case 0x01: /* ADD Gv,Ev */
      if (!validF2orF3) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Add8, True, sz, delta, "add" );
      return delta;

   case 0x02: /* ADD Eb,Gb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Add8, True, 1, delta, "add" );
      return delta;
   case 0x03: /* ADD Ev,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Add8, True, sz, delta, "add" );
      return delta;

   case 0x04: /* ADD Ib, AL */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( 1, False, Iop_Add8, True, delta, "add" );
      return delta;
   case 0x05: /* ADD Iv, eAX */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A(sz, False, Iop_Add8, True, delta, "add" );
      return delta;

   case 0x08: /* OR Gb,Eb */
      if (!validF2orF3) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Or8, True, 1, delta, "or" );
      return delta;
   case 0x09: /* OR Gv,Ev */
      if (!validF2orF3) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Or8, True, sz, delta, "or" );
      return delta;

   case 0x0A: /* OR Eb,Gb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Or8, True, 1, delta, "or" );
      return delta;
   case 0x0B: /* OR Ev,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Or8, True, sz, delta, "or" );
      return delta;

   case 0x0C: /* OR Ib, AL */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( 1, False, Iop_Or8, True, delta, "or" );
      return delta;
   case 0x0D: /* OR Iv, eAX */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( sz, False, Iop_Or8, True, delta, "or" );
      return delta;

   /* ADC/SBB arms pass True for the carry-in flag (3rd argument). */
   case 0x10: /* ADC Gb,Eb */
      if (!validF2orF3) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, True, Iop_Add8, True, 1, delta, "adc" );
      return delta;
   case 0x11: /* ADC Gv,Ev */
      if (!validF2orF3) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, True, Iop_Add8, True, sz, delta, "adc" );
      return delta;

   case 0x12: /* ADC Eb,Gb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, True, Iop_Add8, True, 1, delta, "adc" );
      return delta;
   case 0x13: /* ADC Ev,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, True, Iop_Add8, True, sz, delta, "adc" );
      return delta;

   case 0x14: /* ADC Ib, AL */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( 1, True, Iop_Add8, True, delta, "adc" );
      return delta;
   case 0x15: /* ADC Iv, eAX */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( sz, True, Iop_Add8, True, delta, "adc" );
      return delta;

   case 0x18: /* SBB Gb,Eb */
      if (!validF2orF3) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, True, Iop_Sub8, True, 1, delta, "sbb" );
      return delta;
   case 0x19: /* SBB Gv,Ev */
      if (!validF2orF3) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, True, Iop_Sub8, True, sz, delta, "sbb" );
      return delta;

   case 0x1A: /* SBB Eb,Gb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, True, Iop_Sub8, True, 1, delta, "sbb" );
      return delta;
   case 0x1B: /* SBB Ev,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, True, Iop_Sub8, True, sz, delta, "sbb" );
      return delta;

   case 0x1C: /* SBB Ib, AL */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( 1, True, Iop_Sub8, True, delta, "sbb" );
      return delta;
   case 0x1D: /* SBB Iv, eAX */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( sz, True, Iop_Sub8, True, delta, "sbb" );
      return delta;

   case 0x20: /* AND Gb,Eb */
      if (!validF2orF3) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_And8, True, 1, delta, "and" );
      return delta;
   case 0x21: /* AND Gv,Ev */
      if (!validF2orF3) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_And8, True, sz, delta, "and" );
      return delta;

   case 0x22: /* AND Eb,Gb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, True, 1, delta, "and" );
      return delta;
   case 0x23: /* AND Ev,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, True, sz, delta, "and" );
      return delta;

   case 0x24: /* AND Ib, AL */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( 1, False, Iop_And8, True, delta, "and" );
      return delta;
   case 0x25: /* AND Iv, eAX */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( sz, False, Iop_And8, True, delta, "and" );
      return delta;
   19987 
   19988    case 0x28: /* SUB Gb,Eb */
   19989       if (!validF2orF3) goto decode_failure;
   19990       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, True, 1, delta, "sub" );
   19991       return delta;
   19992    case 0x29: /* SUB Gv,Ev */
   19993       if (!validF2orF3) goto decode_failure;
   19994       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, True, sz, delta, "sub" );
   19995       return delta;
   19996 
   19997    case 0x2A: /* SUB Eb,Gb */
   19998       if (haveF2orF3(pfx)) goto decode_failure;
   19999       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, True, 1, delta, "sub" );
   20000       return delta;
   20001    case 0x2B: /* SUB Ev,Gv */
   20002       if (haveF2orF3(pfx)) goto decode_failure;
   20003       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, True, sz, delta, "sub" );
   20004       return delta;
   20005 
   20006    case 0x2C: /* SUB Ib, AL */
   20007       if (haveF2orF3(pfx)) goto decode_failure;
   20008       delta = dis_op_imm_A(1, False, Iop_Sub8, True, delta, "sub" );
   20009       return delta;
   20010    case 0x2D: /* SUB Iv, eAX */
   20011       if (haveF2orF3(pfx)) goto decode_failure;
   20012       delta = dis_op_imm_A( sz, False, Iop_Sub8, True, delta, "sub" );
   20013       return delta;
   20014 
   20015    case 0x30: /* XOR Gb,Eb */
   20016       if (!validF2orF3) goto decode_failure;
   20017       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Xor8, True, 1, delta, "xor" );
   20018       return delta;
   20019    case 0x31: /* XOR Gv,Ev */
   20020       if (!validF2orF3) goto decode_failure;
   20021       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Xor8, True, sz, delta, "xor" );
   20022       return delta;
   20023 
   20024    case 0x32: /* XOR Eb,Gb */
   20025       if (haveF2orF3(pfx)) goto decode_failure;
   20026       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Xor8, True, 1, delta, "xor" );
   20027       return delta;
   20028    case 0x33: /* XOR Ev,Gv */
   20029       if (haveF2orF3(pfx)) goto decode_failure;
   20030       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Xor8, True, sz, delta, "xor" );
   20031       return delta;
   20032 
   20033    case 0x34: /* XOR Ib, AL */
   20034       if (haveF2orF3(pfx)) goto decode_failure;
   20035       delta = dis_op_imm_A( 1, False, Iop_Xor8, True, delta, "xor" );
   20036       return delta;
   20037    case 0x35: /* XOR Iv, eAX */
   20038       if (haveF2orF3(pfx)) goto decode_failure;
   20039       delta = dis_op_imm_A( sz, False, Iop_Xor8, True, delta, "xor" );
   20040       return delta;
   20041 
   20042    case 0x38: /* CMP Gb,Eb */
   20043       if (haveF2orF3(pfx)) goto decode_failure;
   20044       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, False, 1, delta, "cmp" );
   20045       return delta;
   20046    case 0x39: /* CMP Gv,Ev */
   20047       if (haveF2orF3(pfx)) goto decode_failure;
   20048       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, False, sz, delta, "cmp" );
   20049       return delta;
   20050 
   20051    case 0x3A: /* CMP Eb,Gb */
   20052       if (haveF2orF3(pfx)) goto decode_failure;
   20053       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, False, 1, delta, "cmp" );
   20054       return delta;
   20055    case 0x3B: /* CMP Ev,Gv */
   20056       if (haveF2orF3(pfx)) goto decode_failure;
   20057       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, False, sz, delta, "cmp" );
   20058       return delta;
   20059 
   20060    case 0x3C: /* CMP Ib, AL */
   20061       if (haveF2orF3(pfx)) goto decode_failure;
   20062       delta = dis_op_imm_A( 1, False, Iop_Sub8, False, delta, "cmp" );
   20063       return delta;
   20064    case 0x3D: /* CMP Iv, eAX */
   20065       if (haveF2orF3(pfx)) goto decode_failure;
   20066       delta = dis_op_imm_A( sz, False, Iop_Sub8, False, delta, "cmp" );
   20067       return delta;
   20068 
   case 0x50: /* PUSH eAX */
   case 0x51: /* PUSH eCX */
   case 0x52: /* PUSH eDX */
   case 0x53: /* PUSH eBX */
   case 0x55: /* PUSH eBP */
   case 0x56: /* PUSH eSI */
   case 0x57: /* PUSH eDI */
   case 0x54: /* PUSH eSP */
      /* This is the Right Way, in that the value to be pushed is
         established before %rsp is changed, so that pushq %rsp
         correctly pushes the old value. */
      /* opc-0x50 gives the low 3 bits of the register number;
         getIRegRexB combines that with the REX.B bit in pfx, so
         r8..r15 are reachable too. */
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 2 || sz == 4 || sz == 8);
      if (sz == 4)
         sz = 8; /* there is no encoding for 32-bit push in 64-bit mode */
      ty = sz==2 ? Ity_I16 : Ity_I64;
      t1 = newTemp(ty);
      t2 = newTemp(Ity_I64);
      assign(t1, getIRegRexB(sz, pfx, opc-0x50));
      assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(sz)));
      putIReg64(R_RSP, mkexpr(t2) );
      storeLE(mkexpr(t2),mkexpr(t1));
      DIP("push%c %s\n", nameISize(sz), nameIRegRexB(sz,pfx,opc-0x50));
      return delta;

   case 0x58: /* POP eAX */
   case 0x59: /* POP eCX */
   case 0x5A: /* POP eDX */
   case 0x5B: /* POP eBX */
   case 0x5D: /* POP eBP */
   case 0x5E: /* POP eSI */
   case 0x5F: /* POP eDI */
   case 0x5C: /* POP eSP */
      /* The destination register is written after %rsp has been
         bumped, so "pop %rsp" ends with the loaded value in %rsp. */
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 2 || sz == 4 || sz == 8);
      if (sz == 4)
         sz = 8; /* there is no encoding for 32-bit pop in 64-bit mode */
      t1 = newTemp(szToITy(sz));
      t2 = newTemp(Ity_I64);
      assign(t2, getIReg64(R_RSP));
      assign(t1, loadLE(szToITy(sz),mkexpr(t2)));
      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t2), mkU64(sz)));
      putIRegRexB(sz, pfx, opc-0x58, mkexpr(t1));
      DIP("pop%c %s\n", nameISize(sz), nameIRegRexB(sz,pfx,opc-0x58));
      return delta;
   20114 
   case 0x63: /* MOVSX */
      if (haveF2orF3(pfx)) goto decode_failure;
      /* Only the REX.W form (movslq: sign-extend r/m32 into r64) is
         handled; without REX.W this opcode is rejected. */
      if (haveREX(pfx) && 1==getRexW(pfx)) {
         vassert(sz == 8);
         /* movsx r/m32 to r64 */
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta++;
            putIRegG(8, pfx, modrm,
                             unop(Iop_32Sto64,
                                  getIRegE(4, pfx, modrm)));
            DIP("movslq %s,%s\n",
                nameIRegE(4, pfx, modrm),
                nameIRegG(8, pfx, modrm));
            return delta;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            putIRegG(8, pfx, modrm,
                             unop(Iop_32Sto64,
                                  loadLE(Ity_I32, mkexpr(addr))));
            DIP("movslq %s,%s\n", dis_buf,
                nameIRegG(8, pfx, modrm));
            return delta;
         }
      } else {
         goto decode_failure;
      }

   case 0x68: /* PUSH Iv */
      if (haveF2orF3(pfx)) goto decode_failure;
      /* Note, sz==4 is not possible in 64-bit mode.  Hence ... */
      if (sz == 4) sz = 8;
      /* The immediate is at most 4 bytes, sign-extended to sz. */
      d64 = getSDisp(imin(4,sz),delta);
      delta += imin(4,sz);
      goto do_push_I;

   case 0x69: /* IMUL Iv, Ev, Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_imul_I_E_G ( vbi, pfx, sz, delta, sz );
      return delta;

   case 0x6A: /* PUSH Ib, sign-extended to sz */
      if (haveF2orF3(pfx)) goto decode_failure;
      /* Note, sz==4 is not possible in 64-bit mode.  Hence ... */
      if (sz == 4) sz = 8;
      d64 = getSDisp8(delta); delta += 1;
      goto do_push_I;
   do_push_I:
      /* Shared tail for cases 0x68 and 0x6A: push the sign-extended
         immediate that both cases leave in d64. */
      ty = szToITy(sz);
      t1 = newTemp(Ity_I64);
      t2 = newTemp(ty);
      assign( t1, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
      putIReg64(R_RSP, mkexpr(t1) );
      /* stop mkU16 asserting if d32 is a negative 16-bit number
         (bug #132813) */
      if (ty == Ity_I16)
         d64 &= 0xFFFF;
      storeLE( mkexpr(t1), mkU(ty,d64) );
      DIP("push%c $%lld\n", nameISize(sz), (Long)d64);
      return delta;

   case 0x6B: /* IMUL Ib, Ev, Gv */
      /* NOTE(review): unlike case 0x69 there is no haveF2orF3
         rejection here, so an F2/F3-prefixed 6B decodes rather than
         failing -- confirm whether this asymmetry is intended. */
      delta = dis_imul_I_E_G ( vbi, pfx, sz, delta, 1 );
      return delta;
   20180 
   case 0x70:
   case 0x71:
   case 0x72:   /* JBb/JNAEb (jump below) */
   case 0x73:   /* JNBb/JAEb (jump not below) */
   case 0x74:   /* JZb/JEb (jump zero) */
   case 0x75:   /* JNZb/JNEb (jump not zero) */
   case 0x76:   /* JBEb/JNAb (jump below or equal) */
   case 0x77:   /* JNBEb/JAb (jump not below or equal) */
   case 0x78:   /* JSb (jump negative) */
   case 0x79:   /* JSb (jump not negative) */
   case 0x7A:   /* JP (jump parity even) */
   case 0x7B:   /* JNP/JPO (jump parity odd) */
   case 0x7C:   /* JLb/JNGEb (jump less) */
   case 0x7D:   /* JGEb/JNLb (jump greater or equal) */
   case 0x7E:   /* JLEb/JNGb (jump less or equal) */
   case 0x7F: { /* JGb/JNLEb (jump greater) */
      /* Short (8-bit displacement) conditional jumps.  opc - 0x70 is
         the AMD64Condcode; xoring it with 1 yields the negated
         condition. */
      Long   jmpDelta;
      const HChar* comment  = "";
      if (haveF3(pfx)) goto decode_failure;
      if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
      jmpDelta = getSDisp8(delta);
      vassert(-128 <= jmpDelta && jmpDelta < 128);
      d64 = (guest_RIP_bbstart+delta+1) + jmpDelta;
      delta++;
      if (resteerCisOk
          && vex_control.guest_chase_cond
          && (Addr64)d64 != (Addr64)guest_RIP_bbstart
          && jmpDelta < 0
          && resteerOkFn( callback_opaque, (Addr64)d64) ) {
         /* Speculation: assume this backward branch is taken.  So we
            need to emit a side-exit to the insn following this one,
            on the negation of the condition, and continue at the
            branch target address (d64).  If we wind up back at the
            first instruction of the trace, just stop; it's better to
            let the IR loop unroller handle that case. */
         stmt( IRStmt_Exit(
                  mk_amd64g_calculate_condition(
                     (AMD64Condcode)(1 ^ (opc - 0x70))),
                  Ijk_Boring,
                  IRConst_U64(guest_RIP_bbstart+delta),
                  OFFB_RIP ) );
         dres->whatNext   = Dis_ResteerC;
         dres->continueAt = d64;
         comment = "(assumed taken)";
      }
      else
      if (resteerCisOk
          && vex_control.guest_chase_cond
          && (Addr64)d64 != (Addr64)guest_RIP_bbstart
          && jmpDelta >= 0
          && resteerOkFn( callback_opaque, guest_RIP_bbstart+delta ) ) {
         /* Speculation: assume this forward branch is not taken.  So
            we need to emit a side-exit to d64 (the dest) and continue
            disassembling at the insn immediately following this
            one. */
         stmt( IRStmt_Exit(
                  mk_amd64g_calculate_condition((AMD64Condcode)(opc - 0x70)),
                  Ijk_Boring,
                  IRConst_U64(d64),
                  OFFB_RIP ) );
         dres->whatNext   = Dis_ResteerC;
         dres->continueAt = guest_RIP_bbstart+delta;
         comment = "(assumed not taken)";
      }
      else {
         /* Conservative default translation - end the block at this
            point. */
         jcc_01( dres, (AMD64Condcode)(opc - 0x70),
                 guest_RIP_bbstart+delta, d64 );
         vassert(dres->whatNext == Dis_StopHere);
      }
      DIP("j%s-8 0x%llx %s\n", name_AMD64Condcode(opc - 0x70), (ULong)d64,
          comment);
      return delta;
   }
   20256 
   case 0x80: /* Grp1 Ib,Eb */
      modrm = getUChar(delta);
      /* Disallow F2/XACQ and F3/XREL for the non-mem case.  Allow
         just one for the mem case and also require LOCK in this case.
         Note that this erroneously allows XACQ/XREL on CMP since we
         don't check the subopcode here.  No big deal. */
      if (epartIsReg(modrm) && haveF2orF3(pfx))
         goto decode_failure;
      if (!epartIsReg(modrm) && haveF2andF3(pfx))
         goto decode_failure;
      if (!epartIsReg(modrm) && haveF2orF3(pfx) && !haveLOCK(pfx))
         goto decode_failure;
      am_sz = lengthAMode(pfx,delta);
      sz    = 1;
      d_sz  = 1;
      d64   = getSDisp8(delta + am_sz);
      delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
      return delta;

   case 0x81: /* Grp1 Iv,Ev */
      modrm = getUChar(delta);
      /* Same comment as for case 0x80 just above. */
      if (epartIsReg(modrm) && haveF2orF3(pfx))
         goto decode_failure;
      if (!epartIsReg(modrm) && haveF2andF3(pfx))
         goto decode_failure;
      if (!epartIsReg(modrm) && haveF2orF3(pfx) && !haveLOCK(pfx))
         goto decode_failure;
      am_sz = lengthAMode(pfx,delta);
      /* The immediate is at most 4 bytes, sign-extended to sz. */
      d_sz  = imin(sz,4);
      d64   = getSDisp(d_sz, delta + am_sz);
      delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
      return delta;
   20290 
   20291    case 0x83: /* Grp1 Ib,Ev */
   20292       if (haveF2orF3(pfx)) goto decode_failure;
   20293       modrm = getUChar(delta);
   20294       am_sz = lengthAMode(pfx,delta);
   20295       d_sz  = 1;
   20296       d64   = getSDisp8(delta + am_sz);
   20297       delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
   20298       return delta;
   20299 
   case 0x84: /* TEST Eb,Gb */
      if (haveF2orF3(pfx)) goto decode_failure;
      /* TEST = AND whose result is discarded; the False after
         Iop_And8 (as for CMP) presumably suppresses the write-back --
         see dis_op2_E_G. */
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, False, 1, delta, "test" );
      return delta;

   case 0x85: /* TEST Ev,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, False, sz, delta, "test" );
      return delta;
   20309 
   /* XCHG reg,mem automatically asserts LOCK# even without a LOCK
      prefix.  Therefore, generate CAS regardless of the presence or
      otherwise of a LOCK prefix. */
   case 0x86: /* XCHG Gb,Eb */
      sz = 1;
      /* Fall through ... */
   case 0x87: /* XCHG Gv,Ev */
      modrm = getUChar(delta);
      /* Check whether F2 or F3 are allowable.  For the mem case, one
         or the other but not both are.  We don't care about the
         presence of LOCK in this case -- XCHG is unusual in this
         respect. */
      if (haveF2orF3(pfx)) {
         if (epartIsReg(modrm)) {
            goto decode_failure;
         } else {
            if (haveF2andF3(pfx))
               goto decode_failure;
         }
      }
      ty = szToITy(sz);
      t1 = newTemp(ty); t2 = newTemp(ty);
      if (epartIsReg(modrm)) {
         /* Register-register exchange: no atomicity concern, just
            swap through two temporaries. */
         assign(t1, getIRegE(sz, pfx, modrm));
         assign(t2, getIRegG(sz, pfx, modrm));
         putIRegG(sz, pfx, modrm, mkexpr(t1));
         putIRegE(sz, pfx, modrm, mkexpr(t2));
         delta++;
         DIP("xchg%c %s, %s\n",
             nameISize(sz), nameIRegG(sz, pfx, modrm),
                            nameIRegE(sz, pfx, modrm));
      } else {
         /* Memory form: implicitly locked, so express the swap as a
            compare-and-swap anchored at the current instruction. */
         *expect_CAS = True;
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         assign( t1, loadLE(ty, mkexpr(addr)) );
         assign( t2, getIRegG(sz, pfx, modrm) );
         casLE( mkexpr(addr),
                mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
         putIRegG( sz, pfx, modrm, mkexpr(t1) );
         delta += alen;
         DIP("xchg%c %s, %s\n", nameISize(sz),
                                nameIRegG(sz, pfx, modrm), dis_buf);
      }
      return delta;
   20354 
   case 0x88: { /* MOV Gb,Eb */
      /* We let dis_mov_G_E decide whether F3(XRELEASE) is allowable. */
      Bool ok = True;
      delta = dis_mov_G_E(vbi, pfx, 1, delta, &ok);
      if (!ok) goto decode_failure;
      return delta;
   }

   case 0x89: { /* MOV Gv,Ev */
      /* We let dis_mov_G_E decide whether F3(XRELEASE) is allowable. */
      Bool ok = True;
      delta = dis_mov_G_E(vbi, pfx, sz, delta, &ok);
      if (!ok) goto decode_failure;
      return delta;
   }

   case 0x8A: /* MOV Eb,Gb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_mov_E_G(vbi, pfx, 1, delta);
      return delta;

   case 0x8B: /* MOV Ev,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_mov_E_G(vbi, pfx, sz, delta);
      return delta;

   case 0x8C: /* MOV S,E -- MOV from a SEGMENT REGISTER */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_mov_S_E(vbi, pfx, sz, delta);
      return delta;

   case 0x8D: /* LEA M,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      if (sz != 4 && sz != 8)
         goto decode_failure;
      modrm = getUChar(delta);
      /* LEA requires a memory operand; a register E-part is illegal. */
      if (epartIsReg(modrm))
         goto decode_failure;
      /* NOTE!  this is the one place where a segment override prefix
         has no effect on the address calculation.  Therefore we clear
         any segment override bits in pfx. */
      addr = disAMode ( &alen, vbi, clearSegBits(pfx), delta, dis_buf, 0 );
      delta += alen;
      /* This is a hack.  But it isn't clear that really doing the
         calculation at 32 bits is really worth it.  Hence for leal,
         do the full 64-bit calculation and then truncate it. */
      putIRegG( sz, pfx, modrm,
                         sz == 4
                            ? unop(Iop_64to32, mkexpr(addr))
                            : mkexpr(addr)
              );
      DIP("lea%c %s, %s\n", nameISize(sz), dis_buf,
                            nameIRegG(sz,pfx,modrm));
      return delta;
   20409 
   20410    case 0x8F: { /* POPQ m64 / POPW m16 */
   20411       Int   len;
   20412       UChar rm;
   20413       /* There is no encoding for 32-bit pop in 64-bit mode.
   20414          So sz==4 actually means sz==8. */
   20415       if (haveF2orF3(pfx)) goto decode_failure;
   20416       vassert(sz == 2 || sz == 4
   20417               || /* tolerate redundant REX.W, see #210481 */ sz == 8);
   20418       if (sz == 4) sz = 8;
   20419       if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists
   20420 
   20421       rm = getUChar(delta);
   20422 
   20423       /* make sure this instruction is correct POP */
   20424       if (epartIsReg(rm) || gregLO3ofRM(rm) != 0)
   20425          goto decode_failure;
   20426       /* and has correct size */
   20427       vassert(sz == 8);
   20428 
   20429       t1 = newTemp(Ity_I64);
   20430       t3 = newTemp(Ity_I64);
   20431       assign( t1, getIReg64(R_RSP) );
   20432       assign( t3, loadLE(Ity_I64, mkexpr(t1)) );
   20433 
   20434       /* Increase RSP; must be done before the STORE.  Intel manual
   20435          says: If the RSP register is used as a base register for
   20436          addressing a destination operand in memory, the POP
   20437          instruction computes the effective address of the operand
   20438          after it increments the RSP register.  */
   20439       putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t1), mkU64(sz)) );
   20440 
   20441       addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
   20442       storeLE( mkexpr(addr), mkexpr(t3) );
   20443 
   20444       DIP("popl %s\n", dis_buf);
   20445 
   20446       delta += len;
   20447       return delta;
   20448    }
   20449 
   case 0x90: /* XCHG eAX,eAX */
      /* detect and handle F3 90 (rep nop) specially */
      if (!have66(pfx) && !haveF2(pfx) && haveF3(pfx)) {
         DIP("rep nop (P4 pause)\n");
         /* "observe" the hint.  The Vex client needs to be careful not
            to cause very long delays as a result, though. */
         jmp_lit(dres, Ijk_Yield, guest_RIP_bbstart+delta);
         vassert(dres->whatNext == Dis_StopHere);
         return delta;
      }
      /* detect and handle NOPs specially */
      if (/* F2/F3 probably change meaning completely */
          !haveF2orF3(pfx)
          /* If REX.B is 1, we're not exchanging rAX with itself */
          && getRexB(pfx)==0 ) {
         DIP("nop\n");
         return delta;
      }
      /* else fall through to normal case. */
   case 0x91: /* XCHG rAX,rCX */
   case 0x92: /* XCHG rAX,rDX */
   case 0x93: /* XCHG rAX,rBX */
   case 0x94: /* XCHG rAX,rSP */
   case 0x95: /* XCHG rAX,rBP */
   case 0x96: /* XCHG rAX,rSI */
   case 0x97: /* XCHG rAX,rDI */
      /* guard against mutancy */
      if (haveF2orF3(pfx)) goto decode_failure;
      /* opc-0x90 is the low 3 bits of the partner register number;
         presumably codegen_xchg_rAX_Reg folds in REX.B from pfx --
         see its definition. */
      codegen_xchg_rAX_Reg ( pfx, sz, opc - 0x90 );
      return delta;
   20480 
   20481    case 0x98: /* CBW */
   20482       if (haveF2orF3(pfx)) goto decode_failure;
   20483       if (sz == 8) {
   20484          putIRegRAX( 8, unop(Iop_32Sto64, getIRegRAX(4)) );
   20485          DIP(/*"cdqe\n"*/"cltq");
   20486          return delta;
   20487       }
   20488       if (sz == 4) {
   20489          putIRegRAX( 4, unop(Iop_16Sto32, getIRegRAX(2)) );
   20490          DIP("cwtl\n");
   20491          return delta;
   20492       }
   20493       if (sz == 2) {
   20494          putIRegRAX( 2, unop(Iop_8Sto16, getIRegRAX(1)) );
   20495          DIP("cbw\n");
   20496          return delta;
   20497       }
   20498       goto decode_failure;
   20499 
   case 0x99: /* CWD/CDQ/CQO */
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 2 || sz == 4 || sz == 8);
      ty = szToITy(sz);
      /* Fill rDX with the sign bit of rAX: arithmetic shift right by
         (operand width - 1). */
      putIRegRDX( sz,
                  binop(mkSizedOp(ty,Iop_Sar8),
                        getIRegRAX(sz),
                        mkU8(sz == 2 ? 15 : (sz == 4 ? 31 : 63))) );
      DIP(sz == 2 ? "cwd\n"
                  : (sz == 4 ? /*"cdq\n"*/ "cltd\n"
                             : "cqo\n"));
      return delta;

   case 0x9B: /* FWAIT (X87 insn) */
      /* ignore? */
      DIP("fwait\n");
      return delta;
   20517 
   case 0x9C: /* PUSHF */ {
      /* Note.  There is no encoding for a 32-bit pushf in 64-bit
         mode.  So sz==4 actually means sz==8. */
      /* 24 July 06: has also been seen with a redundant REX prefix,
         so must also allow sz==8. */
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 2 || sz == 4 || sz == 8);
      if (sz == 4) sz = 8;
      if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists

      t1 = newTemp(Ity_I64);
      assign( t1, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
      putIReg64(R_RSP, mkexpr(t1) );

      /* t2 = the OSZACP flags, recomputed from the flags thunk. */
      t2 = newTemp(Ity_I64);
      assign( t2, mk_amd64g_calculate_rflags_all() );

      /* Patch in the D flag.  This can simply be a copy of bit 10 of
         baseBlock[OFFB_DFLAG]. */
      t3 = newTemp(Ity_I64);
      assign( t3, binop(Iop_Or64,
                        mkexpr(t2),
                        binop(Iop_And64,
                              IRExpr_Get(OFFB_DFLAG,Ity_I64),
                              mkU64(1<<10)))
            );

      /* And patch in the ID flag (bit 21; the shift implies the guest
         state keeps it as 0/1 rather than in place). */
      t4 = newTemp(Ity_I64);
      assign( t4, binop(Iop_Or64,
                        mkexpr(t3),
                        binop(Iop_And64,
                              binop(Iop_Shl64, IRExpr_Get(OFFB_IDFLAG,Ity_I64),
                                               mkU8(21)),
                              mkU64(1<<21)))
            );

      /* And patch in the AC flag too (bit 18, likewise shifted into
         position). */
      t5 = newTemp(Ity_I64);
      assign( t5, binop(Iop_Or64,
                        mkexpr(t4),
                        binop(Iop_And64,
                              binop(Iop_Shl64, IRExpr_Get(OFFB_ACFLAG,Ity_I64),
                                               mkU8(18)),
                              mkU64(1<<18)))
            );

      /* if sz==2, the stored value needs to be narrowed. */
      if (sz == 2)
        storeLE( mkexpr(t1), unop(Iop_32to16,
                             unop(Iop_64to32,mkexpr(t5))) );
      else
        storeLE( mkexpr(t1), mkexpr(t5) );

      DIP("pushf%c\n", nameISize(sz));
      return delta;
   }
   20575 
   case 0x9D: /* POPF */
      /* Note.  There is no encoding for a 32-bit popf in 64-bit mode.
         So sz==4 actually means sz==8. */
      if (haveF2orF3(pfx)) goto decode_failure;
      /* NOTE(review): unlike PUSHF (0x9C) and POP-mem (0x8F), a
         redundant REX.W (sz == 8) is not tolerated here and will hit
         this vassert rather than decode_failure -- confirm this is
         intended. */
      vassert(sz == 2 || sz == 4);
      if (sz == 4) sz = 8;
      if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists
      t1 = newTemp(Ity_I64); t2 = newTemp(Ity_I64);
      assign(t2, getIReg64(R_RSP));
      assign(t1, widenUto64(loadLE(szToITy(sz),mkexpr(t2))));
      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t2), mkU64(sz)));
      /* t1 is the flag word.  Mask out everything except OSZACP and
         set the flags thunk to AMD64G_CC_OP_COPY. */
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1,
                        binop(Iop_And64,
                              mkexpr(t1),
                              mkU64( AMD64G_CC_MASK_C | AMD64G_CC_MASK_P
                                     | AMD64G_CC_MASK_A | AMD64G_CC_MASK_Z
                                     | AMD64G_CC_MASK_S| AMD64G_CC_MASK_O )
                             )
                       )
          );

      /* Also need to set the D flag, which is held in bit 10 of t1.
         If zero, put 1 in OFFB_DFLAG, else -1 in OFFB_DFLAG. */
      stmt( IRStmt_Put(
               OFFB_DFLAG,
               IRExpr_ITE(
                  unop(Iop_64to1,
                       binop(Iop_And64,
                             binop(Iop_Shr64, mkexpr(t1), mkU8(10)),
                             mkU64(1))),
                  mkU64(0xFFFFFFFFFFFFFFFFULL),
                  mkU64(1)))
          );

      /* And set the ID flag (from bit 21 of t1, stored as 0/1). */
      stmt( IRStmt_Put(
               OFFB_IDFLAG,
               IRExpr_ITE(
                  unop(Iop_64to1,
                       binop(Iop_And64,
                             binop(Iop_Shr64, mkexpr(t1), mkU8(21)),
                             mkU64(1))),
                  mkU64(1),
                  mkU64(0)))
          );

      /* And set the AC flag too (from bit 18 of t1, stored as 0/1). */
      stmt( IRStmt_Put(
               OFFB_ACFLAG,
               IRExpr_ITE(
                  unop(Iop_64to1,
                       binop(Iop_And64,
                             binop(Iop_Shr64, mkexpr(t1), mkU8(18)),
                             mkU64(1))),
                  mkU64(1),
                  mkU64(0)))
          );

      DIP("popf%c\n", nameISize(sz));
      return delta;
   20640 
   20641    case 0x9E: /* SAHF */
   20642       codegen_SAHF();
   20643       DIP("sahf\n");
   20644       return delta;
   20645 
   20646    case 0x9F: /* LAHF */
   20647       codegen_LAHF();
   20648       DIP("lahf\n");
   20649       return delta;
   20650 
   case 0xA0: /* MOV Ob,AL -- load AL from a 64-bit absolute (moffs) addr */
      if (have66orF2orF3(pfx)) goto decode_failure;
      sz = 1;
      /* Fall through ... */
   case 0xA1: /* MOV Ov,eAX -- load eAX/rAX from a 64-bit absolute addr */
      if (sz != 8 && sz != 4 && sz != 2 && sz != 1)
         goto decode_failure;
      /* In 64-bit mode the moffs displacement is always 8 bytes. */
      d64 = getDisp64(delta);
      delta += 8;
      ty = szToITy(sz);
      addr = newTemp(Ity_I64);
      /* Apply any segment-override / address-size prefixes to the addr. */
      assign( addr, handleAddrOverrides(vbi, pfx, mkU64(d64)) );
      putIRegRAX(sz, loadLE( ty, mkexpr(addr) ));
      DIP("mov%c %s0x%llx, %s\n", nameISize(sz),
                                  segRegTxt(pfx), (ULong)d64,
                                  nameIRegRAX(sz));
      return delta;

   case 0xA2: /* MOV AL,Ob -- store AL to a 64-bit absolute (moffs) addr */
      if (have66orF2orF3(pfx)) goto decode_failure;
      sz = 1;
      /* Fall through ... */
   case 0xA3: /* MOV eAX,Ov -- store eAX/rAX to a 64-bit absolute addr */
      if (sz != 8 && sz != 4 && sz != 2 && sz != 1)
         goto decode_failure;
      d64 = getDisp64(delta);
      delta += 8;
      ty = szToITy(sz);
      addr = newTemp(Ity_I64);
      assign( addr, handleAddrOverrides(vbi, pfx, mkU64(d64)) );
      storeLE( mkexpr(addr), getIRegRAX(sz) );
      DIP("mov%c %s, %s0x%llx\n", nameISize(sz), nameIRegRAX(sz),
                                  segRegTxt(pfx), (ULong)d64);
      return delta;

   case 0xA4:
   case 0xA5:
      /* F3 A4: rep movsb / F3 A5: rep movs{w,l,q}.  dis_REP_op emits
         the whole rep loop and ends the block. */
      if (haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xA4)
            sz = 1;
         dis_REP_op ( dres, AMD64CondAlways, dis_MOVS, sz,
                      guest_RIP_curr_instr,
                      guest_RIP_bbstart+delta, "rep movs", pfx );
        dres->whatNext = Dis_StopHere;
        return delta;
      }
      /* A4: movsb / A5: movs{w,l,q} -- single (non-rep) iteration. */
      if (!haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xA4)
            sz = 1;
         dis_string_op( dis_MOVS, sz, "movs", pfx );
         return delta;
      }
      goto decode_failure;
   20706 
   case 0xA6:
   case 0xA7:
      /* F3 A6/A7: repe cmps/rep cmps{w,l,q}.  Only the F3 (repe) form
         is handled; a bare cmps falls through to decode_failure. */
      if (haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xA6)
            sz = 1;
         dis_REP_op ( dres, AMD64CondZ, dis_CMPS, sz,
                      guest_RIP_curr_instr,
                      guest_RIP_bbstart+delta, "repe cmps", pfx );
         dres->whatNext = Dis_StopHere;
         return delta;
      }
      goto decode_failure;

   case 0xAA:
   case 0xAB:
      /* F3 AA/AB: rep stosb/rep stos{w,l,q} */
      if (haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xAA)
            sz = 1;
         dis_REP_op ( dres, AMD64CondAlways, dis_STOS, sz,
                      guest_RIP_curr_instr,
                      guest_RIP_bbstart+delta, "rep stos", pfx );
         vassert(dres->whatNext == Dis_StopHere);
         return delta;
      }
      /* AA/AB: stosb/stos{w,l,q} -- single (non-rep) iteration. */
      if (!haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xAA)
            sz = 1;
         dis_string_op( dis_STOS, sz, "stos", pfx );
         return delta;
      }
      goto decode_failure;

   case 0xA8: /* TEST Ib, AL -- AND without writeback, flags only */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( 1, False, Iop_And8, False, delta, "test" );
      return delta;
   case 0xA9: /* TEST Iv, eAX */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( sz, False, Iop_And8, False, delta, "test" );
      return delta;

   case 0xAC: /* LODS, no REP prefix */
   case 0xAD:
      dis_string_op( dis_LODS, ( opc == 0xAC ? 1 : sz ), "lods", pfx );
      return delta;

   case 0xAE:
   case 0xAF:
      /* F2 AE/AF: repne scasb/repne scas{w,l,q} -- loop while not equal */
      if (haveF2(pfx) && !haveF3(pfx)) {
         if (opc == 0xAE)
            sz = 1;
         dis_REP_op ( dres, AMD64CondNZ, dis_SCAS, sz,
                      guest_RIP_curr_instr,
                      guest_RIP_bbstart+delta, "repne scas", pfx );
         vassert(dres->whatNext == Dis_StopHere);
         return delta;
      }
      /* F3 AE/AF: repe scasb/repe scas{w,l,q} -- loop while equal */
      if (!haveF2(pfx) && haveF3(pfx)) {
         if (opc == 0xAE)
            sz = 1;
         dis_REP_op ( dres, AMD64CondZ, dis_SCAS, sz,
                      guest_RIP_curr_instr,
                      guest_RIP_bbstart+delta, "repe scas", pfx );
         vassert(dres->whatNext == Dis_StopHere);
         return delta;
      }
      /* AE/AF: scasb/scas{w,l,q} -- single (non-rep) iteration. */
      if (!haveF2(pfx) && !haveF3(pfx)) {
         if (opc == 0xAE)
            sz = 1;
         dis_string_op( dis_SCAS, sz, "scas", pfx );
         return delta;
      }
      goto decode_failure;
   20786 
   /* XXXX be careful here with moves to AH/BH/CH/DH */
   case 0xB0: /* MOV imm,AL */
   case 0xB1: /* MOV imm,CL */
   case 0xB2: /* MOV imm,DL */
   case 0xB3: /* MOV imm,BL */
   case 0xB4: /* MOV imm,AH */
   case 0xB5: /* MOV imm,CH */
   case 0xB6: /* MOV imm,DH */
   case 0xB7: /* MOV imm,BH */
      if (haveF2orF3(pfx)) goto decode_failure;
      d64 = getUChar(delta);
      delta += 1;
      /* The target byte register is encoded in the opcode's low 3
         bits, extended by REX.B (handled by putIRegRexB). */
      putIRegRexB(1, pfx, opc-0xB0, mkU8(d64));
      DIP("movb $%lld,%s\n", d64, nameIRegRexB(1,pfx,opc-0xB0));
      return delta;

   case 0xB8: /* MOV imm,eAX */
   case 0xB9: /* MOV imm,eCX */
   case 0xBA: /* MOV imm,eDX */
   case 0xBB: /* MOV imm,eBX */
   case 0xBC: /* MOV imm,eSP */
   case 0xBD: /* MOV imm,eBP */
   case 0xBE: /* MOV imm,eSI */
   case 0xBF: /* MOV imm,eDI */
      /* This is the one-and-only place where 64-bit literals are
         allowed in the instruction stream (movabsq). */
      if (haveF2orF3(pfx)) goto decode_failure;
      if (sz == 8) {
         d64 = getDisp64(delta);
         delta += 8;
         putIRegRexB(8, pfx, opc-0xB8, mkU64(d64));
         DIP("movabsq $%lld,%s\n", (Long)d64,
                                   nameIRegRexB(8,pfx,opc-0xB8));
      } else {
         /* sz is 1/2/4: read at most a 4-byte immediate and truncate
            it to the operand size before writing the register. */
         d64 = getSDisp(imin(4,sz),delta);
         delta += imin(4,sz);
         putIRegRexB(sz, pfx, opc-0xB8,
                         mkU(szToITy(sz), d64 & mkSizeMask(sz)));
         DIP("mov%c $%lld,%s\n", nameISize(sz),
                                 (Long)d64,
                                 nameIRegRexB(sz,pfx,opc-0xB8));
      }
      return delta;
   20830 
   case 0xC0: { /* Grp2 Ib,Eb -- shift/rotate byte operand by imm8 */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 1;                       /* one immediate byte follows */
      d64   = getUChar(delta + am_sz); /* the shift/rotate count */
      sz    = 1;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         mkU8(d64 & 0xFF), NULL, &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }

   case 0xC1: { /* Grp2 Ib,Ev -- shift/rotate word/dword/qword by imm8 */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 1;
      d64   = getUChar(delta + am_sz);
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         mkU8(d64 & 0xFF), NULL, &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }

   case 0xC2: /* RET imm16 -- return and pop imm16 extra stack bytes */
      if (have66orF3(pfx)) goto decode_failure;
      if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
      d64 = getUDisp16(delta);
      delta += 2;
      dis_ret(dres, vbi, d64);
      DIP("ret $%lld\n", d64);
      return delta;

   case 0xC3: /* RET */
      if (have66(pfx)) goto decode_failure;
      /* F3 is acceptable on AMD ("rep ret", a branch-predictor idiom). */
      if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
      dis_ret(dres, vbi, 0);
      DIP(haveF3(pfx) ? "rep ; ret\n" : "ret\n");
      return delta;
   20874 
   20875    case 0xC6: /* C6 /0 = MOV Ib,Eb */
   20876       sz = 1;
   20877       goto maybe_do_Mov_I_E;
   20878    case 0xC7: /* C7 /0 = MOV Iv,Ev */
   20879       goto maybe_do_Mov_I_E;
   20880    maybe_do_Mov_I_E:
   20881       modrm = getUChar(delta);
   20882       if (gregLO3ofRM(modrm) == 0) {
   20883          if (epartIsReg(modrm)) {
   20884             /* Neither F2 nor F3 are allowable. */
   20885             if (haveF2orF3(pfx)) goto decode_failure;
   20886             delta++; /* mod/rm byte */
   20887             d64 = getSDisp(imin(4,sz),delta);
   20888             delta += imin(4,sz);
   20889             putIRegE(sz, pfx, modrm,
   20890                          mkU(szToITy(sz), d64 & mkSizeMask(sz)));
   20891             DIP("mov%c $%lld, %s\n", nameISize(sz),
   20892                                      (Long)d64,
   20893                                      nameIRegE(sz,pfx,modrm));
   20894          } else {
   20895             if (haveF2(pfx)) goto decode_failure;
   20896             /* F3(XRELEASE) is allowable here */
   20897             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
   20898                               /*xtra*/imin(4,sz) );
   20899             delta += alen;
   20900             d64 = getSDisp(imin(4,sz),delta);
   20901             delta += imin(4,sz);
   20902             storeLE(mkexpr(addr),
   20903                     mkU(szToITy(sz), d64 & mkSizeMask(sz)));
   20904             DIP("mov%c $%lld, %s\n", nameISize(sz), (Long)d64, dis_buf);
   20905          }
   20906          return delta;
   20907       }
   20908       /* BEGIN HACKY SUPPORT FOR xbegin */
   20909       if (opc == 0xC7 && modrm == 0xF8 && !have66orF2orF3(pfx) && sz == 4
   20910           && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
   20911          delta++; /* mod/rm byte */
   20912          d64 = getSDisp(4,delta);
   20913          delta += 4;
   20914          guest_RIP_next_mustcheck = True;
   20915          guest_RIP_next_assumed   = guest_RIP_bbstart + delta;
   20916          Addr64 failAddr = guest_RIP_bbstart + delta + d64;
   20917          /* EAX contains the failure status code.  Bit 3 is "Set if an
   20918             internal buffer overflowed", which seems like the
   20919             least-bogus choice we can make here. */
   20920          putIRegRAX(4, mkU32(1<<3));
   20921          /* And jump to the fail address. */
   20922          jmp_lit(dres, Ijk_Boring, failAddr);
   20923          vassert(dres->whatNext == Dis_StopHere);
   20924          DIP("xbeginq 0x%llx\n", failAddr);
   20925          return delta;
   20926       }
   20927       /* END HACKY SUPPORT FOR xbegin */
   20928       /* BEGIN HACKY SUPPORT FOR xabort */
   20929       if (opc == 0xC6 && modrm == 0xF8 && !have66orF2orF3(pfx) && sz == 1
   20930           && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
   20931          delta++; /* mod/rm byte */
   20932          abyte = getUChar(delta); delta++;
   20933          /* There is never a real transaction in progress, so do nothing. */
   20934          DIP("xabort $%d", (Int)abyte);
   20935          return delta;
   20936       }
   20937       /* END HACKY SUPPORT FOR xabort */
   20938       goto decode_failure;
   20939 
   case 0xC8: /* ENTER */
      /* Same comments re operand size as for LEAVE below apply.
         Also, only handles the case "enter $imm16, $0"; other cases
         for the second operand (nesting depth) are not handled. */
      if (sz != 4)
         goto decode_failure;
      d64 = getUDisp16(delta);   /* frame size, unsigned 16-bit */
      delta += 2;
      vassert(d64 >= 0 && d64 <= 0xFFFF);
      if (getUChar(delta) != 0)  /* nesting depth must be zero */
         goto decode_failure;
      delta++;
      /* Intel docs seem to suggest:
           push rbp
           temp = rsp
           rbp = temp
           rsp = rsp - imm16
      */
      t1 = newTemp(Ity_I64);
      assign(t1, getIReg64(R_RBP));         /* old RBP, to be pushed */
      t2 = newTemp(Ity_I64);
      assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
      putIReg64(R_RSP, mkexpr(t2));
      storeLE(mkexpr(t2), mkexpr(t1));      /* push old RBP */
      putIReg64(R_RBP, mkexpr(t2));         /* new frame pointer */
      if (d64 > 0) {
         putIReg64(R_RSP, binop(Iop_Sub64, mkexpr(t2), mkU64(d64)));
      }
      DIP("enter $%u, $0\n", (UInt)d64);
      return delta;

   case 0xC9: /* LEAVE */
      /* In 64-bit mode this defaults to a 64-bit operand size.  There
         is no way to encode a 32-bit variant.  Hence sz==4 but we do
         it as if sz=8. */
      if (sz != 4)
         goto decode_failure;
      t1 = newTemp(Ity_I64);
      t2 = newTemp(Ity_I64);
      assign(t1, getIReg64(R_RBP));
      /* First PUT RSP looks redundant, but need it because RSP must
         always be up-to-date for Memcheck to work... */
      putIReg64(R_RSP, mkexpr(t1));         /* rsp = rbp */
      assign(t2, loadLE(Ity_I64,mkexpr(t1)));
      putIReg64(R_RBP, mkexpr(t2));         /* pop rbp */
      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t1), mkU64(8)) );
      DIP("leave\n");
      return delta;

   case 0xCC: /* INT 3 -- deliver SIGTRAP at the following insn */
      jmp_lit(dres, Ijk_SigTRAP, guest_RIP_bbstart + delta);
      vassert(dres->whatNext == Dis_StopHere);
      DIP("int $0x3\n");
      return delta;
   20994 
   case 0xCD: /* INT imm8 */
      d64 = getUChar(delta); delta++;

      /* Handle int $0xD2 (Solaris fasttrap syscalls).  All other
         vectors are not supported and fail the decode. */
      if (d64 == 0xD2) {
         jmp_lit(dres, Ijk_Sys_int210, guest_RIP_bbstart + delta);
         vassert(dres->whatNext == Dis_StopHere);
         DIP("int $0xD2\n");
         return delta;
      }
      goto decode_failure;

   case 0xD0: { /* Grp2 1,Eb -- shift/rotate byte operand by 1 */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 0;     /* no immediate byte */
      d64   = 1;     /* implicit shift count of 1 */
      sz    = 1;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         mkU8(d64), NULL, &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }

   case 0xD1: { /* Grp2 1,Ev -- shift/rotate w/l/q operand by 1 */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 0;
      d64   = 1;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         mkU8(d64), NULL, &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }

   case 0xD2: { /* Grp2 CL,Eb -- shift/rotate byte operand by %cl */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 0;
      sz    = 1;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         getIRegCL(), "%cl", &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }

   case 0xD3: { /* Grp2 CL,Ev -- shift/rotate w/l/q operand by %cl */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 0;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         getIRegCL(), "%cl", &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }
   21058 
   case 0xD8: /* X87 instructions -- prefix-check, then hand the whole
                 D8..DF space to dis_FPU. */
   case 0xD9:
   case 0xDA:
   case 0xDB:
   case 0xDC:
   case 0xDD:
   case 0xDE:
   case 0xDF: {
      Bool redundantREXWok = False;

      if (haveF2orF3(pfx))
         goto decode_failure;

      /* kludge to tolerate redundant rex.w prefixes (should do this
         properly one day) */
      /* mono 1.1.18.1 produces 48 D9 FA, which is rex.w fsqrt */
      if ( (opc == 0xD9 && getUChar(delta+0) == 0xFA)/*fsqrt*/ )
         redundantREXWok = True;

      /* Decide whether the effective operand size is acceptable for
         this x87 opcode.  Default (sz==4) is always fine; sz==8 only
         for the whitelisted rex.w cases; sz==2 (0x66 prefix) only for
         specific 0xDD forms. */
      Bool size_OK = False;
      if ( sz == 4 )
         size_OK = True;
      else if ( sz == 8 )
         size_OK = redundantREXWok;
      else if ( sz == 2 ) {
         int mod_rm = getUChar(delta+0);
         int reg = gregLO3ofRM(mod_rm);
         /* The HotSpot JVM uses these */
         if ( (opc == 0xDD) && (reg == 0 /* FLDL   */ ||
                                reg == 4 /* FNSAVE */ ||
                                reg == 6 /* FRSTOR */ ) )
            size_OK = True;
      }
      /* AMD manual says 0x66 size override is ignored, except where
         it is meaningful */
      if (!size_OK)
         goto decode_failure;

      Bool decode_OK = False;
      delta = dis_FPU ( &decode_OK, vbi, pfx, delta );
      if (!decode_OK)
         goto decode_failure;

      return delta;
   }
   21104 
   case 0xE0: /* LOOPNE disp8: decrement count, jump if count != 0 && ZF==0 */
   case 0xE1: /* LOOPE  disp8: decrement count, jump if count != 0 && ZF==1 */
   case 0xE2: /* LOOP   disp8: decrement count, jump if count != 0 */
    { /* The docs say this uses rCX as a count depending on the
         address size override, not the operand one. */
      IRExpr* zbit  = NULL;
      IRExpr* count = NULL;
      IRExpr* cond  = NULL;
      const HChar* xtra = NULL;

      if (have66orF2orF3(pfx) || 1==getRexW(pfx)) goto decode_failure;
      /* So at this point we've rejected any variants which appear to
         be governed by the usual operand-size modifiers.  Hence only
         the address size prefix can have an effect.  It changes the
         size from 64 (default) to 32. */
      d64 = guest_RIP_bbstart+delta+1 + getSDisp8(delta);  /* branch target */
      delta++;
      if (haveASO(pfx)) {
         /* 64to32 of 64-bit get is merely a get-put improvement
            trick. */
         putIReg32(R_RCX, binop(Iop_Sub32,
                                unop(Iop_64to32, getIReg64(R_RCX)),
                                mkU32(1)));
      } else {
         putIReg64(R_RCX, binop(Iop_Sub64, getIReg64(R_RCX), mkU64(1)));
      }

      /* This is correct, both for 32- and 64-bit versions.  If we're
         doing a 32-bit dec and the result is zero then the default
         zero extension rule will cause the upper 32 bits to be zero
         too.  Hence a 64-bit check against zero is OK. */
      count = getIReg64(R_RCX);
      cond = binop(Iop_CmpNE64, count, mkU64(0));
      switch (opc) {
         case 0xE2:
            xtra = "";
            break;
         case 0xE1:
            xtra = "e";
            zbit = mk_amd64g_calculate_condition( AMD64CondZ );
            cond = mkAnd1(cond, zbit);
            break;
         case 0xE0:
            xtra = "ne";
            zbit = mk_amd64g_calculate_condition( AMD64CondNZ );
            cond = mkAnd1(cond, zbit);
            break;
         default:
            vassert(0);
      }
      /* Conditional side-exit to the loop target; fall through
         otherwise. */
      stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U64(d64), OFFB_RIP) );

      DIP("loop%s%s 0x%llx\n", xtra, haveASO(pfx) ? "l" : "", (ULong)d64);
      return delta;
    }

   case 0xE3:
      /* JRCXZ or JECXZ, depending address size override. */
      if (have66orF2orF3(pfx)) goto decode_failure;
      d64 = (guest_RIP_bbstart+delta+1) + getSDisp8(delta);
      delta++;
      if (haveASO(pfx)) {
         /* 32-bit: branch if the low 32 bits of RCX are zero. */
         stmt( IRStmt_Exit( binop(Iop_CmpEQ64,
                                  unop(Iop_32Uto64, getIReg32(R_RCX)),
                                  mkU64(0)),
                            Ijk_Boring,
                            IRConst_U64(d64),
                            OFFB_RIP
             ));
         DIP("jecxz 0x%llx\n", (ULong)d64);
      } else {
         /* 64-bit: branch if all of RCX is zero. */
         stmt( IRStmt_Exit( binop(Iop_CmpEQ64,
                                  getIReg64(R_RCX),
                                  mkU64(0)),
                            Ijk_Boring,
                            IRConst_U64(d64),
                            OFFB_RIP
               ));
         DIP("jrcxz 0x%llx\n", (ULong)d64);
      }
      return delta;
   21188 
   case 0xE4: /* IN imm8, AL */
      sz = 1;
      t1 = newTemp(Ity_I64);
      abyte = getUChar(delta); delta++;
      assign(t1, mkU64( abyte & 0xFF ));   /* port number */
      DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIRegRAX(sz));
      goto do_IN;
   case 0xE5: /* IN imm8, eAX */
      if (!(sz == 2 || sz == 4)) goto decode_failure;
      t1 = newTemp(Ity_I64);
      abyte = getUChar(delta); delta++;
      assign(t1, mkU64( abyte & 0xFF ));
      DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIRegRAX(sz));
      goto do_IN;
   case 0xEC: /* IN %DX, AL */
      sz = 1;
      t1 = newTemp(Ity_I64);
      assign(t1, unop(Iop_16Uto64, getIRegRDX(2)));   /* port from %dx */
      DIP("in%c %s,%s\n", nameISize(sz), nameIRegRDX(2),
                                         nameIRegRAX(sz));
      goto do_IN;
   case 0xED: /* IN %DX, eAX */
      if (!(sz == 2 || sz == 4)) goto decode_failure;
      t1 = newTemp(Ity_I64);
      assign(t1, unop(Iop_16Uto64, getIRegRDX(2)));
      DIP("in%c %s,%s\n", nameISize(sz), nameIRegRDX(2),
                                         nameIRegRAX(sz));
      goto do_IN;
   do_IN: {
      /* At this point, sz indicates the width, and t1 is a 64-bit
         value giving port number.  The port read itself is done by a
         dirty helper call at run time. */
      IRDirty* d;
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 1 || sz == 2 || sz == 4);
      ty = szToITy(sz);
      t2 = newTemp(Ity_I64);
      d = unsafeIRDirty_1_N(
             t2,
             0/*regparms*/,
             "amd64g_dirtyhelper_IN",
             &amd64g_dirtyhelper_IN,
             mkIRExprVec_2( mkexpr(t1), mkU64(sz) )
          );
      /* do the call, dumping the result in t2. */
      stmt( IRStmt_Dirty(d) );
      putIRegRAX(sz, narrowTo( ty, mkexpr(t2) ) );
      return delta;
   }

   case 0xE6: /* OUT AL, imm8 */
      sz = 1;
      t1 = newTemp(Ity_I64);
      abyte = getUChar(delta); delta++;
      assign( t1, mkU64( abyte & 0xFF ) );   /* port number */
      DIP("out%c %s,$%d\n", nameISize(sz), nameIRegRAX(sz), (Int)abyte);
      goto do_OUT;
   case 0xE7: /* OUT eAX, imm8 */
      if (!(sz == 2 || sz == 4)) goto decode_failure;
      t1 = newTemp(Ity_I64);
      abyte = getUChar(delta); delta++;
      assign( t1, mkU64( abyte & 0xFF ) );
      DIP("out%c %s,$%d\n", nameISize(sz), nameIRegRAX(sz), (Int)abyte);
      goto do_OUT;
   case 0xEE: /* OUT AL, %DX */
      sz = 1;
      t1 = newTemp(Ity_I64);
      assign( t1, unop(Iop_16Uto64, getIRegRDX(2)) );   /* port from %dx */
      DIP("out%c %s,%s\n", nameISize(sz), nameIRegRAX(sz),
                                          nameIRegRDX(2));
      goto do_OUT;
   case 0xEF: /* OUT eAX, %DX */
      if (!(sz == 2 || sz == 4)) goto decode_failure;
      t1 = newTemp(Ity_I64);
      assign( t1, unop(Iop_16Uto64, getIRegRDX(2)) );
      DIP("out%c %s,%s\n", nameISize(sz), nameIRegRAX(sz),
                                          nameIRegRDX(2));
      goto do_OUT;
   do_OUT: {
      /* At this point, sz indicates the width, and t1 is a 64-bit
         value giving port number.  The write is a side effect of the
         dirty helper; no result is returned. */
      IRDirty* d;
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 1 || sz == 2 || sz == 4);
      ty = szToITy(sz);
      d = unsafeIRDirty_0_N(
             0/*regparms*/,
             "amd64g_dirtyhelper_OUT",
             &amd64g_dirtyhelper_OUT,
             mkIRExprVec_3( mkexpr(t1),
                            widenUto64( getIRegRAX(sz) ),
                            mkU64(sz) )
          );
      stmt( IRStmt_Dirty(d) );
      return delta;
   }
   21284 
   case 0xE8: /* CALL J4 -- call with signed 32-bit displacement */
      if (haveF3(pfx)) goto decode_failure;
      if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
      d64 = getSDisp32(delta); delta += 4;
      d64 += (guest_RIP_bbstart+delta);
      /* (guest_RIP_bbstart+delta) == return-to addr, d64 == call-to addr */
      t1 = newTemp(Ity_I64);
      assign(t1, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
      putIReg64(R_RSP, mkexpr(t1));
      storeLE( mkexpr(t1), mkU64(guest_RIP_bbstart+delta));  /* push ret addr */
      t2 = newTemp(Ity_I64);
      assign(t2, mkU64((Addr64)d64));
      make_redzone_AbiHint(vbi, t1, t2/*nia*/, "call-d32");
      if (resteerOkFn( callback_opaque, (Addr64)d64) ) {
         /* follow into the call target. */
         dres->whatNext   = Dis_ResteerU;
         dres->continueAt = d64;
      } else {
         jmp_lit(dres, Ijk_Call, d64);
         vassert(dres->whatNext == Dis_StopHere);
      }
      DIP("call 0x%llx\n", (ULong)d64);
      return delta;

   case 0xE9: /* Jv (jump, 16/32 offset) */
      if (haveF3(pfx)) goto decode_failure;
      if (sz != 4)
         goto decode_failure; /* JRS added 2004 July 11 */
      if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
      d64 = (guest_RIP_bbstart+delta+sz) + getSDisp(sz,delta);
      delta += sz;
      /* Either continue translating at the target, or end the block
         with an unconditional jump. */
      if (resteerOkFn(callback_opaque, (Addr64)d64)) {
         dres->whatNext   = Dis_ResteerU;
         dres->continueAt = d64;
      } else {
         jmp_lit(dres, Ijk_Boring, d64);
         vassert(dres->whatNext == Dis_StopHere);
      }
      DIP("jmp 0x%llx\n", (ULong)d64);
      return delta;

   case 0xEB: /* Jb (jump, byte offset) */
      if (haveF3(pfx)) goto decode_failure;
      if (sz != 4)
         goto decode_failure; /* JRS added 2004 July 11 */
      if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
      d64 = (guest_RIP_bbstart+delta+1) + getSDisp8(delta);
      delta++;
      if (resteerOkFn(callback_opaque, (Addr64)d64)) {
         dres->whatNext   = Dis_ResteerU;
         dres->continueAt = d64;
      } else {
         jmp_lit(dres, Ijk_Boring, d64);
         vassert(dres->whatNext == Dis_StopHere);
      }
      DIP("jmp-8 0x%llx\n", (ULong)d64);
      return delta;
   21342 
   case 0xF5: /* CMC */
   case 0xF8: /* CLC */
   case 0xF9: /* STC */
      /* All three read the current flags, modify only the carry bit,
         and write the result back via the CC_OP_COPY thunk form. */
      t1 = newTemp(Ity_I64);
      t2 = newTemp(Ity_I64);
      assign( t1, mk_amd64g_calculate_rflags_all() );
      switch (opc) {
         case 0xF5:   /* complement carry */
            assign( t2, binop(Iop_Xor64, mkexpr(t1),
                                         mkU64(AMD64G_CC_MASK_C)));
            DIP("cmc\n");
            break;
         case 0xF8:   /* clear carry */
            assign( t2, binop(Iop_And64, mkexpr(t1),
                                         mkU64(~AMD64G_CC_MASK_C)));
            DIP("clc\n");
            break;
         case 0xF9:   /* set carry */
            assign( t2, binop(Iop_Or64, mkexpr(t1),
                                        mkU64(AMD64G_CC_MASK_C)));
            DIP("stc\n");
            break;
         default:
            vpanic("disInstr(x64)(cmc/clc/stc)");
      }
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t2) ));
      /* Set NDEP even though it isn't used.  This makes redundant-PUT
         elimination of previous stores to this field work better. */
      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
      return delta;

   case 0xF6: { /* Grp3 Eb -- test/not/neg/mul/imul/div/idiv, byte */
      Bool decode_OK = True;
      /* RM'd: if (haveF2orF3(pfx)) goto decode_failure; */
      /* We now let dis_Grp3 itself decide if F2 and/or F3 are valid */
      delta = dis_Grp3 ( vbi, pfx, 1, delta, &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }

   case 0xF7: { /* Grp3 Ev -- test/not/neg/mul/imul/div/idiv, w/l/q */
      Bool decode_OK = True;
      /* RM'd: if (haveF2orF3(pfx)) goto decode_failure; */
      /* We now let dis_Grp3 itself decide if F2 and/or F3 are valid */
      delta = dis_Grp3 ( vbi, pfx, sz, delta, &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }

   case 0xFC: /* CLD -- direction flag stored as stride +1 */
      if (haveF2orF3(pfx)) goto decode_failure;
      stmt( IRStmt_Put( OFFB_DFLAG, mkU64(1)) );
      DIP("cld\n");
      return delta;

   case 0xFD: /* STD -- direction flag stored as stride -1 */
      if (haveF2orF3(pfx)) goto decode_failure;
      stmt( IRStmt_Put( OFFB_DFLAG, mkU64(-1ULL)) );
      DIP("std\n");
      return delta;

   case 0xFE: { /* Grp4 Eb -- inc/dec, byte */
      Bool decode_OK = True;
      /* RM'd: if (haveF2orF3(pfx)) goto decode_failure; */
      /* We now let dis_Grp4 itself decide if F2 and/or F3 are valid */
      delta = dis_Grp4 ( vbi, pfx, delta, &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }
   21414 
   21415    case 0xFF: { /* Grp5 Ev */
   21416       Bool decode_OK = True;
   21417       /* RM'd: if (haveF2orF3(pfx)) goto decode_failure; */
   21418       /* We now let dis_Grp5 itself decide if F2 and/or F3 are valid */
   21419       delta = dis_Grp5 ( vbi, pfx, sz, delta, dres, &decode_OK );
   21420       if (!decode_OK) goto decode_failure;
   21421       return delta;
   21422    }
   21423 
   21424    default:
   21425       break;
   21426 
   21427    }
   21428 
   21429   decode_failure:
   21430    return deltaIN; /* fail */
   21431 }
   21432 
   21433 
   21434 /*------------------------------------------------------------*/
   21435 /*---                                                      ---*/
   21436 /*--- Top-level post-escape decoders: dis_ESC_0F           ---*/
   21437 /*---                                                      ---*/
   21438 /*------------------------------------------------------------*/
   21439 
/* Generate IR that byte-swaps (reverses the byte order of) the value
   in temp |t1|, whose type must be |ty| (one of Ity_I16, Ity_I32 or
   Ity_I64).  Returns a new temp of the same type holding the swapped
   value.  Used to implement BSWAP and the byte-reversing forms of
   MOVBE.  Any |ty| other than the three supported widths asserts. */
static IRTemp math_BSWAP ( IRTemp t1, IRType ty )
{
   IRTemp t2 = newTemp(ty);
   if (ty == Ity_I64) {
      /* 64-bit swap done in three rounds of pairwise exchanges:
         first swap adjacent bytes, then adjacent 16-bit halves, then
         the two 32-bit halves.  mN is the mask for round N, sN the
         partially-swapped result after round N. */
      IRTemp m8  = newTemp(Ity_I64);
      IRTemp s8  = newTemp(Ity_I64);
      IRTemp m16 = newTemp(Ity_I64);
      IRTemp s16 = newTemp(Ity_I64);
      IRTemp m32 = newTemp(Ity_I64);
      /* Round 1: exchange each pair of adjacent bytes.
         s8 = ((t1 & m8) >> 8) | ((t1 << 8) & m8) */
      assign( m8, mkU64(0xFF00FF00FF00FF00ULL) );
      assign( s8,
              binop(Iop_Or64,
                    binop(Iop_Shr64,
                          binop(Iop_And64,mkexpr(t1),mkexpr(m8)),
                          mkU8(8)),
                    binop(Iop_And64,
                          binop(Iop_Shl64,mkexpr(t1),mkU8(8)),
                          mkexpr(m8))
                   )
            );

      /* Round 2: exchange each pair of adjacent 16-bit words. */
      assign( m16, mkU64(0xFFFF0000FFFF0000ULL) );
      assign( s16,
              binop(Iop_Or64,
                    binop(Iop_Shr64,
                          binop(Iop_And64,mkexpr(s8),mkexpr(m16)),
                          mkU8(16)),
                    binop(Iop_And64,
                          binop(Iop_Shl64,mkexpr(s8),mkU8(16)),
                          mkexpr(m16))
                   )
            );

      /* Round 3: exchange the two 32-bit halves, completing the swap. */
      assign( m32, mkU64(0xFFFFFFFF00000000ULL) );
      assign( t2,
              binop(Iop_Or64,
                    binop(Iop_Shr64,
                          binop(Iop_And64,mkexpr(s16),mkexpr(m32)),
                          mkU8(32)),
                    binop(Iop_And64,
                          binop(Iop_Shl64,mkexpr(s16),mkU8(32)),
                          mkexpr(m32))
                   )
            );
      return t2;
   }
   if (ty == Ity_I32) {
      /* 32-bit swap built directly: move byte 0 to byte 3, byte 1 to
         byte 2, byte 2 to byte 1, byte 3 to byte 0, and OR together. */
      assign( t2,
         binop(
            Iop_Or32,
            binop(Iop_Shl32, mkexpr(t1), mkU8(24)),
            binop(
               Iop_Or32,
               binop(Iop_And32, binop(Iop_Shl32, mkexpr(t1), mkU8(8)),
                                mkU32(0x00FF0000)),
               binop(Iop_Or32,
                     binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(8)),
                                      mkU32(0x0000FF00)),
                     binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(24)),
                                      mkU32(0x000000FF) )
            )))
      );
      return t2;
   }
   if (ty == Ity_I16) {
      /* 16-bit swap: just exchange the two bytes. */
      assign(t2,
             binop(Iop_Or16,
                   binop(Iop_Shl16, mkexpr(t1), mkU8(8)),
                   binop(Iop_Shr16, mkexpr(t1), mkU8(8)) ));
      return t2;
   }
   /* Unsupported type: hard assertion failure. */
   vassert(0);
   /*NOTREACHED*/
   return IRTemp_INVALID;
}
   21515 
   21516 
   21517 __attribute__((noinline))
   21518 static
   21519 Long dis_ESC_0F (
   21520         /*MB_OUT*/DisResult* dres,
   21521         /*MB_OUT*/Bool*      expect_CAS,
   21522         Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
   21523         Bool         resteerCisOk,
   21524         void*        callback_opaque,
   21525         const VexArchInfo* archinfo,
   21526         const VexAbiInfo*  vbi,
   21527         Prefix pfx, Int sz, Long deltaIN
   21528      )
   21529 {
   21530    Long   d64   = 0;
   21531    IRTemp addr  = IRTemp_INVALID;
   21532    IRTemp t1    = IRTemp_INVALID;
   21533    IRTemp t2    = IRTemp_INVALID;
   21534    UChar  modrm = 0;
   21535    Int    am_sz = 0;
   21536    Int    alen  = 0;
   21537    HChar  dis_buf[50];
   21538 
   21539    /* In the first switch, look for ordinary integer insns. */
   21540    Long   delta = deltaIN;
   21541    UChar  opc   = getUChar(delta);
   21542    delta++;
   21543    switch (opc) { /* first switch */
   21544 
   21545    case 0x01:
   21546    {
   21547       modrm = getUChar(delta);
   21548       /* 0F 01 /0 -- SGDT */
   21549       /* 0F 01 /1 -- SIDT */
   21550       if (!epartIsReg(modrm)
   21551           && (gregLO3ofRM(modrm) == 0 || gregLO3ofRM(modrm) == 1)) {
   21552          /* This is really revolting, but ... since each processor
   21553             (core) only has one IDT and one GDT, just let the guest
   21554             see it (pass-through semantics).  I can't see any way to
   21555             construct a faked-up value, so don't bother to try. */
   21556          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   21557          delta += alen;
   21558          switch (gregLO3ofRM(modrm)) {
   21559             case 0: DIP("sgdt %s\n", dis_buf); break;
   21560             case 1: DIP("sidt %s\n", dis_buf); break;
   21561             default: vassert(0); /*NOTREACHED*/
   21562          }
   21563          IRDirty* d = unsafeIRDirty_0_N (
   21564                           0/*regparms*/,
   21565                           "amd64g_dirtyhelper_SxDT",
   21566                           &amd64g_dirtyhelper_SxDT,
   21567                           mkIRExprVec_2( mkexpr(addr),
   21568                                          mkU64(gregLO3ofRM(modrm)) )
   21569                       );
   21570          /* declare we're writing memory */
   21571          d->mFx   = Ifx_Write;
   21572          d->mAddr = mkexpr(addr);
   21573          d->mSize = 6;
   21574          stmt( IRStmt_Dirty(d) );
   21575          return delta;
   21576       }
   21577       /* 0F 01 D0 = XGETBV */
   21578       if (modrm == 0xD0 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
   21579          delta += 1;
   21580          DIP("xgetbv\n");
   21581          /* Fault (SEGV) if ECX isn't zero.  Intel docs say #GP and I
   21582             am not sure if that translates in to SEGV or to something
   21583             else, in user space. */
   21584          t1 = newTemp(Ity_I32);
   21585          assign( t1, getIReg32(R_RCX) );
   21586          stmt( IRStmt_Exit(binop(Iop_CmpNE32, mkexpr(t1), mkU32(0)),
   21587                            Ijk_SigSEGV,
   21588                            IRConst_U64(guest_RIP_curr_instr),
   21589                            OFFB_RIP
   21590          ));
   21591          putIRegRAX(4, mkU32(7));
   21592          putIRegRDX(4, mkU32(0));
   21593          return delta;
   21594       }
   21595       /* BEGIN HACKY SUPPORT FOR xend */
   21596       /* 0F 01 D5 = XEND */
   21597       if (modrm == 0xD5 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
   21598          /* We are never in an transaction (xbegin immediately aborts).
   21599             So this just always generates a General Protection Fault. */
   21600          delta += 1;
   21601          jmp_lit(dres, Ijk_SigSEGV, guest_RIP_bbstart + delta);
   21602          vassert(dres->whatNext == Dis_StopHere);
   21603          DIP("xend\n");
   21604          return delta;
   21605       }
   21606       /* END HACKY SUPPORT FOR xend */
   21607       /* BEGIN HACKY SUPPORT FOR xtest */
   21608       /* 0F 01 D6 = XTEST */
   21609       if (modrm == 0xD6 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
   21610          /* Sets ZF because there never is a transaction, and all
   21611             CF, OF, SF, PF and AF are always cleared by xtest. */
   21612          delta += 1;
   21613          DIP("xtest\n");
   21614          stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   21615          stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   21616          stmt( IRStmt_Put( OFFB_CC_DEP1, mkU64(AMD64G_CC_MASK_Z) ));
   21617          /* Set NDEP even though it isn't used.  This makes redundant-PUT
   21618             elimination of previous stores to this field work better. */
   21619          stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   21620          return delta;
   21621       }
   21622       /* END HACKY SUPPORT FOR xtest */
   21623       /* 0F 01 F9 = RDTSCP */
   21624       if (modrm == 0xF9 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_RDTSCP)) {
   21625          delta += 1;
   21626          /* Uses dirty helper:
   21627             void amd64g_dirtyhelper_RDTSCP ( VexGuestAMD64State* )
   21628             declared to wr rax, rcx, rdx
   21629          */
   21630          const HChar* fName = "amd64g_dirtyhelper_RDTSCP";
   21631          void*        fAddr = &amd64g_dirtyhelper_RDTSCP;
   21632          IRDirty* d
   21633             = unsafeIRDirty_0_N ( 0/*regparms*/,
   21634                                   fName, fAddr, mkIRExprVec_1(IRExpr_BBPTR()) );
   21635          /* declare guest state effects */
   21636          d->nFxState = 3;
   21637          vex_bzero(&d->fxState, sizeof(d->fxState));
   21638          d->fxState[0].fx     = Ifx_Write;
   21639          d->fxState[0].offset = OFFB_RAX;
   21640          d->fxState[0].size   = 8;
   21641          d->fxState[1].fx     = Ifx_Write;
   21642          d->fxState[1].offset = OFFB_RCX;
   21643          d->fxState[1].size   = 8;
   21644          d->fxState[2].fx     = Ifx_Write;
   21645          d->fxState[2].offset = OFFB_RDX;
   21646          d->fxState[2].size   = 8;
   21647          /* execute the dirty call, side-effecting guest state */
   21648          stmt( IRStmt_Dirty(d) );
   21649          /* RDTSCP is a serialising insn.  So, just in case someone is
   21650             using it as a memory fence ... */
   21651          stmt( IRStmt_MBE(Imbe_Fence) );
   21652          DIP("rdtscp\n");
   21653          return delta;
   21654       }
   21655       /* else decode failed */
   21656       break;
   21657    }
   21658 
   21659    case 0x05: /* SYSCALL */
   21660       guest_RIP_next_mustcheck = True;
   21661       guest_RIP_next_assumed = guest_RIP_bbstart + delta;
   21662       putIReg64( R_RCX, mkU64(guest_RIP_next_assumed) );
   21663       /* It's important that all guest state is up-to-date
   21664          at this point.  So we declare an end-of-block here, which
   21665          forces any cached guest state to be flushed. */
   21666       jmp_lit(dres, Ijk_Sys_syscall, guest_RIP_next_assumed);
   21667       vassert(dres->whatNext == Dis_StopHere);
   21668       DIP("syscall\n");
   21669       return delta;
   21670 
   21671    case 0x0B: /* UD2 */
   21672       stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_curr_instr) ) );
   21673       jmp_lit(dres, Ijk_NoDecode, guest_RIP_curr_instr);
   21674       vassert(dres->whatNext == Dis_StopHere);
   21675       DIP("ud2\n");
   21676       return delta;
   21677 
   21678    case 0x0D: /* 0F 0D /0 -- prefetch mem8 */
   21679               /* 0F 0D /1 -- prefetchw mem8 */
   21680       if (have66orF2orF3(pfx)) goto decode_failure;
   21681       modrm = getUChar(delta);
   21682       if (epartIsReg(modrm)) goto decode_failure;
   21683       if (gregLO3ofRM(modrm) != 0 && gregLO3ofRM(modrm) != 1)
   21684          goto decode_failure;
   21685       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   21686       delta += alen;
   21687       switch (gregLO3ofRM(modrm)) {
   21688          case 0: DIP("prefetch %s\n", dis_buf); break;
   21689          case 1: DIP("prefetchw %s\n", dis_buf); break;
   21690          default: vassert(0); /*NOTREACHED*/
   21691       }
   21692       return delta;
   21693 
   21694    case 0x1F:
   21695       if (haveF2orF3(pfx)) goto decode_failure;
   21696       modrm = getUChar(delta);
   21697       if (epartIsReg(modrm)) goto decode_failure;
   21698       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   21699       delta += alen;
   21700       DIP("nop%c %s\n", nameISize(sz), dis_buf);
   21701       return delta;
   21702 
   21703    case 0x31: { /* RDTSC */
   21704       IRTemp   val  = newTemp(Ity_I64);
   21705       IRExpr** args = mkIRExprVec_0();
   21706       IRDirty* d    = unsafeIRDirty_1_N (
   21707                          val,
   21708                          0/*regparms*/,
   21709                          "amd64g_dirtyhelper_RDTSC",
   21710                          &amd64g_dirtyhelper_RDTSC,
   21711                          args
   21712                       );
   21713       if (have66orF2orF3(pfx)) goto decode_failure;
   21714       /* execute the dirty call, dumping the result in val. */
   21715       stmt( IRStmt_Dirty(d) );
   21716       putIRegRDX(4, unop(Iop_64HIto32, mkexpr(val)));
   21717       putIRegRAX(4, unop(Iop_64to32, mkexpr(val)));
   21718       DIP("rdtsc\n");
   21719       return delta;
   21720    }
   21721 
   21722    case 0x40:
   21723    case 0x41:
   21724    case 0x42: /* CMOVBb/CMOVNAEb (cmov below) */
   21725    case 0x43: /* CMOVNBb/CMOVAEb (cmov not below) */
   21726    case 0x44: /* CMOVZb/CMOVEb (cmov zero) */
   21727    case 0x45: /* CMOVNZb/CMOVNEb (cmov not zero) */
   21728    case 0x46: /* CMOVBEb/CMOVNAb (cmov below or equal) */
   21729    case 0x47: /* CMOVNBEb/CMOVAb (cmov not below or equal) */
   21730    case 0x48: /* CMOVSb (cmov negative) */
   21731    case 0x49: /* CMOVSb (cmov not negative) */
   21732    case 0x4A: /* CMOVP (cmov parity even) */
   21733    case 0x4B: /* CMOVNP (cmov parity odd) */
   21734    case 0x4C: /* CMOVLb/CMOVNGEb (cmov less) */
   21735    case 0x4D: /* CMOVGEb/CMOVNLb (cmov greater or equal) */
   21736    case 0x4E: /* CMOVLEb/CMOVNGb (cmov less or equal) */
   21737    case 0x4F: /* CMOVGb/CMOVNLEb (cmov greater) */
   21738       if (haveF2orF3(pfx)) goto decode_failure;
   21739       delta = dis_cmov_E_G(vbi, pfx, sz, (AMD64Condcode)(opc - 0x40), delta);
   21740       return delta;
   21741 
   21742    case 0x80:
   21743    case 0x81:
   21744    case 0x82:   /* JBb/JNAEb (jump below) */
   21745    case 0x83:   /* JNBb/JAEb (jump not below) */
   21746    case 0x84:   /* JZb/JEb (jump zero) */
   21747    case 0x85:   /* JNZb/JNEb (jump not zero) */
   21748    case 0x86:   /* JBEb/JNAb (jump below or equal) */
   21749    case 0x87:   /* JNBEb/JAb (jump not below or equal) */
   21750    case 0x88:   /* JSb (jump negative) */
   21751    case 0x89:   /* JSb (jump not negative) */
   21752    case 0x8A:   /* JP (jump parity even) */
   21753    case 0x8B:   /* JNP/JPO (jump parity odd) */
   21754    case 0x8C:   /* JLb/JNGEb (jump less) */
   21755    case 0x8D:   /* JGEb/JNLb (jump greater or equal) */
   21756    case 0x8E:   /* JLEb/JNGb (jump less or equal) */
   21757    case 0x8F: { /* JGb/JNLEb (jump greater) */
   21758       Long   jmpDelta;
   21759       const HChar* comment  = "";
   21760       if (haveF3(pfx)) goto decode_failure;
   21761       if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
   21762       jmpDelta = getSDisp32(delta);
   21763       d64 = (guest_RIP_bbstart+delta+4) + jmpDelta;
   21764       delta += 4;
   21765       if (resteerCisOk
   21766           && vex_control.guest_chase_cond
   21767           && (Addr64)d64 != (Addr64)guest_RIP_bbstart
   21768           && jmpDelta < 0
   21769           && resteerOkFn( callback_opaque, (Addr64)d64) ) {
   21770          /* Speculation: assume this backward branch is taken.  So
   21771             we need to emit a side-exit to the insn following this
   21772             one, on the negation of the condition, and continue at
   21773             the branch target address (d64).  If we wind up back at
   21774             the first instruction of the trace, just stop; it's
   21775             better to let the IR loop unroller handle that case. */
   21776          stmt( IRStmt_Exit(
   21777                   mk_amd64g_calculate_condition(
   21778                      (AMD64Condcode)(1 ^ (opc - 0x80))),
   21779                   Ijk_Boring,
   21780                   IRConst_U64(guest_RIP_bbstart+delta),
   21781                   OFFB_RIP
   21782              ));
   21783          dres->whatNext   = Dis_ResteerC;
   21784          dres->continueAt = d64;
   21785          comment = "(assumed taken)";
   21786       }
   21787       else
   21788       if (resteerCisOk
   21789           && vex_control.guest_chase_cond
   21790           && (Addr64)d64 != (Addr64)guest_RIP_bbstart
   21791           && jmpDelta >= 0
   21792           && resteerOkFn( callback_opaque, guest_RIP_bbstart+delta ) ) {
   21793          /* Speculation: assume this forward branch is not taken.
   21794             So we need to emit a side-exit to d64 (the dest) and
   21795             continue disassembling at the insn immediately
   21796             following this one. */
   21797          stmt( IRStmt_Exit(
   21798                   mk_amd64g_calculate_condition((AMD64Condcode)
   21799                                                 (opc - 0x80)),
   21800                   Ijk_Boring,
   21801                   IRConst_U64(d64),
   21802                   OFFB_RIP
   21803              ));
   21804          dres->whatNext   = Dis_ResteerC;
   21805          dres->continueAt = guest_RIP_bbstart+delta;
   21806          comment = "(assumed not taken)";
   21807       }
   21808       else {
   21809          /* Conservative default translation - end the block at
   21810             this point. */
   21811          jcc_01( dres, (AMD64Condcode)(opc - 0x80),
   21812                  guest_RIP_bbstart+delta, d64 );
   21813          vassert(dres->whatNext == Dis_StopHere);
   21814       }
   21815       DIP("j%s-32 0x%llx %s\n", name_AMD64Condcode(opc - 0x80), (ULong)d64,
   21816           comment);
   21817       return delta;
   21818    }
   21819 
   21820    case 0x90:
   21821    case 0x91:
   21822    case 0x92: /* set-Bb/set-NAEb (set if below) */
   21823    case 0x93: /* set-NBb/set-AEb (set if not below) */
   21824    case 0x94: /* set-Zb/set-Eb (set if zero) */
   21825    case 0x95: /* set-NZb/set-NEb (set if not zero) */
   21826    case 0x96: /* set-BEb/set-NAb (set if below or equal) */
   21827    case 0x97: /* set-NBEb/set-Ab (set if not below or equal) */
   21828    case 0x98: /* set-Sb (set if negative) */
   21829    case 0x99: /* set-Sb (set if not negative) */
   21830    case 0x9A: /* set-P (set if parity even) */
   21831    case 0x9B: /* set-NP (set if parity odd) */
   21832    case 0x9C: /* set-Lb/set-NGEb (set if less) */
   21833    case 0x9D: /* set-GEb/set-NLb (set if greater or equal) */
   21834    case 0x9E: /* set-LEb/set-NGb (set if less or equal) */
   21835    case 0x9F: /* set-Gb/set-NLEb (set if greater) */
   21836       if (haveF2orF3(pfx)) goto decode_failure;
   21837       t1 = newTemp(Ity_I8);
   21838       assign( t1, unop(Iop_1Uto8,mk_amd64g_calculate_condition(opc-0x90)) );
   21839       modrm = getUChar(delta);
   21840       if (epartIsReg(modrm)) {
   21841          delta++;
   21842          putIRegE(1, pfx, modrm, mkexpr(t1));
   21843          DIP("set%s %s\n", name_AMD64Condcode(opc-0x90),
   21844                            nameIRegE(1,pfx,modrm));
   21845       } else {
   21846          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   21847          delta += alen;
   21848          storeLE( mkexpr(addr), mkexpr(t1) );
   21849          DIP("set%s %s\n", name_AMD64Condcode(opc-0x90), dis_buf);
   21850       }
   21851       return delta;
   21852 
   21853    case 0x1A:
   21854    case 0x1B: { /* Future MPX instructions, currently NOPs.
   21855                    BNDMK b, m     F3 0F 1B
   21856                    BNDCL b, r/m   F3 0F 1A
   21857                    BNDCU b, r/m   F2 0F 1A
   21858                    BNDCN b, r/m   F2 0F 1B
   21859                    BNDMOV b, b/m  66 0F 1A
   21860                    BNDMOV b/m, b  66 0F 1B
   21861                    BNDLDX b, mib     0F 1A
   21862                    BNDSTX mib, b     0F 1B */
   21863 
   21864       /* All instructions have two operands. One operand is always the
   21865          bnd register number (bnd0-bnd3, other register numbers are
   21866          ignored when MPX isn't enabled, but should generate an
   21867          exception if MPX is enabled) given by gregOfRexRM. The other
   21868          operand is either a ModRM:reg, ModRM:r/m or a SIB encoded
   21869          address, all of which can be decoded by using either
   21870          eregOfRexRM or disAMode. */
   21871 
   21872       modrm = getUChar(delta);
   21873       int bnd = gregOfRexRM(pfx,modrm);
   21874       const HChar *oper;
   21875       if (epartIsReg(modrm)) {
   21876          oper = nameIReg64 (eregOfRexRM(pfx,modrm));
   21877          delta += 1;
   21878       } else {
   21879          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   21880          delta += alen;
   21881          oper = dis_buf;
   21882       }
   21883 
   21884       if (haveF3no66noF2 (pfx)) {
   21885          if (opc == 0x1B) {
   21886             DIP ("bndmk %s, %%bnd%d\n", oper, bnd);
   21887          } else /* opc == 0x1A */ {
   21888             DIP ("bndcl %s, %%bnd%d\n", oper, bnd);
   21889          }
   21890       } else if (haveF2no66noF3 (pfx)) {
   21891          if (opc == 0x1A) {
   21892             DIP ("bndcu %s, %%bnd%d\n", oper, bnd);
   21893          } else /* opc == 0x1B */ {
   21894             DIP ("bndcn %s, %%bnd%d\n", oper, bnd);
   21895          }
   21896       } else if (have66noF2noF3 (pfx)) {
   21897          if (opc == 0x1A) {
   21898             DIP ("bndmov %s, %%bnd%d\n", oper, bnd);
   21899          } else /* opc == 0x1B */ {
   21900             DIP ("bndmov %%bnd%d, %s\n", bnd, oper);
   21901          }
   21902       } else if (haveNo66noF2noF3 (pfx)) {
   21903          if (opc == 0x1A) {
   21904             DIP ("bndldx %s, %%bnd%d\n", oper, bnd);
   21905          } else /* opc == 0x1B */ {
   21906             DIP ("bndstx %%bnd%d, %s\n", bnd, oper);
   21907          }
   21908       } else goto decode_failure;
   21909 
   21910       return delta;
   21911    }
   21912 
   21913    case 0xA2: { /* CPUID */
   21914       /* Uses dirty helper:
   21915             void amd64g_dirtyhelper_CPUID ( VexGuestAMD64State* )
   21916          declared to mod rax, wr rbx, rcx, rdx
   21917       */
   21918       IRDirty*     d     = NULL;
   21919       const HChar* fName = NULL;
   21920       void*        fAddr = NULL;
   21921 
   21922       if (haveF2orF3(pfx)) goto decode_failure;
   21923 
   21924       /* This isn't entirely correct, CPUID should depend on the VEX
   21925          capabilities, not on the underlying CPU. See bug #324882. */
   21926       if ((archinfo->hwcaps & VEX_HWCAPS_AMD64_SSE3) &&
   21927           (archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16) &&
   21928           (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX2)) {
   21929          fName = "amd64g_dirtyhelper_CPUID_avx2";
   21930          fAddr = &amd64g_dirtyhelper_CPUID_avx2;
   21931          /* This is a Core-i7-4910-like machine */
   21932       }
   21933       else if ((archinfo->hwcaps & VEX_HWCAPS_AMD64_SSE3) &&
   21934                (archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16) &&
   21935                (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
   21936          fName = "amd64g_dirtyhelper_CPUID_avx_and_cx16";
   21937          fAddr = &amd64g_dirtyhelper_CPUID_avx_and_cx16;
   21938          /* This is a Core-i5-2300-like machine */
   21939       }
   21940       else if ((archinfo->hwcaps & VEX_HWCAPS_AMD64_SSE3) &&
   21941                (archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16)) {
   21942          fName = "amd64g_dirtyhelper_CPUID_sse42_and_cx16";
   21943          fAddr = &amd64g_dirtyhelper_CPUID_sse42_and_cx16;
   21944          /* This is a Core-i5-670-like machine */
   21945       }
   21946       else {
   21947          /* Give a CPUID for at least a baseline machine, SSE2
   21948             only, and no CX16 */
   21949          fName = "amd64g_dirtyhelper_CPUID_baseline";
   21950          fAddr = &amd64g_dirtyhelper_CPUID_baseline;
   21951       }
   21952 
   21953       vassert(fName); vassert(fAddr);
   21954       d = unsafeIRDirty_0_N ( 0/*regparms*/,
   21955                               fName, fAddr, mkIRExprVec_1(IRExpr_BBPTR()) );
   21956       /* declare guest state effects */
   21957       d->nFxState = 4;
   21958       vex_bzero(&d->fxState, sizeof(d->fxState));
   21959       d->fxState[0].fx     = Ifx_Modify;
   21960       d->fxState[0].offset = OFFB_RAX;
   21961       d->fxState[0].size   = 8;
   21962       d->fxState[1].fx     = Ifx_Write;
   21963       d->fxState[1].offset = OFFB_RBX;
   21964       d->fxState[1].size   = 8;
   21965       d->fxState[2].fx     = Ifx_Modify;
   21966       d->fxState[2].offset = OFFB_RCX;
   21967       d->fxState[2].size   = 8;
   21968       d->fxState[3].fx     = Ifx_Write;
   21969       d->fxState[3].offset = OFFB_RDX;
   21970       d->fxState[3].size   = 8;
   21971       /* execute the dirty call, side-effecting guest state */
   21972       stmt( IRStmt_Dirty(d) );
   21973       /* CPUID is a serialising insn.  So, just in case someone is
   21974          using it as a memory fence ... */
   21975       stmt( IRStmt_MBE(Imbe_Fence) );
   21976       DIP("cpuid\n");
   21977       return delta;
   21978    }
   21979 
   21980    case 0xA3: { /* BT Gv,Ev */
   21981       /* We let dis_bt_G_E decide whether F2 or F3 are allowable. */
   21982       Bool ok = True;
   21983       if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
   21984       delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpNone, &ok );
   21985       if (!ok) goto decode_failure;
   21986       return delta;
   21987    }
   21988 
   21989    case 0xA4: /* SHLDv imm8,Gv,Ev */
   21990       modrm = getUChar(delta);
   21991       d64   = delta + lengthAMode(pfx, delta);
   21992       vex_sprintf(dis_buf, "$%d", (Int)getUChar(d64));
   21993       delta = dis_SHLRD_Gv_Ev (
   21994                  vbi, pfx, delta, modrm, sz,
   21995                  mkU8(getUChar(d64)), True, /* literal */
   21996                  dis_buf, True /* left */ );
   21997       return delta;
   21998 
   21999    case 0xA5: /* SHLDv %cl,Gv,Ev */
   22000       modrm = getUChar(delta);
   22001       delta = dis_SHLRD_Gv_Ev (
   22002                  vbi, pfx, delta, modrm, sz,
   22003                  getIRegCL(), False, /* not literal */
   22004                  "%cl", True /* left */ );
   22005       return delta;
   22006 
   22007    case 0xAB: { /* BTS Gv,Ev */
   22008       /* We let dis_bt_G_E decide whether F2 or F3 are allowable. */
   22009       Bool ok = True;
   22010       if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
   22011       delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpSet, &ok );
   22012       if (!ok) goto decode_failure;
   22013       return delta;
   22014    }
   22015 
   22016    case 0xAC: /* SHRDv imm8,Gv,Ev */
   22017       modrm = getUChar(delta);
   22018       d64   = delta + lengthAMode(pfx, delta);
   22019       vex_sprintf(dis_buf, "$%d", (Int)getUChar(d64));
   22020       delta = dis_SHLRD_Gv_Ev (
   22021                  vbi, pfx, delta, modrm, sz,
   22022                  mkU8(getUChar(d64)), True, /* literal */
   22023                  dis_buf, False /* right */ );
   22024       return delta;
   22025 
   22026    case 0xAD: /* SHRDv %cl,Gv,Ev */
   22027       modrm = getUChar(delta);
   22028       delta = dis_SHLRD_Gv_Ev (
   22029                  vbi, pfx, delta, modrm, sz,
   22030                  getIRegCL(), False, /* not literal */
   22031                  "%cl", False /* right */);
   22032       return delta;
   22033 
   22034    case 0xAF: /* IMUL Ev, Gv */
   22035       if (haveF2orF3(pfx)) goto decode_failure;
   22036       delta = dis_mul_E_G ( vbi, pfx, sz, delta );
   22037       return delta;
   22038 
   22039    case 0xB0: { /* CMPXCHG Gb,Eb */
   22040       Bool ok = True;
   22041       /* We let dis_cmpxchg_G_E decide whether F2 or F3 are allowable. */
   22042       delta = dis_cmpxchg_G_E ( &ok, vbi, pfx, 1, delta );
   22043       if (!ok) goto decode_failure;
   22044       return delta;
   22045    }
   22046 
   22047    case 0xB1: { /* CMPXCHG Gv,Ev (allowed in 16,32,64 bit) */
   22048       Bool ok = True;
   22049       /* We let dis_cmpxchg_G_E decide whether F2 or F3 are allowable. */
   22050       if (sz != 2 && sz != 4 && sz != 8) goto decode_failure;
   22051       delta = dis_cmpxchg_G_E ( &ok, vbi, pfx, sz, delta );
   22052       if (!ok) goto decode_failure;
   22053       return delta;
   22054    }
   22055 
   22056    case 0xB3: { /* BTR Gv,Ev */
   22057       /* We let dis_bt_G_E decide whether F2 or F3 are allowable. */
   22058       Bool ok = True;
   22059       if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
   22060       delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpReset, &ok );
   22061       if (!ok) goto decode_failure;
   22062       return delta;
   22063    }
   22064 
   22065    case 0xB6: /* MOVZXb Eb,Gv */
   22066       if (haveF2orF3(pfx)) goto decode_failure;
   22067       if (sz != 2 && sz != 4 && sz != 8)
   22068          goto decode_failure;
   22069       delta = dis_movx_E_G ( vbi, pfx, delta, 1, sz, False );
   22070       return delta;
   22071 
   22072    case 0xB7: /* MOVZXw Ew,Gv */
   22073       if (haveF2orF3(pfx)) goto decode_failure;
   22074       if (sz != 4 && sz != 8)
   22075          goto decode_failure;
   22076       delta = dis_movx_E_G ( vbi, pfx, delta, 2, sz, False );
   22077       return delta;
   22078 
   22079    case 0xBA: { /* Grp8 Ib,Ev */
   22080       /* We let dis_Grp8_Imm decide whether F2 or F3 are allowable. */
   22081       Bool decode_OK = False;
   22082       modrm = getUChar(delta);
   22083       am_sz = lengthAMode(pfx,delta);
   22084       d64   = getSDisp8(delta + am_sz);
   22085       delta = dis_Grp8_Imm ( vbi, pfx, delta, modrm, am_sz, sz, d64,
   22086                              &decode_OK );
   22087       if (!decode_OK)
   22088          goto decode_failure;
   22089       return delta;
   22090    }
   22091 
   22092    case 0xBB: { /* BTC Gv,Ev */
   22093       /* We let dis_bt_G_E decide whether F2 or F3 are allowable. */
   22094       Bool ok = False;
   22095       if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
   22096       delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpComp, &ok );
   22097       if (!ok) goto decode_failure;
   22098       return delta;
   22099    }
   22100 
   22101    case 0xBC: /* BSF Gv,Ev */
   22102       if (!haveF2orF3(pfx)
   22103           || (haveF3noF2(pfx)
   22104               && 0 == (archinfo->hwcaps & VEX_HWCAPS_AMD64_BMI))) {
   22105          /* no-F2 no-F3 0F BC = BSF
   22106                   or F3 0F BC = REP; BSF on older CPUs.  */
   22107          delta = dis_bs_E_G ( vbi, pfx, sz, delta, True );
   22108          return delta;
   22109       }
   22110       /* Fall through, since F3 0F BC is TZCNT, and needs to
   22111          be handled by dis_ESC_0F__SSE4. */
   22112       break;
   22113 
   22114    case 0xBD: /* BSR Gv,Ev */
   22115       if (!haveF2orF3(pfx)
   22116           || (haveF3noF2(pfx)
   22117               && 0 == (archinfo->hwcaps & VEX_HWCAPS_AMD64_LZCNT))) {
   22118          /* no-F2 no-F3 0F BD = BSR
   22119                   or F3 0F BD = REP; BSR on older CPUs.  */
   22120          delta = dis_bs_E_G ( vbi, pfx, sz, delta, False );
   22121          return delta;
   22122       }
   22123       /* Fall through, since F3 0F BD is LZCNT, and needs to
   22124          be handled by dis_ESC_0F__SSE4. */
   22125       break;
   22126 
   22127    case 0xBE: /* MOVSXb Eb,Gv */
   22128       if (haveF2orF3(pfx)) goto decode_failure;
   22129       if (sz != 2 && sz != 4 && sz != 8)
   22130          goto decode_failure;
   22131       delta = dis_movx_E_G ( vbi, pfx, delta, 1, sz, True );
   22132       return delta;
   22133 
   22134    case 0xBF: /* MOVSXw Ew,Gv */
   22135       if (haveF2orF3(pfx)) goto decode_failure;
   22136       if (sz != 4 && sz != 8)
   22137          goto decode_failure;
   22138       delta = dis_movx_E_G ( vbi, pfx, delta, 2, sz, True );
   22139       return delta;
   22140 
   22141    case 0xC0: { /* XADD Gb,Eb */
   22142       Bool decode_OK = False;
   22143       delta = dis_xadd_G_E ( &decode_OK, vbi, pfx, 1, delta );
   22144       if (!decode_OK)
   22145          goto decode_failure;
   22146       return delta;
   22147    }
   22148 
   22149    case 0xC1: { /* XADD Gv,Ev */
   22150       Bool decode_OK = False;
   22151       delta = dis_xadd_G_E ( &decode_OK, vbi, pfx, sz, delta );
   22152       if (!decode_OK)
   22153          goto decode_failure;
   22154       return delta;
   22155    }
   22156 
   22157    case 0xC7: { /* CMPXCHG8B Ev, CMPXCHG16B Ev */
   22158       IRType  elemTy     = sz==4 ? Ity_I32 : Ity_I64;
   22159       IRTemp  expdHi     = newTemp(elemTy);
   22160       IRTemp  expdLo     = newTemp(elemTy);
   22161       IRTemp  dataHi     = newTemp(elemTy);
   22162       IRTemp  dataLo     = newTemp(elemTy);
   22163       IRTemp  oldHi      = newTemp(elemTy);
   22164       IRTemp  oldLo      = newTemp(elemTy);
   22165       IRTemp  flags_old  = newTemp(Ity_I64);
   22166       IRTemp  flags_new  = newTemp(Ity_I64);
   22167       IRTemp  success    = newTemp(Ity_I1);
   22168       IROp    opOR       = sz==4 ? Iop_Or32    : Iop_Or64;
   22169       IROp    opXOR      = sz==4 ? Iop_Xor32   : Iop_Xor64;
   22170       IROp    opCasCmpEQ = sz==4 ? Iop_CasCmpEQ32 : Iop_CasCmpEQ64;
   22171       IRExpr* zero       = sz==4 ? mkU32(0)    : mkU64(0);
   22172       IRTemp expdHi64    = newTemp(Ity_I64);
   22173       IRTemp expdLo64    = newTemp(Ity_I64);
   22174 
   22175       /* Translate this using a DCAS, even if there is no LOCK
   22176          prefix.  Life is too short to bother with generating two
   22177          different translations for the with/without-LOCK-prefix
   22178          cases. */
   22179       *expect_CAS = True;
   22180 
   22181       /* Decode, and generate address. */
   22182       if (have66(pfx)) goto decode_failure;
   22183       if (sz != 4 && sz != 8) goto decode_failure;
   22184       if (sz == 8 && !(archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16))
   22185          goto decode_failure;
   22186       modrm = getUChar(delta);
   22187       if (epartIsReg(modrm)) goto decode_failure;
   22188       if (gregLO3ofRM(modrm) != 1) goto decode_failure;
   22189       if (haveF2orF3(pfx)) {
   22190          /* Since the e-part is memory only, F2 or F3 (one or the
   22191             other) is acceptable if LOCK is also present.  But only
   22192             for cmpxchg8b. */
   22193          if (sz == 8) goto decode_failure;
   22194          if (haveF2andF3(pfx) || !haveLOCK(pfx)) goto decode_failure;
   22195       }
   22196 
   22197       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   22198       delta += alen;
   22199 
   22200       /* cmpxchg16b requires an alignment check. */
   22201       if (sz == 8)
   22202          gen_SEGV_if_not_16_aligned( addr );
   22203 
   22204       /* Get the expected and new values. */
   22205       assign( expdHi64, getIReg64(R_RDX) );
   22206       assign( expdLo64, getIReg64(R_RAX) );
   22207 
   22208       /* These are the correctly-sized expected and new values.
   22209          However, we also get expdHi64/expdLo64 above as 64-bits
   22210          regardless, because we will need them later in the 32-bit
   22211          case (paradoxically). */
   22212       assign( expdHi, sz==4 ? unop(Iop_64to32, mkexpr(expdHi64))
   22213                             : mkexpr(expdHi64) );
   22214       assign( expdLo, sz==4 ? unop(Iop_64to32, mkexpr(expdLo64))
   22215                             : mkexpr(expdLo64) );
   22216       assign( dataHi, sz==4 ? getIReg32(R_RCX) : getIReg64(R_RCX) );
   22217       assign( dataLo, sz==4 ? getIReg32(R_RBX) : getIReg64(R_RBX) );
   22218 
   22219       /* Do the DCAS */
   22220       stmt( IRStmt_CAS(
   22221                mkIRCAS( oldHi, oldLo,
   22222                         Iend_LE, mkexpr(addr),
   22223                         mkexpr(expdHi), mkexpr(expdLo),
   22224                         mkexpr(dataHi), mkexpr(dataLo)
   22225             )));
   22226 
   22227       /* success when oldHi:oldLo == expdHi:expdLo */
   22228       assign( success,
   22229               binop(opCasCmpEQ,
   22230                     binop(opOR,
   22231                           binop(opXOR, mkexpr(oldHi), mkexpr(expdHi)),
   22232                           binop(opXOR, mkexpr(oldLo), mkexpr(expdLo))
   22233                     ),
   22234                     zero
   22235               ));
   22236 
   22237       /* If the DCAS is successful, that is to say oldHi:oldLo ==
   22238          expdHi:expdLo, then put expdHi:expdLo back in RDX:RAX,
   22239          which is where they came from originally.  Both the actual
   22240          contents of these two regs, and any shadow values, are
   22241          unchanged.  If the DCAS fails then we're putting into
   22242          RDX:RAX the value seen in memory. */
   22243       /* Now of course there's a complication in the 32-bit case
   22244          (bah!): if the DCAS succeeds, we need to leave RDX:RAX
   22245          unchanged; but if we use the same scheme as in the 64-bit
   22246          case, we get hit by the standard rule that a write to the
   22247          bottom 32 bits of an integer register zeros the upper 32
   22248          bits.  And so the upper halves of RDX and RAX mysteriously
   22249          become zero.  So we have to stuff back in the original
   22250          64-bit values which we previously stashed in
   22251          expdHi64:expdLo64, even if we're doing a cmpxchg8b. */
   22252       /* It's just _so_ much fun ... */
   22253       putIRegRDX( 8,
   22254                   IRExpr_ITE( mkexpr(success),
   22255                               mkexpr(expdHi64),
   22256                               sz == 4 ? unop(Iop_32Uto64, mkexpr(oldHi))
   22257                                       : mkexpr(oldHi)
   22258                 ));
   22259       putIRegRAX( 8,
   22260                   IRExpr_ITE( mkexpr(success),
   22261                               mkexpr(expdLo64),
   22262                               sz == 4 ? unop(Iop_32Uto64, mkexpr(oldLo))
   22263                                       : mkexpr(oldLo)
   22264                 ));
   22265 
   22266       /* Copy the success bit into the Z flag and leave the others
   22267          unchanged */
   22268       assign( flags_old, widenUto64(mk_amd64g_calculate_rflags_all()));
   22269       assign(
   22270          flags_new,
   22271          binop(Iop_Or64,
   22272                binop(Iop_And64, mkexpr(flags_old),
   22273                                 mkU64(~AMD64G_CC_MASK_Z)),
   22274                binop(Iop_Shl64,
   22275                      binop(Iop_And64,
   22276                            unop(Iop_1Uto64, mkexpr(success)), mkU64(1)),
   22277                      mkU8(AMD64G_CC_SHIFT_Z)) ));
   22278 
   22279       stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   22280       stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(flags_new) ));
   22281       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   22282       /* Set NDEP even though it isn't used.  This makes
   22283          redundant-PUT elimination of previous stores to this field
   22284          work better. */
   22285       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   22286 
   22287       /* Sheesh.  Aren't you glad it was me and not you that had to
   22288          write and validate all this grunge? */
   22289 
   22290       DIP("cmpxchg8b %s\n", dis_buf);
   22291       return delta;
   22292    }
   22293 
   22294    case 0xC8: /* BSWAP %eax */
   22295    case 0xC9:
   22296    case 0xCA:
   22297    case 0xCB:
   22298    case 0xCC:
   22299    case 0xCD:
   22300    case 0xCE:
   22301    case 0xCF: /* BSWAP %edi */
   22302       if (haveF2orF3(pfx)) goto decode_failure;
   22303       /* According to the AMD64 docs, this insn can have size 4 or
   22304          8. */
   22305       if (sz == 4) {
   22306          t1 = newTemp(Ity_I32);
   22307          assign( t1, getIRegRexB(4, pfx, opc-0xC8) );
   22308          t2 = math_BSWAP( t1, Ity_I32 );
   22309          putIRegRexB(4, pfx, opc-0xC8, mkexpr(t2));
   22310          DIP("bswapl %s\n", nameIRegRexB(4, pfx, opc-0xC8));
   22311          return delta;
   22312       }
   22313       if (sz == 8) {
   22314          t1 = newTemp(Ity_I64);
   22315          t2 = newTemp(Ity_I64);
   22316          assign( t1, getIRegRexB(8, pfx, opc-0xC8) );
   22317          t2 = math_BSWAP( t1, Ity_I64 );
   22318          putIRegRexB(8, pfx, opc-0xC8, mkexpr(t2));
   22319          DIP("bswapq %s\n", nameIRegRexB(8, pfx, opc-0xC8));
   22320          return delta;
   22321       }
   22322       goto decode_failure;
   22323 
   22324    default:
   22325       break;
   22326 
   22327    } /* first switch */
   22328 
   22329 
   22330    /* =-=-=-=-=-=-=-=-= MMXery =-=-=-=-=-=-=-=-= */
   22331    /* In the second switch, pick off MMX insns. */
   22332 
   22333    if (!have66orF2orF3(pfx)) {
   22334       /* So there's no SIMD prefix. */
   22335 
   22336       vassert(sz == 4 || sz == 8);
   22337 
   22338       switch (opc) { /* second switch */
   22339 
   22340       case 0x71:
   22341       case 0x72:
   22342       case 0x73: /* PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
   22343 
   22344       case 0x6E: /* MOVD (src)ireg-or-mem, (dst)mmxreg */
   22345       case 0x7E: /* MOVD (src)mmxreg, (dst)ireg-or-mem */
   22346       case 0x7F: /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
   22347       case 0x6F: /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
   22348 
   22349       case 0xFC:
   22350       case 0xFD:
   22351       case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
   22352 
   22353       case 0xEC:
   22354       case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
   22355 
   22356       case 0xDC:
   22357       case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   22358 
   22359       case 0xF8:
   22360       case 0xF9:
   22361       case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
   22362 
   22363       case 0xE8:
   22364       case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
   22365 
   22366       case 0xD8:
   22367       case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   22368 
   22369       case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
   22370       case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
   22371 
   22372       case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
   22373 
   22374       case 0x74:
   22375       case 0x75:
   22376       case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
   22377 
   22378       case 0x64:
   22379       case 0x65:
   22380       case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
   22381 
   22382       case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
   22383       case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
   22384       case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
   22385 
   22386       case 0x68:
   22387       case 0x69:
   22388       case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
   22389 
   22390       case 0x60:
   22391       case 0x61:
   22392       case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
   22393 
   22394       case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
   22395       case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
   22396       case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
   22397       case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
   22398 
   22399       case 0xF1: /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
   22400       case 0xF2:
   22401       case 0xF3:
   22402 
   22403       case 0xD1: /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
   22404       case 0xD2:
   22405       case 0xD3:
   22406 
   22407       case 0xE1: /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
   22408       case 0xE2: {
   22409          Bool decode_OK = False;
   22410          delta = dis_MMX ( &decode_OK, vbi, pfx, sz, deltaIN );
   22411          if (decode_OK)
   22412             return delta;
   22413          goto decode_failure;
   22414       }
   22415 
   22416       default:
   22417          break;
   22418       } /* second switch */
   22419 
   22420    }
   22421 
   22422    /* A couple of MMX corner cases */
   22423    if (opc == 0x0E/* FEMMS */ || opc == 0x77/* EMMS */) {
   22424       if (sz != 4)
   22425          goto decode_failure;
   22426       do_EMMS_preamble();
   22427       DIP("{f}emms\n");
   22428       return delta;
   22429    }
   22430 
   22431    /* =-=-=-=-=-=-=-=-= SSE2ery =-=-=-=-=-=-=-=-= */
   22432    /* Perhaps it's an SSE or SSE2 instruction.  We can try this
   22433       without checking the guest hwcaps because SSE2 is a baseline
   22434       facility in 64 bit mode. */
   22435    {
   22436       Bool decode_OK = False;
   22437       delta = dis_ESC_0F__SSE2 ( &decode_OK,
   22438                                  archinfo, vbi, pfx, sz, deltaIN, dres );
   22439       if (decode_OK)
   22440          return delta;
   22441    }
   22442 
   22443    /* =-=-=-=-=-=-=-=-= SSE3ery =-=-=-=-=-=-=-=-= */
   22444    /* Perhaps it's a SSE3 instruction.  FIXME: check guest hwcaps
   22445       first. */
   22446    {
   22447       Bool decode_OK = False;
   22448       delta = dis_ESC_0F__SSE3 ( &decode_OK, vbi, pfx, sz, deltaIN );
   22449       if (decode_OK)
   22450          return delta;
   22451    }
   22452 
   22453    /* =-=-=-=-=-=-=-=-= SSE4ery =-=-=-=-=-=-=-=-= */
   22454    /* Perhaps it's a SSE4 instruction.  FIXME: check guest hwcaps
   22455       first. */
   22456    {
   22457       Bool decode_OK = False;
   22458       delta = dis_ESC_0F__SSE4 ( &decode_OK,
   22459                                  archinfo, vbi, pfx, sz, deltaIN );
   22460       if (decode_OK)
   22461          return delta;
   22462    }
   22463 
   22464   decode_failure:
   22465    return deltaIN; /* fail */
   22466 }
   22467 
   22468 
   22469 /*------------------------------------------------------------*/
   22470 /*---                                                      ---*/
   22471 /*--- Top-level post-escape decoders: dis_ESC_0F38         ---*/
   22472 /*---                                                      ---*/
   22473 /*------------------------------------------------------------*/
   22474 
   22475 __attribute__((noinline))
   22476 static
   22477 Long dis_ESC_0F38 (
   22478         /*MB_OUT*/DisResult* dres,
   22479         Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
   22480         Bool         resteerCisOk,
   22481         void*        callback_opaque,
   22482         const VexArchInfo* archinfo,
   22483         const VexAbiInfo*  vbi,
   22484         Prefix pfx, Int sz, Long deltaIN
   22485      )
   22486 {
   22487    Long   delta = deltaIN;
   22488    UChar  opc   = getUChar(delta);
   22489    delta++;
   22490    switch (opc) {
   22491 
   22492    case 0xF0:   /* 0F 38 F0 = MOVBE m16/32/64(E), r16/32/64(G) */
   22493    case 0xF1: { /* 0F 38 F1 = MOVBE r16/32/64(G), m16/32/64(E) */
   22494       if (!haveF2orF3(pfx) && !haveVEX(pfx)
   22495           && (sz == 2 || sz == 4 || sz == 8)) {
   22496          IRTemp addr  = IRTemp_INVALID;
   22497          UChar  modrm = 0;
   22498          Int    alen  = 0;
   22499          HChar  dis_buf[50];
   22500          modrm = getUChar(delta);
   22501          if (epartIsReg(modrm)) break;
   22502          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   22503          delta += alen;
   22504          IRType ty = szToITy(sz);
   22505          IRTemp src = newTemp(ty);
   22506          if (opc == 0xF0) { /* LOAD */
   22507             assign(src, loadLE(ty, mkexpr(addr)));
   22508             IRTemp dst = math_BSWAP(src, ty);
   22509             putIRegG(sz, pfx, modrm, mkexpr(dst));
   22510             DIP("movbe %s,%s\n", dis_buf, nameIRegG(sz, pfx, modrm));
   22511          } else { /* STORE */
   22512             assign(src, getIRegG(sz, pfx, modrm));
   22513             IRTemp dst = math_BSWAP(src, ty);
   22514             storeLE(mkexpr(addr), mkexpr(dst));
   22515             DIP("movbe %s,%s\n", nameIRegG(sz, pfx, modrm), dis_buf);
   22516          }
   22517          return delta;
   22518       }
   22519       /* else fall through; maybe one of the decoders below knows what
   22520          it is. */
   22521       break;
   22522    }
   22523 
   22524    default:
   22525       break;
   22526 
   22527    }
   22528 
   22529    /* =-=-=-=-=-=-=-=-= SSSE3ery =-=-=-=-=-=-=-=-= */
   22530    /* Perhaps it's an SSSE3 instruction.  FIXME: consult guest hwcaps
   22531       rather than proceeding indiscriminately. */
   22532    {
   22533       Bool decode_OK = False;
   22534       delta = dis_ESC_0F38__SupSSE3 ( &decode_OK, vbi, pfx, sz, deltaIN );
   22535       if (decode_OK)
   22536          return delta;
   22537    }
   22538 
   22539    /* =-=-=-=-=-=-=-=-= SSE4ery =-=-=-=-=-=-=-=-= */
   22540    /* Perhaps it's an SSE4 instruction.  FIXME: consult guest hwcaps
   22541       rather than proceeding indiscriminately. */
   22542    {
   22543       Bool decode_OK = False;
   22544       delta = dis_ESC_0F38__SSE4 ( &decode_OK, vbi, pfx, sz, deltaIN );
   22545       if (decode_OK)
   22546          return delta;
   22547    }
   22548 
   22549   /*decode_failure:*/
   22550    return deltaIN; /* fail */
   22551 }
   22552 
   22553 
   22554 /*------------------------------------------------------------*/
   22555 /*---                                                      ---*/
   22556 /*--- Top-level post-escape decoders: dis_ESC_0F3A         ---*/
   22557 /*---                                                      ---*/
   22558 /*------------------------------------------------------------*/
   22559 
   22560 __attribute__((noinline))
   22561 static
   22562 Long dis_ESC_0F3A (
   22563         /*MB_OUT*/DisResult* dres,
   22564         Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
   22565         Bool         resteerCisOk,
   22566         void*        callback_opaque,
   22567         const VexArchInfo* archinfo,
   22568         const VexAbiInfo*  vbi,
   22569         Prefix pfx, Int sz, Long deltaIN
   22570      )
   22571 {
   22572    Long   delta = deltaIN;
   22573    UChar  opc   = getUChar(delta);
   22574    delta++;
   22575    switch (opc) {
   22576 
   22577    default:
   22578       break;
   22579 
   22580    }
   22581 
   22582    /* =-=-=-=-=-=-=-=-= SSSE3ery =-=-=-=-=-=-=-=-= */
   22583    /* Perhaps it's an SSSE3 instruction.  FIXME: consult guest hwcaps
   22584       rather than proceeding indiscriminately. */
   22585    {
   22586       Bool decode_OK = False;
   22587       delta = dis_ESC_0F3A__SupSSE3 ( &decode_OK, vbi, pfx, sz, deltaIN );
   22588       if (decode_OK)
   22589          return delta;
   22590    }
   22591 
   22592    /* =-=-=-=-=-=-=-=-= SSE4ery =-=-=-=-=-=-=-=-= */
   22593    /* Perhaps it's an SSE4 instruction.  FIXME: consult guest hwcaps
   22594       rather than proceeding indiscriminately. */
   22595    {
   22596       Bool decode_OK = False;
   22597       delta = dis_ESC_0F3A__SSE4 ( &decode_OK, vbi, pfx, sz, deltaIN );
   22598       if (decode_OK)
   22599          return delta;
   22600    }
   22601 
   22602    return deltaIN; /* fail */
   22603 }
   22604 
   22605 
   22606 /*------------------------------------------------------------*/
   22607 /*---                                                      ---*/
   22608 /*--- Top-level post-escape decoders: dis_ESC_0F__VEX      ---*/
   22609 /*---                                                      ---*/
   22610 /*------------------------------------------------------------*/
   22611 
   22612 /* FIXME: common up with the _256_ version below? */
   22613 static
   22614 Long dis_VEX_NDS_128_AnySimdPfx_0F_WIG (
   22615         /*OUT*/Bool* uses_vvvv, const VexAbiInfo* vbi,
   22616         Prefix pfx, Long delta, const HChar* name,
   22617         /* The actual operation.  Use either 'op' or 'opfn',
   22618            but not both. */
   22619         IROp op, IRTemp(*opFn)(IRTemp,IRTemp),
   22620         Bool invertLeftArg,
   22621         Bool swapArgs
   22622      )
   22623 {
   22624    UChar  modrm = getUChar(delta);
   22625    UInt   rD    = gregOfRexRM(pfx, modrm);
   22626    UInt   rSL   = getVexNvvvv(pfx);
   22627    IRTemp tSL   = newTemp(Ity_V128);
   22628    IRTemp tSR   = newTemp(Ity_V128);
   22629    IRTemp addr  = IRTemp_INVALID;
   22630    HChar  dis_buf[50];
   22631    Int    alen  = 0;
   22632    vassert(0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*WIG?*/);
   22633 
   22634    assign(tSL, invertLeftArg ? unop(Iop_NotV128, getXMMReg(rSL))
   22635                              : getXMMReg(rSL));
   22636 
   22637    if (epartIsReg(modrm)) {
   22638       UInt rSR = eregOfRexRM(pfx, modrm);
   22639       delta += 1;
   22640       assign(tSR, getXMMReg(rSR));
   22641       DIP("%s %s,%s,%s\n",
   22642           name, nameXMMReg(rSR), nameXMMReg(rSL), nameXMMReg(rD));
   22643    } else {
   22644       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   22645       delta += alen;
   22646       assign(tSR, loadLE(Ity_V128, mkexpr(addr)));
   22647       DIP("%s %s,%s,%s\n",
   22648           name, dis_buf, nameXMMReg(rSL), nameXMMReg(rD));
   22649    }
   22650 
   22651    IRTemp res = IRTemp_INVALID;
   22652    if (op != Iop_INVALID) {
   22653       vassert(opFn == NULL);
   22654       res = newTemp(Ity_V128);
   22655       if (requiresRMode(op)) {
   22656          IRTemp rm = newTemp(Ity_I32);
   22657          assign(rm, get_FAKE_roundingmode()); /* XXXROUNDINGFIXME */
   22658          assign(res, swapArgs
   22659                         ? triop(op, mkexpr(rm), mkexpr(tSR), mkexpr(tSL))
   22660                         : triop(op, mkexpr(rm), mkexpr(tSL), mkexpr(tSR)));
   22661       } else {
   22662          assign(res, swapArgs
   22663                         ? binop(op, mkexpr(tSR), mkexpr(tSL))
   22664                         : binop(op, mkexpr(tSL), mkexpr(tSR)));
   22665       }
   22666    } else {
   22667       vassert(opFn != NULL);
   22668       res = swapArgs ? opFn(tSR, tSL) : opFn(tSL, tSR);
   22669    }
   22670 
   22671    putYMMRegLoAndZU(rD, mkexpr(res));
   22672 
   22673    *uses_vvvv = True;
   22674    return delta;
   22675 }
   22676 
   22677 
   22678 /* Handle a VEX_NDS_128_66_0F_WIG (3-addr) insn, with a simple IROp
   22679    for the operation, no inversion of the left arg, and no swapping of
   22680    args. */
   22681 static
   22682 Long dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple (
   22683         /*OUT*/Bool* uses_vvvv, const VexAbiInfo* vbi,
   22684         Prefix pfx, Long delta, const HChar* name,
   22685         IROp op
   22686      )
   22687 {
   22688    return dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   22689              uses_vvvv, vbi, pfx, delta, name, op, NULL, False, False);
   22690 }
   22691 
   22692 
   22693 /* Handle a VEX_NDS_128_66_0F_WIG (3-addr) insn, using the given IR
   22694    generator to compute the result, no inversion of the left
   22695    arg, and no swapping of args. */
   22696 static
   22697 Long dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex (
   22698         /*OUT*/Bool* uses_vvvv, const VexAbiInfo* vbi,
   22699         Prefix pfx, Long delta, const HChar* name,
   22700         IRTemp(*opFn)(IRTemp,IRTemp)
   22701      )
   22702 {
   22703    return dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   22704              uses_vvvv, vbi, pfx, delta, name,
   22705              Iop_INVALID, opFn, False, False );
   22706 }
   22707 
   22708 
   22709 /* Vector by scalar shift of V by the amount specified at the bottom
   22710    of E. */
   22711 static ULong dis_AVX128_shiftV_byE ( const VexAbiInfo* vbi,
   22712                                      Prefix pfx, Long delta,
   22713                                      const HChar* opname, IROp op )
   22714 {
   22715    HChar   dis_buf[50];
   22716    Int     alen, size;
   22717    IRTemp  addr;
   22718    Bool    shl, shr, sar;
   22719    UChar   modrm = getUChar(delta);
   22720    UInt    rG    = gregOfRexRM(pfx,modrm);
   22721    UInt    rV    = getVexNvvvv(pfx);;
   22722    IRTemp  g0    = newTemp(Ity_V128);
   22723    IRTemp  g1    = newTemp(Ity_V128);
   22724    IRTemp  amt   = newTemp(Ity_I64);
   22725    IRTemp  amt8  = newTemp(Ity_I8);
   22726    if (epartIsReg(modrm)) {
   22727       UInt rE = eregOfRexRM(pfx,modrm);
   22728       assign( amt, getXMMRegLane64(rE, 0) );
   22729       DIP("%s %s,%s,%s\n", opname, nameXMMReg(rE),
   22730           nameXMMReg(rV), nameXMMReg(rG) );
   22731       delta++;
   22732    } else {
   22733       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   22734       assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
   22735       DIP("%s %s,%s,%s\n", opname, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
   22736       delta += alen;
   22737    }
   22738    assign( g0, getXMMReg(rV) );
   22739    assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
   22740 
   22741    shl = shr = sar = False;
   22742    size = 0;
   22743    switch (op) {
   22744       case Iop_ShlN16x8: shl = True; size = 32; break;
   22745       case Iop_ShlN32x4: shl = True; size = 32; break;
   22746       case Iop_ShlN64x2: shl = True; size = 64; break;
   22747       case Iop_SarN16x8: sar = True; size = 16; break;
   22748       case Iop_SarN32x4: sar = True; size = 32; break;
   22749       case Iop_ShrN16x8: shr = True; size = 16; break;
   22750       case Iop_ShrN32x4: shr = True; size = 32; break;
   22751       case Iop_ShrN64x2: shr = True; size = 64; break;
   22752       default: vassert(0);
   22753    }
   22754 
   22755    if (shl || shr) {
   22756      assign(
   22757         g1,
   22758         IRExpr_ITE(
   22759            binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
   22760            binop(op, mkexpr(g0), mkexpr(amt8)),
   22761            mkV128(0x0000)
   22762         )
   22763      );
   22764    } else
   22765    if (sar) {
   22766      assign(
   22767         g1,
   22768         IRExpr_ITE(
   22769            binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
   22770            binop(op, mkexpr(g0), mkexpr(amt8)),
   22771            binop(op, mkexpr(g0), mkU8(size-1))
   22772         )
   22773      );
   22774    } else {
   22775       vassert(0);
   22776    }
   22777 
   22778    putYMMRegLoAndZU( rG, mkexpr(g1) );
   22779    return delta;
   22780 }
   22781 
   22782 
   22783 /* Vector by scalar shift of V by the amount specified at the bottom
   22784    of E. */
   22785 static ULong dis_AVX256_shiftV_byE ( const VexAbiInfo* vbi,
   22786                                      Prefix pfx, Long delta,
   22787                                      const HChar* opname, IROp op )
   22788 {
   22789    HChar   dis_buf[50];
   22790    Int     alen, size;
   22791    IRTemp  addr;
   22792    Bool    shl, shr, sar;
   22793    UChar   modrm = getUChar(delta);
   22794    UInt    rG    = gregOfRexRM(pfx,modrm);
   22795    UInt    rV    = getVexNvvvv(pfx);;
   22796    IRTemp  g0    = newTemp(Ity_V256);
   22797    IRTemp  g1    = newTemp(Ity_V256);
   22798    IRTemp  amt   = newTemp(Ity_I64);
   22799    IRTemp  amt8  = newTemp(Ity_I8);
   22800    if (epartIsReg(modrm)) {
   22801       UInt rE = eregOfRexRM(pfx,modrm);
   22802       assign( amt, getXMMRegLane64(rE, 0) );
   22803       DIP("%s %s,%s,%s\n", opname, nameXMMReg(rE),
   22804           nameYMMReg(rV), nameYMMReg(rG) );
   22805       delta++;
   22806    } else {
   22807       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   22808       assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
   22809       DIP("%s %s,%s,%s\n", opname, dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
   22810       delta += alen;
   22811    }
   22812    assign( g0, getYMMReg(rV) );
   22813    assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
   22814 
   22815    shl = shr = sar = False;
   22816    size = 0;
   22817    switch (op) {
   22818       case Iop_ShlN16x16: shl = True; size = 32; break;
   22819       case Iop_ShlN32x8:  shl = True; size = 32; break;
   22820       case Iop_ShlN64x4:  shl = True; size = 64; break;
   22821       case Iop_SarN16x16: sar = True; size = 16; break;
   22822       case Iop_SarN32x8:  sar = True; size = 32; break;
   22823       case Iop_ShrN16x16: shr = True; size = 16; break;
   22824       case Iop_ShrN32x8:  shr = True; size = 32; break;
   22825       case Iop_ShrN64x4:  shr = True; size = 64; break;
   22826       default: vassert(0);
   22827    }
   22828 
   22829    if (shl || shr) {
   22830      assign(
   22831         g1,
   22832         IRExpr_ITE(
   22833            binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
   22834            binop(op, mkexpr(g0), mkexpr(amt8)),
   22835            binop(Iop_V128HLtoV256, mkV128(0), mkV128(0))
   22836         )
   22837      );
   22838    } else
   22839    if (sar) {
   22840      assign(
   22841         g1,
   22842         IRExpr_ITE(
   22843            binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
   22844            binop(op, mkexpr(g0), mkexpr(amt8)),
   22845            binop(op, mkexpr(g0), mkU8(size-1))
   22846         )
   22847      );
   22848    } else {
   22849       vassert(0);
   22850    }
   22851 
   22852    putYMMReg( rG, mkexpr(g1) );
   22853    return delta;
   22854 }
   22855 
   22856 
   22857 /* Vector by vector shift of V by the amount specified at the bottom
   22858    of E.  Vector by vector shifts are defined for all shift amounts,
   22859    so not using Iop_S*x* here (and SSE2 doesn't support variable shifts
   22860    anyway).  */
static ULong dis_AVX_var_shiftV_byE ( const VexAbiInfo* vbi,
                                      Prefix pfx, Long delta,
                                      const HChar* opname, IROp op, Bool isYMM )
{
   HChar   dis_buf[50];
   Int     alen, size, i;
   IRTemp  addr;
   UChar   modrm = getUChar(delta);
   UInt    rG    = gregOfRexRM(pfx,modrm);
   UInt    rV    = getVexNvvvv(pfx);;
   IRTemp  sV    = isYMM ? newTemp(Ity_V256) : newTemp(Ity_V128);
   IRTemp  amt   = isYMM ? newTemp(Ity_V256) : newTemp(Ity_V128);
   /* Per-lane pieces: at most 8 lanes (32-bit lanes of a YMM reg).
      Unused slots remain IRTemp_INVALID. */
   IRTemp  amts[8], sVs[8], res[8];
   /* Fetch the per-lane shift counts (E), from a register or memory. */
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( amt, isYMM ? getYMMReg(rE) : getXMMReg(rE) );
      if (isYMM) {
         DIP("%s %s,%s,%s\n", opname, nameYMMReg(rE),
             nameYMMReg(rV), nameYMMReg(rG) );
      } else {
         DIP("%s %s,%s,%s\n", opname, nameXMMReg(rE),
             nameXMMReg(rV), nameXMMReg(rG) );
      }
      delta++;
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( amt, loadLE(isYMM ? Ity_V256 : Ity_V128, mkexpr(addr)) );
      if (isYMM) {
         DIP("%s %s,%s,%s\n", opname, dis_buf, nameYMMReg(rV),
             nameYMMReg(rG) );
      } else {
         DIP("%s %s,%s,%s\n", opname, dis_buf, nameXMMReg(rV),
             nameXMMReg(rG) );
      }
      delta += alen;
   }
   assign( sV, isYMM ? getYMMReg(rV) : getXMMReg(rV) );

   /* 'size' is the lane width in bits, derived from the scalar op. */
   size = 0;
   switch (op) {
      case Iop_Shl32: size = 32; break;
      case Iop_Shl64: size = 64; break;
      case Iop_Sar32: size = 32; break;
      case Iop_Shr32: size = 32; break;
      case Iop_Shr64: size = 64; break;
      default: vassert(0);
   }

   for (i = 0; i < 8; i++) {
      sVs[i] = IRTemp_INVALID;
      amts[i] = IRTemp_INVALID;
   }
   /* Split source and counts into individual lanes. */
   switch (size) {
      case 32:
         if (isYMM) {
            breakupV256to32s( sV, &sVs[7], &sVs[6], &sVs[5], &sVs[4],
                                  &sVs[3], &sVs[2], &sVs[1], &sVs[0] );
            breakupV256to32s( amt, &amts[7], &amts[6], &amts[5], &amts[4],
                                   &amts[3], &amts[2], &amts[1], &amts[0] );
         } else {
            breakupV128to32s( sV, &sVs[3], &sVs[2], &sVs[1], &sVs[0] );
            breakupV128to32s( amt, &amts[3], &amts[2], &amts[1], &amts[0] );
        }
         break;
      case 64:
         if (isYMM) {
            breakupV256to64s( sV, &sVs[3], &sVs[2], &sVs[1], &sVs[0] );
            breakupV256to64s( amt, &amts[3], &amts[2], &amts[1], &amts[0] );
         } else {
            breakupV128to64s( sV, &sVs[1], &sVs[0] );
            breakupV128to64s( amt, &amts[1], &amts[0] );
         }
         break;
      default: vassert(0);
   }
   /* Shift each lane independently.  Counts >= the lane width give 0
      for logical shifts and a sign-fill (shift by size-1) for Sar32,
      keeping the count fed to the IR op in its defined range. */
   for (i = 0; i < 8; i++)
      if (sVs[i] != IRTemp_INVALID) {
         res[i] = size == 32 ? newTemp(Ity_I32) : newTemp(Ity_I64);
         assign( res[i],
                 IRExpr_ITE(
                    binop(size == 32 ? Iop_CmpLT32U : Iop_CmpLT64U,
                          mkexpr(amts[i]),
                          size == 32 ? mkU32(size) : mkU64(size)),
                    binop(op, mkexpr(sVs[i]),
                               unop(size == 32 ? Iop_32to8 : Iop_64to8,
                                    mkexpr(amts[i]))),
                    op == Iop_Sar32 ? binop(op, mkexpr(sVs[i]), mkU8(size-1))
                                    : size == 32 ? mkU32(0) : mkU64(0)
         ));
      }
   /* Write the lanes back; for the 128-bit form the upper YMM lanes
      are zeroed, per VEX semantics. */
   switch (size) {
      case 32:
         for (i = 0; i < 8; i++)
            putYMMRegLane32( rG, i, (i < 4 || isYMM)
                                    ? mkexpr(res[i]) : mkU32(0) );
         break;
      case 64:
         for (i = 0; i < 4; i++)
            putYMMRegLane64( rG, i, (i < 2 || isYMM)
                                    ? mkexpr(res[i]) : mkU64(0) );
         break;
      default: vassert(0);
   }

   return delta;
}
   22967 
   22968 
   22969 /* Vector by scalar shift of E into V, by an immediate byte.  Modified
   22970    version of dis_SSE_shiftE_imm. */
   22971 static
   22972 Long dis_AVX128_shiftE_to_V_imm( Prefix pfx,
   22973                                  Long delta, const HChar* opname, IROp op )
   22974 {
   22975    Bool    shl, shr, sar;
   22976    UChar   rm   = getUChar(delta);
   22977    IRTemp  e0   = newTemp(Ity_V128);
   22978    IRTemp  e1   = newTemp(Ity_V128);
   22979    UInt    rD   = getVexNvvvv(pfx);
   22980    UChar   amt, size;
   22981    vassert(epartIsReg(rm));
   22982    vassert(gregLO3ofRM(rm) == 2
   22983            || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
   22984    amt = getUChar(delta+1);
   22985    delta += 2;
   22986    DIP("%s $%d,%s,%s\n", opname,
   22987                          (Int)amt,
   22988                          nameXMMReg(eregOfRexRM(pfx,rm)),
   22989                          nameXMMReg(rD));
   22990    assign( e0, getXMMReg(eregOfRexRM(pfx,rm)) );
   22991 
   22992    shl = shr = sar = False;
   22993    size = 0;
   22994    switch (op) {
   22995       case Iop_ShlN16x8: shl = True; size = 16; break;
   22996       case Iop_ShlN32x4: shl = True; size = 32; break;
   22997       case Iop_ShlN64x2: shl = True; size = 64; break;
   22998       case Iop_SarN16x8: sar = True; size = 16; break;
   22999       case Iop_SarN32x4: sar = True; size = 32; break;
   23000       case Iop_ShrN16x8: shr = True; size = 16; break;
   23001       case Iop_ShrN32x4: shr = True; size = 32; break;
   23002       case Iop_ShrN64x2: shr = True; size = 64; break;
   23003       default: vassert(0);
   23004    }
   23005 
   23006    if (shl || shr) {
   23007      assign( e1, amt >= size
   23008                     ? mkV128(0x0000)
   23009                     : binop(op, mkexpr(e0), mkU8(amt))
   23010      );
   23011    } else
   23012    if (sar) {
   23013      assign( e1, amt >= size
   23014                     ? binop(op, mkexpr(e0), mkU8(size-1))
   23015                     : binop(op, mkexpr(e0), mkU8(amt))
   23016      );
   23017    } else {
   23018       vassert(0);
   23019    }
   23020 
   23021    putYMMRegLoAndZU( rD, mkexpr(e1) );
   23022    return delta;
   23023 }
   23024 
   23025 
   23026 /* Vector by scalar shift of E into V, by an immediate byte.  Modified
   23027    version of dis_AVX128_shiftE_to_V_imm. */
   23028 static
   23029 Long dis_AVX256_shiftE_to_V_imm( Prefix pfx,
   23030                                  Long delta, const HChar* opname, IROp op )
   23031 {
   23032    Bool    shl, shr, sar;
   23033    UChar   rm   = getUChar(delta);
   23034    IRTemp  e0   = newTemp(Ity_V256);
   23035    IRTemp  e1   = newTemp(Ity_V256);
   23036    UInt    rD   = getVexNvvvv(pfx);
   23037    UChar   amt, size;
   23038    vassert(epartIsReg(rm));
   23039    vassert(gregLO3ofRM(rm) == 2
   23040            || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
   23041    amt = getUChar(delta+1);
   23042    delta += 2;
   23043    DIP("%s $%d,%s,%s\n", opname,
   23044                          (Int)amt,
   23045                          nameYMMReg(eregOfRexRM(pfx,rm)),
   23046                          nameYMMReg(rD));
   23047    assign( e0, getYMMReg(eregOfRexRM(pfx,rm)) );
   23048 
   23049    shl = shr = sar = False;
   23050    size = 0;
   23051    switch (op) {
   23052       case Iop_ShlN16x16: shl = True; size = 16; break;
   23053       case Iop_ShlN32x8:  shl = True; size = 32; break;
   23054       case Iop_ShlN64x4:  shl = True; size = 64; break;
   23055       case Iop_SarN16x16: sar = True; size = 16; break;
   23056       case Iop_SarN32x8:  sar = True; size = 32; break;
   23057       case Iop_ShrN16x16: shr = True; size = 16; break;
   23058       case Iop_ShrN32x8:  shr = True; size = 32; break;
   23059       case Iop_ShrN64x4:  shr = True; size = 64; break;
   23060       default: vassert(0);
   23061    }
   23062 
   23063 
   23064    if (shl || shr) {
   23065      assign( e1, amt >= size
   23066                     ? binop(Iop_V128HLtoV256, mkV128(0), mkV128(0))
   23067                     : binop(op, mkexpr(e0), mkU8(amt))
   23068      );
   23069    } else
   23070    if (sar) {
   23071      assign( e1, amt >= size
   23072                     ? binop(op, mkexpr(e0), mkU8(size-1))
   23073                     : binop(op, mkexpr(e0), mkU8(amt))
   23074      );
   23075    } else {
   23076       vassert(0);
   23077    }
   23078 
   23079    putYMMReg( rD, mkexpr(e1) );
   23080    return delta;
   23081 }
   23082 
   23083 
   23084 /* Lower 64-bit lane only AVX128 binary operation:
   23085    G[63:0]    = V[63:0] `op` E[63:0]
   23086    G[127:64]  = V[127:64]
   23087    G[255:128] = 0.
   23088    The specified op must be of the 64F0x2 kind, so that it
   23089    copies the upper half of the left operand to the result.
   23090 */
   23091 static Long dis_AVX128_E_V_to_G_lo64 ( /*OUT*/Bool* uses_vvvv,
   23092                                        const VexAbiInfo* vbi,
   23093                                        Prefix pfx, Long delta,
   23094                                        const HChar* opname, IROp op )
   23095 {
   23096    HChar   dis_buf[50];
   23097    Int     alen;
   23098    IRTemp  addr;
   23099    UChar   rm    = getUChar(delta);
   23100    UInt    rG    = gregOfRexRM(pfx,rm);
   23101    UInt    rV    = getVexNvvvv(pfx);
   23102    IRExpr* vpart = getXMMReg(rV);
   23103    if (epartIsReg(rm)) {
   23104       UInt rE = eregOfRexRM(pfx,rm);
   23105       putXMMReg( rG, binop(op, vpart, getXMMReg(rE)) );
   23106       DIP("%s %s,%s,%s\n", opname,
   23107           nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   23108       delta = delta+1;
   23109    } else {
   23110       /* We can only do a 64-bit memory read, so the upper half of the
   23111          E operand needs to be made simply of zeroes. */
   23112       IRTemp epart = newTemp(Ity_V128);
   23113       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23114       assign( epart, unop( Iop_64UtoV128,
   23115                            loadLE(Ity_I64, mkexpr(addr))) );
   23116       putXMMReg( rG, binop(op, vpart, mkexpr(epart)) );
   23117       DIP("%s %s,%s,%s\n", opname,
   23118           dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   23119       delta = delta+alen;
   23120    }
   23121    putYMMRegLane128( rG, 1, mkV128(0) );
   23122    *uses_vvvv = True;
   23123    return delta;
   23124 }
   23125 
   23126 
   23127 /* Lower 64-bit lane only AVX128 unary operation:
   23128    G[63:0]    = op(E[63:0])
   23129    G[127:64]  = V[127:64]
   23130    G[255:128] = 0
   23131    The specified op must be of the 64F0x2 kind, so that it
   23132    copies the upper half of the operand to the result.
   23133 */
   23134 static Long dis_AVX128_E_V_to_G_lo64_unary ( /*OUT*/Bool* uses_vvvv,
   23135                                              const VexAbiInfo* vbi,
   23136                                              Prefix pfx, Long delta,
   23137                                              const HChar* opname, IROp op )
   23138 {
   23139    HChar   dis_buf[50];
   23140    Int     alen;
   23141    IRTemp  addr;
   23142    UChar   rm  = getUChar(delta);
   23143    UInt    rG  = gregOfRexRM(pfx,rm);
   23144    UInt    rV  = getVexNvvvv(pfx);
   23145    IRTemp  e64 = newTemp(Ity_I64);
   23146 
   23147    /* Fetch E[63:0] */
   23148    if (epartIsReg(rm)) {
   23149       UInt rE = eregOfRexRM(pfx,rm);
   23150       assign(e64, getXMMRegLane64(rE, 0));
   23151       DIP("%s %s,%s,%s\n", opname,
   23152           nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   23153       delta += 1;
   23154    } else {
   23155       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23156       assign(e64, loadLE(Ity_I64, mkexpr(addr)));
   23157       DIP("%s %s,%s,%s\n", opname,
   23158           dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   23159       delta += alen;
   23160    }
   23161 
   23162    /* Create a value 'arg' as V[127:64]++E[63:0] */
   23163    IRTemp arg = newTemp(Ity_V128);
   23164    assign(arg,
   23165           binop(Iop_SetV128lo64,
   23166                 getXMMReg(rV), mkexpr(e64)));
   23167    /* and apply op to it */
   23168    putYMMRegLoAndZU( rG, unop(op, mkexpr(arg)) );
   23169    *uses_vvvv = True;
   23170    return delta;
   23171 }
   23172 
   23173 
   23174 /* Lower 32-bit lane only AVX128 unary operation:
   23175    G[31:0]    = op(E[31:0])
   23176    G[127:32]  = V[127:32]
   23177    G[255:128] = 0
   23178    The specified op must be of the 32F0x4 kind, so that it
   23179    copies the upper 3/4 of the operand to the result.
   23180 */
   23181 static Long dis_AVX128_E_V_to_G_lo32_unary ( /*OUT*/Bool* uses_vvvv,
   23182                                              const VexAbiInfo* vbi,
   23183                                              Prefix pfx, Long delta,
   23184                                              const HChar* opname, IROp op )
   23185 {
   23186    HChar   dis_buf[50];
   23187    Int     alen;
   23188    IRTemp  addr;
   23189    UChar   rm  = getUChar(delta);
   23190    UInt    rG  = gregOfRexRM(pfx,rm);
   23191    UInt    rV  = getVexNvvvv(pfx);
   23192    IRTemp  e32 = newTemp(Ity_I32);
   23193 
   23194    /* Fetch E[31:0] */
   23195    if (epartIsReg(rm)) {
   23196       UInt rE = eregOfRexRM(pfx,rm);
   23197       assign(e32, getXMMRegLane32(rE, 0));
   23198       DIP("%s %s,%s,%s\n", opname,
   23199           nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   23200       delta += 1;
   23201    } else {
   23202       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23203       assign(e32, loadLE(Ity_I32, mkexpr(addr)));
   23204       DIP("%s %s,%s,%s\n", opname,
   23205           dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   23206       delta += alen;
   23207    }
   23208 
   23209    /* Create a value 'arg' as V[127:32]++E[31:0] */
   23210    IRTemp arg = newTemp(Ity_V128);
   23211    assign(arg,
   23212           binop(Iop_SetV128lo32,
   23213                 getXMMReg(rV), mkexpr(e32)));
   23214    /* and apply op to it */
   23215    putYMMRegLoAndZU( rG, unop(op, mkexpr(arg)) );
   23216    *uses_vvvv = True;
   23217    return delta;
   23218 }
   23219 
   23220 
   23221 /* Lower 32-bit lane only AVX128 binary operation:
   23222    G[31:0]    = V[31:0] `op` E[31:0]
   23223    G[127:32]  = V[127:32]
   23224    G[255:128] = 0.
   23225    The specified op must be of the 32F0x4 kind, so that it
   23226    copies the upper 3/4 of the left operand to the result.
   23227 */
   23228 static Long dis_AVX128_E_V_to_G_lo32 ( /*OUT*/Bool* uses_vvvv,
   23229                                        const VexAbiInfo* vbi,
   23230                                        Prefix pfx, Long delta,
   23231                                        const HChar* opname, IROp op )
   23232 {
   23233    HChar   dis_buf[50];
   23234    Int     alen;
   23235    IRTemp  addr;
   23236    UChar   rm    = getUChar(delta);
   23237    UInt    rG    = gregOfRexRM(pfx,rm);
   23238    UInt    rV    = getVexNvvvv(pfx);
   23239    IRExpr* vpart = getXMMReg(rV);
   23240    if (epartIsReg(rm)) {
   23241       UInt rE = eregOfRexRM(pfx,rm);
   23242       putXMMReg( rG, binop(op, vpart, getXMMReg(rE)) );
   23243       DIP("%s %s,%s,%s\n", opname,
   23244           nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   23245       delta = delta+1;
   23246    } else {
   23247       /* We can only do a 32-bit memory read, so the upper 3/4 of the
   23248          E operand needs to be made simply of zeroes. */
   23249       IRTemp epart = newTemp(Ity_V128);
   23250       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23251       assign( epart, unop( Iop_32UtoV128,
   23252                            loadLE(Ity_I32, mkexpr(addr))) );
   23253       putXMMReg( rG, binop(op, vpart, mkexpr(epart)) );
   23254       DIP("%s %s,%s,%s\n", opname,
   23255           dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   23256       delta = delta+alen;
   23257    }
   23258    putYMMRegLane128( rG, 1, mkV128(0) );
   23259    *uses_vvvv = True;
   23260    return delta;
   23261 }
   23262 
   23263 
   23264 /* All-lanes AVX128 binary operation:
   23265    G[127:0]   = V[127:0] `op` E[127:0]
   23266    G[255:128] = 0.
   23267 */
static Long dis_AVX128_E_V_to_G ( /*OUT*/Bool* uses_vvvv,
                                  const VexAbiInfo* vbi,
                                  Prefix pfx, Long delta,
                                  const HChar* opname, IROp op )
{
   /* Thin wrapper: delegate to the general 128-bit NDS handler, with
      no inversion of the left arg and no argument swapping. */
   return dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
             uses_vvvv, vbi, pfx, delta, opname, op,
             NULL, False/*!invertLeftArg*/, False/*!swapArgs*/
   );
}
   23278 
   23279 
   23280 /* Handles AVX128 32F/64F comparisons.  A derivative of
   23281    dis_SSEcmp_E_to_G.  It can fail, in which case it returns the
   23282    original delta to indicate failure. */
static
Long dis_AVX128_cmp_V_E_to_G ( /*OUT*/Bool* uses_vvvv,
                               const VexAbiInfo* vbi,
                               Prefix pfx, Long delta,
                               const HChar* opname, Bool all_lanes, Int sz )
{
   vassert(sz == 4 || sz == 8);
   Long    deltaIN = delta;
   HChar   dis_buf[50];
   Int     alen;
   UInt    imm8;
   IRTemp  addr;
   Bool    preSwap = False;
   IROp    op      = Iop_INVALID;
   Bool    postNot = False;
   IRTemp  plain   = newTemp(Ity_V128);
   UChar   rm      = getUChar(delta);
   UInt    rG      = gregOfRexRM(pfx, rm);
   UInt    rV      = getVexNvvvv(pfx);
   IRTemp argL     = newTemp(Ity_V128);
   IRTemp argR     = newTemp(Ity_V128);

   assign(argL, getXMMReg(rV));
   if (epartIsReg(rm)) {
      imm8 = getUChar(delta+1);
      /* findSSECmpOp decodes imm8 into an IR compare op plus possible
         argument-swap (preSwap) and result-negation (postNot)
         adjustments; it fails for imm8 values we can't handle. */
      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
      if (!ok) return deltaIN; /* FAIL */
      UInt rE = eregOfRexRM(pfx,rm);
      assign(argR, getXMMReg(rE));
      delta += 1+1;
      DIP("%s $%u,%s,%s,%s\n",
          opname, imm8,
          nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
      imm8 = getUChar(delta+alen);
      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
      if (!ok) return deltaIN; /* FAIL */
      /* For the scalar (one-lane) cases, only the low lane is loaded
         from memory; the rest of argR is zero. */
      assign(argR,
             all_lanes   ? loadLE(Ity_V128, mkexpr(addr))
             : sz == 8   ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
             : /*sz==4*/   unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr))));
      delta += alen+1;
      DIP("%s $%u,%s,%s,%s\n",
          opname, imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   }

   assign(plain, preSwap ? binop(op, mkexpr(argR), mkexpr(argL))
                         : binop(op, mkexpr(argL), mkexpr(argR)));

   if (all_lanes) {
      /* This is simple: just invert the result, if necessary, and
         have done. */
      if (postNot) {
         putYMMRegLoAndZU( rG, unop(Iop_NotV128, mkexpr(plain)) );
      } else {
         putYMMRegLoAndZU( rG, mkexpr(plain) );
      }
   }
   else
   if (!preSwap) {
      /* More complex.  It's a one-lane-only, hence need to possibly
         invert only that one lane.  But at least the other lanes are
         correctly "in" the result, having been copied from the left
         operand (argL). */
      if (postNot) {
         /* XORing with a low-lane-only mask inverts just that lane. */
         IRExpr* mask = mkV128(sz==4 ? 0x000F : 0x00FF);
         putYMMRegLoAndZU( rG, binop(Iop_XorV128, mkexpr(plain),
                                                  mask) );
      } else {
         putYMMRegLoAndZU( rG, mkexpr(plain) );
      }
   }
   else {
      /* This is the most complex case.  One-lane-only, but the args
         were swapped.  So we have to possibly invert the bottom lane,
         and (definitely) we have to copy the upper lane(s) from argL
         since, due to the swapping, what's currently there is from
         argR, which is not correct. */
      IRTemp res     = newTemp(Ity_V128);
      IRTemp mask    = newTemp(Ity_V128);
      IRTemp notMask = newTemp(Ity_V128);
      /* mask selects the low lane; notMask selects everything else. */
      assign(mask,    mkV128(sz==4 ? 0x000F : 0x00FF));
      assign(notMask, mkV128(sz==4 ? 0xFFF0 : 0xFF00));
      if (postNot) {
         assign(res,
                binop(Iop_OrV128,
                      binop(Iop_AndV128,
                            unop(Iop_NotV128, mkexpr(plain)),
                            mkexpr(mask)),
                      binop(Iop_AndV128, mkexpr(argL), mkexpr(notMask))));
      } else {
         assign(res,
                binop(Iop_OrV128,
                      binop(Iop_AndV128,
                            mkexpr(plain),
                            mkexpr(mask)),
                      binop(Iop_AndV128, mkexpr(argL), mkexpr(notMask))));
      }
      putYMMRegLoAndZU( rG, mkexpr(res) );
   }

   *uses_vvvv = True;
   return delta;
}
   23388 
   23389 
   23390 /* Handles AVX256 32F/64F comparisons.  A derivative of
   23391    dis_SSEcmp_E_to_G.  It can fail, in which case it returns the
   23392    original delta to indicate failure. */
static
Long dis_AVX256_cmp_V_E_to_G ( /*OUT*/Bool* uses_vvvv,
                               const VexAbiInfo* vbi,
                               Prefix pfx, Long delta,
                               const HChar* opname, Int sz )
{
   vassert(sz == 4 || sz == 8);
   Long    deltaIN = delta;
   HChar   dis_buf[50];
   Int     alen;
   UInt    imm8;
   IRTemp  addr;
   Bool    preSwap = False;
   IROp    op      = Iop_INVALID;
   Bool    postNot = False;
   IRTemp  plain   = newTemp(Ity_V256);
   UChar   rm      = getUChar(delta);
   UInt    rG      = gregOfRexRM(pfx, rm);
   UInt    rV      = getVexNvvvv(pfx);
   IRTemp argL     = newTemp(Ity_V256);
   IRTemp argR     = newTemp(Ity_V256);
   IRTemp argLhi   = IRTemp_INVALID;
   IRTemp argLlo   = IRTemp_INVALID;
   IRTemp argRhi   = IRTemp_INVALID;
   IRTemp argRlo   = IRTemp_INVALID;

   assign(argL, getYMMReg(rV));
   if (epartIsReg(rm)) {
      imm8 = getUChar(delta+1);
      /* findSSECmpOp decodes imm8 into an IR compare op plus possible
         argument-swap (preSwap) and result-negation (postNot)
         adjustments; it fails for imm8 values we can't handle. */
      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8,
                             True/*all_lanes*/, sz);
      if (!ok) return deltaIN; /* FAIL */
      UInt rE = eregOfRexRM(pfx,rm);
      assign(argR, getYMMReg(rE));
      delta += 1+1;
      DIP("%s $%u,%s,%s,%s\n",
          opname, imm8,
          nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
      imm8 = getUChar(delta+alen);
      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8,
                             True/*all_lanes*/, sz);
      if (!ok) return deltaIN; /* FAIL */
      assign(argR, loadLE(Ity_V256, mkexpr(addr)) );
      delta += alen+1;
      DIP("%s $%u,%s,%s,%s\n",
          opname, imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
   }

   /* There is no 256-bit IR compare op, so do the comparison as two
      128-bit halves.  NB: despite the names, when preSwap holds,
      argLhi/argLlo receive the halves of argR (and vice versa) --
      that is how the argument swap is implemented. */
   breakupV256toV128s( preSwap ? argR : argL, &argLhi, &argLlo );
   breakupV256toV128s( preSwap ? argL : argR, &argRhi, &argRlo );
   assign(plain, binop( Iop_V128HLtoV256,
                        binop(op, mkexpr(argLhi), mkexpr(argRhi)),
                        binop(op, mkexpr(argLlo), mkexpr(argRlo)) ) );

   /* This is simple: just invert the result, if necessary, and
      have done. */
   if (postNot) {
      putYMMReg( rG, unop(Iop_NotV256, mkexpr(plain)) );
   } else {
      putYMMReg( rG, mkexpr(plain) );
   }

   *uses_vvvv = True;
   return delta;
}
   23460 
   23461 
   23462 /* Handles AVX128 unary E-to-G all-lanes operations. */
   23463 static
   23464 Long dis_AVX128_E_to_G_unary ( /*OUT*/Bool* uses_vvvv,
   23465                                const VexAbiInfo* vbi,
   23466                                Prefix pfx, Long delta,
   23467                                const HChar* opname,
   23468                                IRTemp (*opFn)(IRTemp) )
   23469 {
   23470    HChar  dis_buf[50];
   23471    Int    alen;
   23472    IRTemp addr;
   23473    IRTemp res  = newTemp(Ity_V128);
   23474    IRTemp arg  = newTemp(Ity_V128);
   23475    UChar  rm   = getUChar(delta);
   23476    UInt   rG   = gregOfRexRM(pfx, rm);
   23477    if (epartIsReg(rm)) {
   23478       UInt rE = eregOfRexRM(pfx,rm);
   23479       assign(arg, getXMMReg(rE));
   23480       delta += 1;
   23481       DIP("%s %s,%s\n", opname, nameXMMReg(rE), nameXMMReg(rG));
   23482    } else {
   23483       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23484       assign(arg, loadLE(Ity_V128, mkexpr(addr)));
   23485       delta += alen;
   23486       DIP("%s %s,%s\n", opname, dis_buf, nameXMMReg(rG));
   23487    }
   23488    res = opFn(arg);
   23489    putYMMRegLoAndZU( rG, mkexpr(res) );
   23490    *uses_vvvv = False;
   23491    return delta;
   23492 }
   23493 
   23494 
   23495 /* Handles AVX128 unary E-to-G all-lanes operations. */
   23496 static
   23497 Long dis_AVX128_E_to_G_unary_all ( /*OUT*/Bool* uses_vvvv,
   23498                                    const VexAbiInfo* vbi,
   23499                                    Prefix pfx, Long delta,
   23500                                    const HChar* opname, IROp op )
   23501 {
   23502    HChar  dis_buf[50];
   23503    Int    alen;
   23504    IRTemp addr;
   23505    IRTemp arg  = newTemp(Ity_V128);
   23506    UChar  rm   = getUChar(delta);
   23507    UInt   rG   = gregOfRexRM(pfx, rm);
   23508    if (epartIsReg(rm)) {
   23509       UInt rE = eregOfRexRM(pfx,rm);
   23510       assign(arg, getXMMReg(rE));
   23511       delta += 1;
   23512       DIP("%s %s,%s\n", opname, nameXMMReg(rE), nameXMMReg(rG));
   23513    } else {
   23514       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23515       assign(arg, loadLE(Ity_V128, mkexpr(addr)));
   23516       delta += alen;
   23517       DIP("%s %s,%s\n", opname, dis_buf, nameXMMReg(rG));
   23518    }
   23519    // Sqrt32Fx4 and Sqrt64Fx2 take a rounding mode, which is faked
   23520    // up in the usual way.
   23521    Bool needsIRRM = op == Iop_Sqrt32Fx4 || op == Iop_Sqrt64Fx2;
   23522    /* XXXROUNDINGFIXME */
   23523    IRExpr* res = needsIRRM ? binop(op, get_FAKE_roundingmode(), mkexpr(arg))
   23524                            : unop(op, mkexpr(arg));
   23525    putYMMRegLoAndZU( rG, res );
   23526    *uses_vvvv = False;
   23527    return delta;
   23528 }
   23529 
   23530 
   23531 /* FIXME: common up with the _128_ version above? */
static
Long dis_VEX_NDS_256_AnySimdPfx_0F_WIG (
        /*OUT*/Bool* uses_vvvv, const VexAbiInfo* vbi,
        Prefix pfx, Long delta, const HChar* name,
        /* The actual operation.  Use either 'op' or 'opfn',
           but not both. */
        IROp op, IRTemp(*opFn)(IRTemp,IRTemp),
        Bool invertLeftArg,
        Bool swapArgs
     )
{
   UChar  modrm = getUChar(delta);
   UInt   rD    = gregOfRexRM(pfx, modrm);
   UInt   rSL   = getVexNvvvv(pfx);
   IRTemp tSL   = newTemp(Ity_V256);
   IRTemp tSR   = newTemp(Ity_V256);
   IRTemp addr  = IRTemp_INVALID;
   HChar  dis_buf[50];
   Int    alen  = 0;
   vassert(1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*WIG?*/);

   /* Left source is the vvvv register, optionally complemented
      (used for e.g. the ANDN-style ops). */
   assign(tSL, invertLeftArg ? unop(Iop_NotV256, getYMMReg(rSL))
                             : getYMMReg(rSL));

   if (epartIsReg(modrm)) {
      UInt rSR = eregOfRexRM(pfx, modrm);
      delta += 1;
      assign(tSR, getYMMReg(rSR));
      DIP("%s %s,%s,%s\n",
          name, nameYMMReg(rSR), nameYMMReg(rSL), nameYMMReg(rD));
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      delta += alen;
      assign(tSR, loadLE(Ity_V256, mkexpr(addr)));
      DIP("%s %s,%s,%s\n",
          name, dis_buf, nameYMMReg(rSL), nameYMMReg(rD));
   }

   IRTemp res = IRTemp_INVALID;
   if (op != Iop_INVALID) {
      vassert(opFn == NULL);
      res = newTemp(Ity_V256);
      /* Rounding-mode-taking ops get a fake (default) rounding
         mode as their first IR argument. */
      if (requiresRMode(op)) {
         IRTemp rm = newTemp(Ity_I32);
         assign(rm, get_FAKE_roundingmode()); /* XXXROUNDINGFIXME */
         assign(res, swapArgs
                        ? triop(op, mkexpr(rm), mkexpr(tSR), mkexpr(tSL))
                        : triop(op, mkexpr(rm), mkexpr(tSL), mkexpr(tSR)));
      } else {
         assign(res, swapArgs
                        ? binop(op, mkexpr(tSR), mkexpr(tSL))
                        : binop(op, mkexpr(tSL), mkexpr(tSR)));
      }
   } else {
      vassert(opFn != NULL);
      res = swapArgs ? opFn(tSR, tSL) : opFn(tSL, tSR);
   }

   putYMMReg(rD, mkexpr(res));

   *uses_vvvv = True;
   return delta;
}
   23595 
   23596 
   23597 /* All-lanes AVX256 binary operation:
   23598    G[255:0] = V[255:0] `op` E[255:0]
   23599 */
   23600 static Long dis_AVX256_E_V_to_G ( /*OUT*/Bool* uses_vvvv,
   23601                                   const VexAbiInfo* vbi,
   23602                                   Prefix pfx, Long delta,
   23603                                   const HChar* opname, IROp op )
   23604 {
   23605    return dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
   23606              uses_vvvv, vbi, pfx, delta, opname, op,
   23607              NULL, False/*!invertLeftArg*/, False/*!swapArgs*/
   23608    );
   23609 }
   23610 
   23611 
   23612 /* Handle a VEX_NDS_256_66_0F_WIG (3-addr) insn, with a simple IROp
   23613    for the operation, no inversion of the left arg, and no swapping of
   23614    args. */
   23615 static
   23616 Long dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple (
   23617         /*OUT*/Bool* uses_vvvv, const VexAbiInfo* vbi,
   23618         Prefix pfx, Long delta, const HChar* name,
   23619         IROp op
   23620      )
   23621 {
   23622    return dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
   23623              uses_vvvv, vbi, pfx, delta, name, op, NULL, False, False);
   23624 }
   23625 
   23626 
   23627 /* Handle a VEX_NDS_256_66_0F_WIG (3-addr) insn, using the given IR
   23628    generator to compute the result, no inversion of the left
   23629    arg, and no swapping of args. */
   23630 static
   23631 Long dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex (
   23632         /*OUT*/Bool* uses_vvvv, const VexAbiInfo* vbi,
   23633         Prefix pfx, Long delta, const HChar* name,
   23634         IRTemp(*opFn)(IRTemp,IRTemp)
   23635      )
   23636 {
   23637    return dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
   23638              uses_vvvv, vbi, pfx, delta, name,
   23639              Iop_INVALID, opFn, False, False );
   23640 }
   23641 
   23642 
   23643 /* Handles AVX256 unary E-to-G all-lanes operations. */
   23644 static
   23645 Long dis_AVX256_E_to_G_unary ( /*OUT*/Bool* uses_vvvv,
   23646                                const VexAbiInfo* vbi,
   23647                                Prefix pfx, Long delta,
   23648                                const HChar* opname,
   23649                                IRTemp (*opFn)(IRTemp) )
   23650 {
   23651    HChar  dis_buf[50];
   23652    Int    alen;
   23653    IRTemp addr;
   23654    IRTemp res  = newTemp(Ity_V256);
   23655    IRTemp arg  = newTemp(Ity_V256);
   23656    UChar  rm   = getUChar(delta);
   23657    UInt   rG   = gregOfRexRM(pfx, rm);
   23658    if (epartIsReg(rm)) {
   23659       UInt rE = eregOfRexRM(pfx,rm);
   23660       assign(arg, getYMMReg(rE));
   23661       delta += 1;
   23662       DIP("%s %s,%s\n", opname, nameYMMReg(rE), nameYMMReg(rG));
   23663    } else {
   23664       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23665       assign(arg, loadLE(Ity_V256, mkexpr(addr)));
   23666       delta += alen;
   23667       DIP("%s %s,%s\n", opname, dis_buf, nameYMMReg(rG));
   23668    }
   23669    res = opFn(arg);
   23670    putYMMReg( rG, mkexpr(res) );
   23671    *uses_vvvv = False;
   23672    return delta;
   23673 }
   23674 
   23675 
   23676 /* Handles AVX256 unary E-to-G all-lanes operations. */
   23677 static
   23678 Long dis_AVX256_E_to_G_unary_all ( /*OUT*/Bool* uses_vvvv,
   23679                                    const VexAbiInfo* vbi,
   23680                                    Prefix pfx, Long delta,
   23681                                    const HChar* opname, IROp op )
   23682 {
   23683    HChar  dis_buf[50];
   23684    Int    alen;
   23685    IRTemp addr;
   23686    IRTemp arg  = newTemp(Ity_V256);
   23687    UChar  rm   = getUChar(delta);
   23688    UInt   rG   = gregOfRexRM(pfx, rm);
   23689    if (epartIsReg(rm)) {
   23690       UInt rE = eregOfRexRM(pfx,rm);
   23691       assign(arg, getYMMReg(rE));
   23692       delta += 1;
   23693       DIP("%s %s,%s\n", opname, nameYMMReg(rE), nameYMMReg(rG));
   23694    } else {
   23695       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23696       assign(arg, loadLE(Ity_V256, mkexpr(addr)));
   23697       delta += alen;
   23698       DIP("%s %s,%s\n", opname, dis_buf, nameYMMReg(rG));
   23699    }
   23700    putYMMReg( rG, unop(op, mkexpr(arg)) );
   23701    *uses_vvvv = False;
   23702    return delta;
   23703 }
   23704 
   23705 
/* The use of ReinterpF64asI64 is ugly.  Surely could do better if we
   had a variant of Iop_64x4toV256 that took F64s as args instead. */
/* VCVTDQ2PD (256-bit form): convert 4 x I32, taken from an XMM
   register or a 128-bit memory operand, to 4 x F64 in the full
   256-bit destination register. */
static Long dis_CVTDQ2PD_256 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp sV    = newTemp(Ity_V128);   /* the four source I32 lanes */
   UInt   rG    = gregOfRexRM(pfx,modrm);
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( sV, getXMMReg(rE) );
      delta += 1;
      DIP("vcvtdq2pd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      DIP("vcvtdq2pd %s,%s\n", dis_buf, nameYMMReg(rG) );
   }
   IRTemp s3, s2, s1, s0;
   s3 = s2 = s1 = s0 = IRTemp_INVALID;
   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   /* Widen each I32 lane to F64 (Iop_I32StoF64 takes no rounding
      mode), reinterpret as I64, and reassemble the four 64-bit
      results into one V256. */
   IRExpr* res
      = IRExpr_Qop(
           Iop_64x4toV256,
           unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s3))),
           unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s2))),
           unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s1))),
           unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s0)))
        );
   putYMMReg(rG, res);
   return delta;
}
   23742 
   23743 
/* VCVTPD2PS (256-bit form, "vcvtpd2psy"): narrow 4 x F64, taken from
   a YMM register or a 256-bit memory operand, to 4 x F32 in the low
   128 bits of the destination; the upper 128 bits are zeroed. */
static Long dis_CVTPD2PS_256 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp argV  = newTemp(Ity_V256);   /* the four source F64 lanes */
   IRTemp rmode = newTemp(Ity_I32);    /* current SSE rounding mode */
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getYMMReg(rE) );
      delta += 1;
      DIP("vcvtpd2psy %s,%s\n", nameYMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
      delta += alen;
      DIP("vcvtpd2psy %s,%s\n", dis_buf, nameXMMReg(rG) );
   }

   assign( rmode, get_sse_roundingmode() );
   IRTemp t3, t2, t1, t0;
   t3 = t2 = t1 = t0 = IRTemp_INVALID;
   breakupV256to64s( argV, &t3, &t2, &t1, &t0 );
   /* F64->F32 can lose precision, so Iop_F64toF32 takes the rounding
      mode; each 64-bit chunk is first reinterpreted back as F64. */
#  define CVT(_t)  binop( Iop_F64toF32, mkexpr(rmode), \
                          unop(Iop_ReinterpI64asF64, mkexpr(_t)) )
   putXMMRegLane32F( rG, 3, CVT(t3) );
   putXMMRegLane32F( rG, 2, CVT(t2) );
   putXMMRegLane32F( rG, 1, CVT(t1) );
   putXMMRegLane32F( rG, 0, CVT(t0) );
#  undef CVT
   /* VEX encoding zeroes the upper lane of the destination YMM. */
   putYMMRegLane128( rG, 1, mkV128(0) );
   return delta;
}
   23780 
   23781 
   23782 static IRTemp math_VPUNPCK_YMM ( IRTemp tL, IRType tR, IROp op )
   23783 {
   23784    IRTemp tLhi, tLlo, tRhi, tRlo;
   23785    tLhi = tLlo = tRhi = tRlo = IRTemp_INVALID;
   23786    IRTemp res = newTemp(Ity_V256);
   23787    breakupV256toV128s( tL, &tLhi, &tLlo );
   23788    breakupV256toV128s( tR, &tRhi, &tRlo );
   23789    assign( res, binop( Iop_V128HLtoV256,
   23790                        binop( op, mkexpr(tRhi), mkexpr(tLhi) ),
   23791                        binop( op, mkexpr(tRlo), mkexpr(tLlo) ) ) );
   23792    return res;
   23793 }
   23794 
   23795 
static IRTemp math_VPUNPCKLBW_YMM ( IRTemp tL, IRTemp tR )
{
   /* VPUNPCKLBW: interleave low-half bytes, per 128-bit lane. */
   return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO8x16 );
}
   23800 
   23801 
static IRTemp math_VPUNPCKLWD_YMM ( IRTemp tL, IRTemp tR )
{
   /* VPUNPCKLWD: interleave low-half 16-bit words, per 128-bit lane. */
   return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO16x8 );
}
   23806 
   23807 
static IRTemp math_VPUNPCKLDQ_YMM ( IRTemp tL, IRTemp tR )
{
   /* VPUNPCKLDQ: interleave low-half 32-bit dwords, per 128-bit lane. */
   return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO32x4 );
}
   23812 
   23813 
static IRTemp math_VPUNPCKLQDQ_YMM ( IRTemp tL, IRTemp tR )
{
   /* VPUNPCKLQDQ: interleave low 64-bit qwords, per 128-bit lane. */
   return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO64x2 );
}
   23818 
   23819 
static IRTemp math_VPUNPCKHBW_YMM ( IRTemp tL, IRTemp tR )
{
   /* VPUNPCKHBW: interleave high-half bytes, per 128-bit lane. */
   return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI8x16 );
}
   23824 
   23825 
static IRTemp math_VPUNPCKHWD_YMM ( IRTemp tL, IRTemp tR )
{
   /* VPUNPCKHWD: interleave high-half 16-bit words, per 128-bit lane. */
   return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI16x8 );
}
   23830 
   23831 
static IRTemp math_VPUNPCKHDQ_YMM ( IRTemp tL, IRTemp tR )
{
   /* VPUNPCKHDQ: interleave high-half 32-bit dwords, per 128-bit lane. */
   return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI32x4 );
}
   23836 
   23837 
static IRTemp math_VPUNPCKHQDQ_YMM ( IRTemp tL, IRTemp tR )
{
   /* VPUNPCKHQDQ: interleave high 64-bit qwords, per 128-bit lane. */
   return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI64x2 );
}
   23842 
   23843 
static IRTemp math_VPACKSSWB_YMM ( IRTemp tL, IRTemp tR )
{
   /* VPACKSSWB: signed-saturating narrow 16->8 bits, per 128-bit lane.
      Reuses the VPUNPCK helper, which works for any lane-wise binop. */
   return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin16Sto8Sx16 );
}
   23848 
   23849 
static IRTemp math_VPACKUSWB_YMM ( IRTemp tL, IRTemp tR )
{
   /* VPACKUSWB: signed 16-bit -> unsigned-saturated 8-bit narrow,
      per 128-bit lane (via the generic VPUNPCK helper). */
   return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin16Sto8Ux16 );
}
   23854 
   23855 
static IRTemp math_VPACKSSDW_YMM ( IRTemp tL, IRTemp tR )
{
   /* VPACKSSDW: signed-saturating narrow 32->16 bits, per 128-bit lane
      (via the generic VPUNPCK helper). */
   return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin32Sto16Sx8 );
}
   23860 
   23861 
static IRTemp math_VPACKUSDW_YMM ( IRTemp tL, IRTemp tR )
{
   /* VPACKUSDW: signed 32-bit -> unsigned-saturated 16-bit narrow,
      per 128-bit lane (via the generic VPUNPCK helper). */
   return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin32Sto16Ux8 );
}
   23866 
   23867 
   23868 __attribute__((noinline))
   23869 static
   23870 Long dis_ESC_0F__VEX (
   23871         /*MB_OUT*/DisResult* dres,
   23872         /*OUT*/   Bool*      uses_vvvv,
   23873         Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
   23874         Bool         resteerCisOk,
   23875         void*        callback_opaque,
   23876         const VexArchInfo* archinfo,
   23877         const VexAbiInfo*  vbi,
   23878         Prefix pfx, Int sz, Long deltaIN
   23879      )
   23880 {
   23881    IRTemp addr  = IRTemp_INVALID;
   23882    Int    alen  = 0;
   23883    HChar  dis_buf[50];
   23884    Long   delta = deltaIN;
   23885    UChar  opc   = getUChar(delta);
   23886    delta++;
   23887    *uses_vvvv = False;
   23888 
   23889    switch (opc) {
   23890 
   23891    case 0x10:
   23892       /* VMOVSD m64, xmm1 = VEX.LIG.F2.0F.WIG 10 /r */
   23893       /* Move 64 bits from E (mem only) to G (lo half xmm).
   23894          Bits 255-64 of the dest are zeroed out. */
   23895       if (haveF2no66noF3(pfx) && !epartIsReg(getUChar(delta))) {
   23896          UChar modrm = getUChar(delta);
   23897          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23898          UInt   rG   = gregOfRexRM(pfx,modrm);
   23899          IRTemp z128 = newTemp(Ity_V128);
   23900          assign(z128, mkV128(0));
   23901          putXMMReg( rG, mkexpr(z128) );
   23902          /* FIXME: ALIGNMENT CHECK? */
   23903          putXMMRegLane64( rG, 0, loadLE(Ity_I64, mkexpr(addr)) );
   23904          putYMMRegLane128( rG, 1, mkexpr(z128) );
   23905          DIP("vmovsd %s,%s\n", dis_buf, nameXMMReg(rG));
   23906          delta += alen;
   23907          goto decode_success;
   23908       }
   23909       /* VMOVSD xmm3, xmm2, xmm1 = VEX.LIG.F2.0F.WIG 10 /r */
   23910       /* Reg form. */
   23911       if (haveF2no66noF3(pfx) && epartIsReg(getUChar(delta))) {
   23912          UChar modrm = getUChar(delta);
   23913          UInt  rG    = gregOfRexRM(pfx, modrm);
   23914          UInt  rE    = eregOfRexRM(pfx, modrm);
   23915          UInt  rV    = getVexNvvvv(pfx);
   23916          delta++;
   23917          DIP("vmovsd %s,%s,%s\n",
   23918              nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   23919          IRTemp res = newTemp(Ity_V128);
   23920          assign(res, binop(Iop_64HLtoV128,
   23921                            getXMMRegLane64(rV, 1),
   23922                            getXMMRegLane64(rE, 0)));
   23923          putYMMRegLoAndZU(rG, mkexpr(res));
   23924          *uses_vvvv = True;
   23925          goto decode_success;
   23926       }
   23927       /* VMOVSS m32, xmm1 = VEX.LIG.F3.0F.WIG 10 /r */
   23928       /* Move 32 bits from E (mem only) to G (lo half xmm).
   23929          Bits 255-32 of the dest are zeroed out. */
   23930       if (haveF3no66noF2(pfx) && !epartIsReg(getUChar(delta))) {
   23931          UChar modrm = getUChar(delta);
   23932          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23933          UInt   rG   = gregOfRexRM(pfx,modrm);
   23934          IRTemp z128 = newTemp(Ity_V128);
   23935          assign(z128, mkV128(0));
   23936          putXMMReg( rG, mkexpr(z128) );
   23937          /* FIXME: ALIGNMENT CHECK? */
   23938          putXMMRegLane32( rG, 0, loadLE(Ity_I32, mkexpr(addr)) );
   23939          putYMMRegLane128( rG, 1, mkexpr(z128) );
   23940          DIP("vmovss %s,%s\n", dis_buf, nameXMMReg(rG));
   23941          delta += alen;
   23942          goto decode_success;
   23943       }
   23944       /* VMOVSS xmm3, xmm2, xmm1 = VEX.LIG.F3.0F.WIG 10 /r */
   23945       /* Reg form. */
   23946       if (haveF3no66noF2(pfx) && epartIsReg(getUChar(delta))) {
   23947          UChar modrm = getUChar(delta);
   23948          UInt  rG    = gregOfRexRM(pfx, modrm);
   23949          UInt  rE    = eregOfRexRM(pfx, modrm);
   23950          UInt  rV    = getVexNvvvv(pfx);
   23951          delta++;
   23952          DIP("vmovss %s,%s,%s\n",
   23953              nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   23954          IRTemp res = newTemp(Ity_V128);
   23955          assign( res, binop( Iop_64HLtoV128,
   23956                              getXMMRegLane64(rV, 1),
   23957                              binop(Iop_32HLto64,
   23958                                    getXMMRegLane32(rV, 1),
   23959                                    getXMMRegLane32(rE, 0)) ) );
   23960          putYMMRegLoAndZU(rG, mkexpr(res));
   23961          *uses_vvvv = True;
   23962          goto decode_success;
   23963       }
   23964       /* VMOVUPD xmm2/m128, xmm1 = VEX.128.66.0F.WIG 10 /r */
   23965       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   23966          UChar modrm = getUChar(delta);
   23967          UInt  rG    = gregOfRexRM(pfx, modrm);
   23968          if (epartIsReg(modrm)) {
   23969             UInt rE = eregOfRexRM(pfx,modrm);
   23970             putYMMRegLoAndZU( rG, getXMMReg( rE ));
   23971             DIP("vmovupd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   23972             delta += 1;
   23973          } else {
   23974             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23975             putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
   23976             DIP("vmovupd %s,%s\n", dis_buf, nameXMMReg(rG));
   23977             delta += alen;
   23978          }
   23979          goto decode_success;
   23980       }
   23981       /* VMOVUPD ymm2/m256, ymm1 = VEX.256.66.0F.WIG 10 /r */
   23982       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   23983          UChar modrm = getUChar(delta);
   23984          UInt  rG    = gregOfRexRM(pfx, modrm);
   23985          if (epartIsReg(modrm)) {
   23986             UInt rE = eregOfRexRM(pfx,modrm);
   23987             putYMMReg( rG, getYMMReg( rE ));
   23988             DIP("vmovupd %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
   23989             delta += 1;
   23990          } else {
   23991             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23992             putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
   23993             DIP("vmovupd %s,%s\n", dis_buf, nameYMMReg(rG));
   23994             delta += alen;
   23995          }
   23996          goto decode_success;
   23997       }
   23998       /* VMOVUPS xmm2/m128, xmm1 = VEX.128.0F.WIG 10 /r */
   23999       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24000          UChar modrm = getUChar(delta);
   24001          UInt  rG    = gregOfRexRM(pfx, modrm);
   24002          if (epartIsReg(modrm)) {
   24003             UInt rE = eregOfRexRM(pfx,modrm);
   24004             putYMMRegLoAndZU( rG, getXMMReg( rE ));
   24005             DIP("vmovups %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   24006             delta += 1;
   24007          } else {
   24008             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24009             putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
   24010             DIP("vmovups %s,%s\n", dis_buf, nameXMMReg(rG));
   24011             delta += alen;
   24012          }
   24013          goto decode_success;
   24014       }
   24015       /* VMOVUPS ymm2/m256, ymm1 = VEX.256.0F.WIG 10 /r */
   24016       if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24017          UChar modrm = getUChar(delta);
   24018          UInt  rG    = gregOfRexRM(pfx, modrm);
   24019          if (epartIsReg(modrm)) {
   24020             UInt rE = eregOfRexRM(pfx,modrm);
   24021             putYMMReg( rG, getYMMReg( rE ));
   24022             DIP("vmovups %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
   24023             delta += 1;
   24024          } else {
   24025             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24026             putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
   24027             DIP("vmovups %s,%s\n", dis_buf, nameYMMReg(rG));
   24028             delta += alen;
   24029          }
   24030          goto decode_success;
   24031       }
   24032       break;
   24033 
   24034    case 0x11:
   24035       /* VMOVSD xmm1, m64 = VEX.LIG.F2.0F.WIG 11 /r */
   24036       /* Move 64 bits from G (low half xmm) to mem only. */
   24037       if (haveF2no66noF3(pfx) && !epartIsReg(getUChar(delta))) {
   24038          UChar modrm = getUChar(delta);
   24039          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24040          UInt   rG   = gregOfRexRM(pfx,modrm);
   24041          /* FIXME: ALIGNMENT CHECK? */
   24042          storeLE( mkexpr(addr), getXMMRegLane64(rG, 0));
   24043          DIP("vmovsd %s,%s\n", nameXMMReg(rG), dis_buf);
   24044          delta += alen;
   24045          goto decode_success;
   24046       }
   24047       /* VMOVSD xmm3, xmm2, xmm1 = VEX.LIG.F2.0F.WIG 11 /r */
   24048       /* Reg form. */
   24049       if (haveF2no66noF3(pfx) && epartIsReg(getUChar(delta))) {
   24050          UChar modrm = getUChar(delta);
   24051          UInt  rG    = gregOfRexRM(pfx, modrm);
   24052          UInt  rE    = eregOfRexRM(pfx, modrm);
   24053          UInt  rV    = getVexNvvvv(pfx);
   24054          delta++;
   24055          DIP("vmovsd %s,%s,%s\n",
   24056              nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   24057          IRTemp res = newTemp(Ity_V128);
   24058          assign(res, binop(Iop_64HLtoV128,
   24059                            getXMMRegLane64(rV, 1),
   24060                            getXMMRegLane64(rE, 0)));
   24061          putYMMRegLoAndZU(rG, mkexpr(res));
   24062          *uses_vvvv = True;
   24063          goto decode_success;
   24064       }
   24065       /* VMOVSS xmm1, m64 = VEX.LIG.F3.0F.WIG 11 /r */
   24066       /* Move 32 bits from G (low 1/4 xmm) to mem only. */
   24067       if (haveF3no66noF2(pfx) && !epartIsReg(getUChar(delta))) {
   24068          UChar modrm = getUChar(delta);
   24069          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24070          UInt   rG   = gregOfRexRM(pfx,modrm);
   24071          /* FIXME: ALIGNMENT CHECK? */
   24072          storeLE( mkexpr(addr), getXMMRegLane32(rG, 0));
   24073          DIP("vmovss %s,%s\n", nameXMMReg(rG), dis_buf);
   24074          delta += alen;
   24075          goto decode_success;
   24076       }
   24077       /* VMOVSS xmm3, xmm2, xmm1 = VEX.LIG.F3.0F.WIG 11 /r */
   24078       /* Reg form. */
   24079       if (haveF3no66noF2(pfx) && epartIsReg(getUChar(delta))) {
   24080          UChar modrm = getUChar(delta);
   24081          UInt  rG    = gregOfRexRM(pfx, modrm);
   24082          UInt  rE    = eregOfRexRM(pfx, modrm);
   24083          UInt  rV    = getVexNvvvv(pfx);
   24084          delta++;
   24085          DIP("vmovss %s,%s,%s\n",
   24086              nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   24087          IRTemp res = newTemp(Ity_V128);
   24088          assign( res, binop( Iop_64HLtoV128,
   24089                              getXMMRegLane64(rV, 1),
   24090                              binop(Iop_32HLto64,
   24091                                    getXMMRegLane32(rV, 1),
   24092                                    getXMMRegLane32(rE, 0)) ) );
   24093          putYMMRegLoAndZU(rG, mkexpr(res));
   24094          *uses_vvvv = True;
   24095          goto decode_success;
   24096       }
   24097       /* VMOVUPD xmm1, xmm2/m128 = VEX.128.66.0F.WIG 11 /r */
   24098       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24099          UChar modrm = getUChar(delta);
   24100          UInt  rG    = gregOfRexRM(pfx,modrm);
   24101          if (epartIsReg(modrm)) {
   24102             UInt rE = eregOfRexRM(pfx,modrm);
   24103             putYMMRegLoAndZU( rE, getXMMReg(rG) );
   24104             DIP("vmovupd %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
   24105             delta += 1;
   24106          } else {
   24107             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24108             storeLE( mkexpr(addr), getXMMReg(rG) );
   24109             DIP("vmovupd %s,%s\n", nameXMMReg(rG), dis_buf);
   24110             delta += alen;
   24111          }
   24112          goto decode_success;
   24113       }
   24114       /* VMOVUPD ymm1, ymm2/m256 = VEX.256.66.0F.WIG 11 /r */
   24115       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24116          UChar modrm = getUChar(delta);
   24117          UInt  rG    = gregOfRexRM(pfx,modrm);
   24118          if (epartIsReg(modrm)) {
   24119             UInt rE = eregOfRexRM(pfx,modrm);
   24120             putYMMReg( rE, getYMMReg(rG) );
   24121             DIP("vmovupd %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
   24122             delta += 1;
   24123          } else {
   24124             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24125             storeLE( mkexpr(addr), getYMMReg(rG) );
   24126             DIP("vmovupd %s,%s\n", nameYMMReg(rG), dis_buf);
   24127             delta += alen;
   24128          }
   24129          goto decode_success;
   24130       }
   24131       /* VMOVUPS xmm1, xmm2/m128 = VEX.128.0F.WIG 11 /r */
   24132       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24133          UChar modrm = getUChar(delta);
   24134          UInt  rG    = gregOfRexRM(pfx,modrm);
   24135          if (epartIsReg(modrm)) {
   24136             UInt rE = eregOfRexRM(pfx,modrm);
   24137             putYMMRegLoAndZU( rE, getXMMReg(rG) );
   24138             DIP("vmovups %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
   24139             delta += 1;
   24140          } else {
   24141             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24142             storeLE( mkexpr(addr), getXMMReg(rG) );
   24143             DIP("vmovups %s,%s\n", nameXMMReg(rG), dis_buf);
   24144             delta += alen;
   24145          }
   24146          goto decode_success;
   24147       }
   24148       /* VMOVUPS ymm1, ymm2/m256 = VEX.256.0F.WIG 11 /r */
   24149       if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24150          UChar modrm = getUChar(delta);
   24151          UInt  rG    = gregOfRexRM(pfx,modrm);
   24152          if (epartIsReg(modrm)) {
   24153             UInt rE = eregOfRexRM(pfx,modrm);
   24154             putYMMReg( rE, getYMMReg(rG) );
   24155             DIP("vmovups %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
   24156             delta += 1;
   24157          } else {
   24158             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24159             storeLE( mkexpr(addr), getYMMReg(rG) );
   24160             DIP("vmovups %s,%s\n", nameYMMReg(rG), dis_buf);
   24161             delta += alen;
   24162          }
   24163          goto decode_success;
   24164       }
   24165       break;
   24166 
   24167    case 0x12:
   24168       /* VMOVDDUP xmm2/m64, xmm1 = VEX.128.F2.0F.WIG /12 r */
   24169       if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24170          delta = dis_MOVDDUP_128( vbi, pfx, delta, True/*isAvx*/ );
   24171          goto decode_success;
   24172       }
   24173       /* VMOVDDUP ymm2/m256, ymm1 = VEX.256.F2.0F.WIG /12 r */
   24174       if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24175          delta = dis_MOVDDUP_256( vbi, pfx, delta );
   24176          goto decode_success;
   24177       }
   24178       /* VMOVHLPS xmm3, xmm2, xmm1 = VEX.NDS.128.0F.WIG 12 /r */
   24179       /* Insn only exists in reg form */
   24180       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
   24181           && epartIsReg(getUChar(delta))) {
   24182          UChar modrm = getUChar(delta);
   24183          UInt  rG    = gregOfRexRM(pfx, modrm);
   24184          UInt  rE    = eregOfRexRM(pfx, modrm);
   24185          UInt  rV    = getVexNvvvv(pfx);
   24186          delta++;
   24187          DIP("vmovhlps %s,%s,%s\n",
   24188              nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   24189          IRTemp res = newTemp(Ity_V128);
   24190          assign(res, binop(Iop_64HLtoV128,
   24191                            getXMMRegLane64(rV, 1),
   24192                            getXMMRegLane64(rE, 1)));
   24193          putYMMRegLoAndZU(rG, mkexpr(res));
   24194          *uses_vvvv = True;
   24195          goto decode_success;
   24196       }
   24197       /* VMOVLPS m64, xmm1, xmm2 = VEX.NDS.128.0F.WIG 12 /r */
   24198       /* Insn exists only in mem form, it appears. */
   24199       /* VMOVLPD m64, xmm1, xmm2 = VEX.NDS.128.66.0F.WIG 12 /r */
   24200       /* Insn exists only in mem form, it appears. */
   24201       if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
   24202           && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
   24203          UChar modrm = getUChar(delta);
   24204          UInt  rG    = gregOfRexRM(pfx, modrm);
   24205          UInt  rV    = getVexNvvvv(pfx);
   24206          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24207          delta += alen;
   24208          DIP("vmovlpd %s,%s,%s\n",
   24209              dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   24210          IRTemp res = newTemp(Ity_V128);
   24211          assign(res, binop(Iop_64HLtoV128,
   24212                            getXMMRegLane64(rV, 1),
   24213                            loadLE(Ity_I64, mkexpr(addr))));
   24214          putYMMRegLoAndZU(rG, mkexpr(res));
   24215          *uses_vvvv = True;
   24216          goto decode_success;
   24217       }
   24218       /* VMOVSLDUP xmm2/m128, xmm1 = VEX.NDS.128.F3.0F.WIG 12 /r */
   24219       if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
   24220          delta = dis_MOVSxDUP_128( vbi, pfx, delta, True/*isAvx*/,
   24221                                    True/*isL*/ );
   24222          goto decode_success;
   24223       }
   24224       /* VMOVSLDUP ymm2/m256, ymm1 = VEX.NDS.256.F3.0F.WIG 12 /r */
   24225       if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
   24226          delta = dis_MOVSxDUP_256( vbi, pfx, delta, True/*isL*/ );
   24227          goto decode_success;
   24228       }
   24229       break;
   24230 
   24231    case 0x13:
   24232       /* VMOVLPS xmm1, m64 = VEX.128.0F.WIG 13 /r */
   24233       /* Insn exists only in mem form, it appears. */
   24234       /* VMOVLPD xmm1, m64 = VEX.128.66.0F.WIG 13 /r */
   24235       /* Insn exists only in mem form, it appears. */
   24236       if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
   24237           && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
   24238          UChar modrm = getUChar(delta);
   24239          UInt  rG    = gregOfRexRM(pfx, modrm);
   24240          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24241          delta += alen;
   24242          storeLE( mkexpr(addr), getXMMRegLane64( rG, 0));
   24243          DIP("vmovlpd %s,%s\n", nameXMMReg(rG), dis_buf);
   24244          goto decode_success;
   24245       }
   24246       break;
   24247 
   case 0x14:
   case 0x15:
      /* 0x14 = VUNPCKL{PS,PD}, 0x15 = VUNPCKH{PS,PD}.  The four arms
         below are selected by the mandatory prefix (none => PS,
         66 => PD) and by VEX.L (0 => 128 bit, 1 => 256 bit). */
      /* VUNPCKLPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 14 /r */
      /* VUNPCKHPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 15 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         /* opc == 0x15 selects the high-unpack variant */
         Bool   hi    = opc == 0x15;
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx,modrm);
         UInt   rV    = getVexNvvvv(pfx);   /* 2nd src, from VEX.vvvv */
         IRTemp eV    = newTemp(Ity_V128);
         IRTemp vV    = newTemp(Ity_V128);
         assign( vV, getXMMReg(rV) );
         /* E operand: register or memory, per the ModRM byte */
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getXMMReg(rE) );
            delta += 1;
            DIP("vunpck%sps %s,%s\n", hi ? "h" : "l",
                nameXMMReg(rE), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
            delta += alen;
            DIP("vunpck%sps %s,%s\n", hi ? "h" : "l",
                dis_buf, nameXMMReg(rG));
         }
         IRTemp res = math_UNPCKxPS_128( eV, vV, hi );
         /* "LoAndZU": write the low 128-bit lane and zero the upper */
         putYMMRegLoAndZU( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VUNPCKLPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 14 /r */
      /* VUNPCKHPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 15 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         Bool   hi    = opc == 0x15;
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx,modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp eV    = newTemp(Ity_V256);
         IRTemp vV    = newTemp(Ity_V256);
         assign( vV, getYMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getYMMReg(rE) );
            delta += 1;
            DIP("vunpck%sps %s,%s\n", hi ? "h" : "l",
                nameYMMReg(rE), nameYMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
            delta += alen;
            DIP("vunpck%sps %s,%s\n", hi ? "h" : "l",
                dis_buf, nameYMMReg(rG));
         }
         IRTemp res = math_UNPCKxPS_256( eV, vV, hi );
         putYMMReg( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VUNPCKLPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 14 /r */
      /* VUNPCKHPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 15 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Bool   hi    = opc == 0x15;
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx,modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp eV    = newTemp(Ity_V128);
         IRTemp vV    = newTemp(Ity_V128);
         assign( vV, getXMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getXMMReg(rE) );
            delta += 1;
            DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
                nameXMMReg(rE), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
            delta += alen;
            DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
                dis_buf, nameXMMReg(rG));
         }
         IRTemp res = math_UNPCKxPD_128( eV, vV, hi );
         putYMMRegLoAndZU( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VUNPCKLPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 14 /r */
      /* VUNPCKHPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 15 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         Bool   hi    = opc == 0x15;
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx,modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp eV    = newTemp(Ity_V256);
         IRTemp vV    = newTemp(Ity_V256);
         assign( vV, getYMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getYMMReg(rE) );
            delta += 1;
            DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
                nameYMMReg(rE), nameYMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
            delta += alen;
            DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
                dis_buf, nameYMMReg(rG));
         }
         IRTemp res = math_UNPCKxPD_256( eV, vV, hi );
         putYMMReg( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   24363 
   case 0x16:
      /* 0x16 decodes to one of VMOVLHPS (reg form, no prefix),
         VMOVHPS/VMOVHPD (mem form) or VMOVSHDUP (F3 prefix),
         disambiguated by prefix, VEX.L and the ModRM mod field. */
      /* VMOVLHPS xmm3, xmm2, xmm1 = VEX.NDS.128.0F.WIG 16 /r */
      /* Insn only exists in reg form */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         UInt  rE    = eregOfRexRM(pfx, modrm);
         UInt  rV    = getVexNvvvv(pfx);
         delta++;
         DIP("vmovlhps %s,%s,%s\n",
             nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
         IRTemp res = newTemp(Ity_V128);
         /* Iop_64HLtoV128(hi,lo): rE's low qword becomes the result's
            high half, rV's low qword the low half. */
         assign(res, binop(Iop_64HLtoV128,
                           getXMMRegLane64(rE, 0),
                           getXMMRegLane64(rV, 0)));
         putYMMRegLoAndZU(rG, mkexpr(res));
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VMOVHPS m64, xmm1, xmm2 = VEX.NDS.128.0F.WIG 16 /r */
      /* Insn exists only in mem form, it appears. */
      /* VMOVHPD m64, xmm1, xmm2 = VEX.NDS.128.66.0F.WIG 16 /r */
      /* Insn exists only in mem form, it appears. */
      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
          && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         UInt  rV    = getVexNvvvv(pfx);
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         /* the 66 prefix picks the 'd' (PD) spelling for printing */
         DIP("vmovhp%c %s,%s,%s\n", have66(pfx) ? 'd' : 's',
             dis_buf, nameXMMReg(rV), nameXMMReg(rG));
         IRTemp res = newTemp(Ity_V128);
         /* loaded qword goes to the high half, rV's low qword is kept */
         assign(res, binop(Iop_64HLtoV128,
                           loadLE(Ity_I64, mkexpr(addr)),
                           getXMMRegLane64(rV, 0)));
         putYMMRegLoAndZU(rG, mkexpr(res));
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VMOVSHDUP xmm2/m128, xmm1 = VEX.NDS.128.F3.0F.WIG 16 /r */
      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_MOVSxDUP_128( vbi, pfx, delta, True/*isAvx*/,
                                   False/*!isL*/ );
         goto decode_success;
      }
      /* VMOVSHDUP ymm2/m256, ymm1 = VEX.NDS.256.F3.0F.WIG 16 /r */
      if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_MOVSxDUP_256( vbi, pfx, delta, False/*!isL*/ );
         goto decode_success;
      }
      break;
   24417 
   case 0x17:
      /* 0x17: store the high qword of an XMM register to memory.
         Mnemonic (PS vs PD) differs only in the printed name; the
         generated IR is identical. */
      /* VMOVHPS xmm1, m64 = VEX.128.0F.WIG 17 /r */
      /* Insn exists only in mem form, it appears. */
      /* VMOVHPD xmm1, m64 = VEX.128.66.0F.WIG 17 /r */
      /* Insn exists only in mem form, it appears. */
      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
          && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         /* lane 1 == bits 127:64, i.e. the high qword */
         storeLE( mkexpr(addr), getXMMRegLane64( rG, 1));
         DIP("vmovhp%c %s,%s\n", have66(pfx) ? 'd' : 's',
             nameXMMReg(rG), dis_buf);
         goto decode_success;
      }
      break;
   24435 
   case 0x28:
      /* 0x28: aligned packed moves, reg/mem -> reg.  Arms selected by
         the mandatory prefix (66 => PD, none => PS) and VEX.L
         (128/256).  Memory forms fault on misaligned addresses. */
      /* VMOVAPD xmm2/m128, xmm1 = VEX.128.66.0F.WIG 28 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            /* write low 128 bits, zero the upper YMM lane */
            putYMMRegLoAndZU( rG, getXMMReg( rE ));
            DIP("vmovapd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
            DIP("vmovapd %s,%s\n", dis_buf, nameXMMReg(rG));
            delta += alen;
         }
         goto decode_success;
      }
      /* VMOVAPD ymm2/m256, ymm1 = VEX.256.66.0F.WIG 28 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            putYMMReg( rG, getYMMReg( rE ));
            DIP("vmovapd %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            /* 256-bit form requires 32-byte alignment */
            gen_SEGV_if_not_32_aligned( addr );
            putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
            DIP("vmovapd %s,%s\n", dis_buf, nameYMMReg(rG));
            delta += alen;
         }
         goto decode_success;
      }
      /* VMOVAPS xmm2/m128, xmm1 = VEX.128.0F.WIG 28 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            putYMMRegLoAndZU( rG, getXMMReg( rE ));
            DIP("vmovaps %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
            DIP("vmovaps %s,%s\n", dis_buf, nameXMMReg(rG));
            delta += alen;
         }
         goto decode_success;
      }
      /* VMOVAPS ymm2/m256, ymm1 = VEX.256.0F.WIG 28 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            putYMMReg( rG, getYMMReg( rE ));
            DIP("vmovaps %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_32_aligned( addr );
            putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
            DIP("vmovaps %s,%s\n", dis_buf, nameYMMReg(rG));
            delta += alen;
         }
         goto decode_success;
      }
      break;
   24510 
   24511    case 0x29:
   24512       /* VMOVAPD xmm1, xmm2/m128 = VEX.128.66.0F.WIG 29 /r */
   24513       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24514          UChar modrm = getUChar(delta);
   24515          UInt  rG    = gregOfRexRM(pfx,modrm);
   24516          if (epartIsReg(modrm)) {
   24517             UInt rE = eregOfRexRM(pfx,modrm);
   24518             putYMMRegLoAndZU( rE, getXMMReg(rG) );
   24519             DIP("vmovapd %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
   24520             delta += 1;
   24521          } else {
   24522             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24523             gen_SEGV_if_not_16_aligned( addr );
   24524             storeLE( mkexpr(addr), getXMMReg(rG) );
   24525             DIP("vmovapd %s,%s\n", nameXMMReg(rG), dis_buf );
   24526             delta += alen;
   24527          }
   24528          goto decode_success;
   24529       }
   24530       /* VMOVAPD ymm1, ymm2/m256 = VEX.256.66.0F.WIG 29 /r */
   24531       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24532          UChar modrm = getUChar(delta);
   24533          UInt  rG    = gregOfRexRM(pfx,modrm);
   24534          if (epartIsReg(modrm)) {
   24535             UInt rE = eregOfRexRM(pfx,modrm);
   24536             putYMMReg( rE, getYMMReg(rG) );
   24537             DIP("vmovapd %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
   24538             delta += 1;
   24539          } else {
   24540             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24541             gen_SEGV_if_not_32_aligned( addr );
   24542             storeLE( mkexpr(addr), getYMMReg(rG) );
   24543             DIP("vmovapd %s,%s\n", nameYMMReg(rG), dis_buf );
   24544             delta += alen;
   24545          }
   24546          goto decode_success;
   24547       }
   24548       /* VMOVAPS xmm1, xmm2/m128 = VEX.128.0F.WIG 29 /r */
   24549       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24550          UChar modrm = getUChar(delta);
   24551          UInt  rG    = gregOfRexRM(pfx,modrm);
   24552          if (epartIsReg(modrm)) {
   24553             UInt rE = eregOfRexRM(pfx,modrm);
   24554             putYMMRegLoAndZU( rE, getXMMReg(rG) );
   24555             DIP("vmovaps %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
   24556             delta += 1;
   24557             goto decode_success;
   24558          } else {
   24559             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24560             gen_SEGV_if_not_16_aligned( addr );
   24561             storeLE( mkexpr(addr), getXMMReg(rG) );
   24562             DIP("vmovaps %s,%s\n", nameXMMReg(rG), dis_buf );
   24563             delta += alen;
   24564             goto decode_success;
   24565          }
   24566       }
   24567       /* VMOVAPS ymm1, ymm2/m256 = VEX.256.0F.WIG 29 /r */
   24568       if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24569          UChar modrm = getUChar(delta);
   24570          UInt  rG    = gregOfRexRM(pfx,modrm);
   24571          if (epartIsReg(modrm)) {
   24572             UInt rE = eregOfRexRM(pfx,modrm);
   24573             putYMMReg( rE, getYMMReg(rG) );
   24574             DIP("vmovaps %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
   24575             delta += 1;
   24576             goto decode_success;
   24577          } else {
   24578             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24579             gen_SEGV_if_not_32_aligned( addr );
   24580             storeLE( mkexpr(addr), getYMMReg(rG) );
   24581             DIP("vmovaps %s,%s\n", nameYMMReg(rG), dis_buf );
   24582             delta += alen;
   24583             goto decode_success;
   24584          }
   24585       }
   24586       break;
   24587 
   24588    case 0x2A: {
   24589       IRTemp rmode = newTemp(Ity_I32);
   24590       assign( rmode, get_sse_roundingmode() );
   24591       /* VCVTSI2SD r/m32, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.W0 2A /r */
   24592       if (haveF2no66noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
   24593          UChar  modrm = getUChar(delta);
   24594          UInt   rV    = getVexNvvvv(pfx);
   24595          UInt   rD    = gregOfRexRM(pfx, modrm);
   24596          IRTemp arg32 = newTemp(Ity_I32);
   24597          if (epartIsReg(modrm)) {
   24598             UInt rS = eregOfRexRM(pfx,modrm);
   24599             assign( arg32, getIReg32(rS) );
   24600             delta += 1;
   24601             DIP("vcvtsi2sdl %s,%s,%s\n",
   24602                 nameIReg32(rS), nameXMMReg(rV), nameXMMReg(rD));
   24603          } else {
   24604             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24605             assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
   24606             delta += alen;
   24607             DIP("vcvtsi2sdl %s,%s,%s\n",
   24608                 dis_buf, nameXMMReg(rV), nameXMMReg(rD));
   24609          }
   24610          putXMMRegLane64F( rD, 0,
   24611                            unop(Iop_I32StoF64, mkexpr(arg32)));
   24612          putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
   24613          putYMMRegLane128( rD, 1, mkV128(0) );
   24614          *uses_vvvv = True;
   24615          goto decode_success;
   24616       }
   24617       /* VCVTSI2SD r/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.W1 2A /r */
   24618       if (haveF2no66noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
   24619          UChar  modrm = getUChar(delta);
   24620          UInt   rV    = getVexNvvvv(pfx);
   24621          UInt   rD    = gregOfRexRM(pfx, modrm);
   24622          IRTemp arg64 = newTemp(Ity_I64);
   24623          if (epartIsReg(modrm)) {
   24624             UInt rS = eregOfRexRM(pfx,modrm);
   24625             assign( arg64, getIReg64(rS) );
   24626             delta += 1;
   24627             DIP("vcvtsi2sdq %s,%s,%s\n",
   24628                 nameIReg64(rS), nameXMMReg(rV), nameXMMReg(rD));
   24629          } else {
   24630             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24631             assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   24632             delta += alen;
   24633             DIP("vcvtsi2sdq %s,%s,%s\n",
   24634                 dis_buf, nameXMMReg(rV), nameXMMReg(rD));
   24635          }
   24636          putXMMRegLane64F( rD, 0,
   24637                            binop( Iop_I64StoF64,
   24638                                   get_sse_roundingmode(),
   24639                                   mkexpr(arg64)) );
   24640          putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
   24641          putYMMRegLane128( rD, 1, mkV128(0) );
   24642          *uses_vvvv = True;
   24643          goto decode_success;
   24644       }
   24645       /* VCVTSI2SS r/m64, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.W1 2A /r */
   24646       if (haveF3no66noF2(pfx) && 1==getRexW(pfx)/*W1*/) {
   24647          UChar  modrm = getUChar(delta);
   24648          UInt   rV    = getVexNvvvv(pfx);
   24649          UInt   rD    = gregOfRexRM(pfx, modrm);
   24650          IRTemp arg64 = newTemp(Ity_I64);
   24651          if (epartIsReg(modrm)) {
   24652             UInt rS = eregOfRexRM(pfx,modrm);
   24653             assign( arg64, getIReg64(rS) );
   24654             delta += 1;
   24655             DIP("vcvtsi2ssq %s,%s,%s\n",
   24656                 nameIReg64(rS), nameXMMReg(rV), nameXMMReg(rD));
   24657          } else {
   24658             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24659             assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   24660             delta += alen;
   24661             DIP("vcvtsi2ssq %s,%s,%s\n",
   24662                 dis_buf, nameXMMReg(rV), nameXMMReg(rD));
   24663          }
   24664          putXMMRegLane32F( rD, 0,
   24665                            binop(Iop_F64toF32,
   24666                                  mkexpr(rmode),
   24667                                  binop(Iop_I64StoF64, mkexpr(rmode),
   24668                                                       mkexpr(arg64)) ) );
   24669          putXMMRegLane32( rD, 1, getXMMRegLane32( rV, 1 ));
   24670          putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
   24671          putYMMRegLane128( rD, 1, mkV128(0) );
   24672          *uses_vvvv = True;
   24673          goto decode_success;
   24674       }
   24675       /* VCVTSI2SS r/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.W0 2A /r */
   24676       if (haveF3no66noF2(pfx) && 0==getRexW(pfx)/*W0*/) {
   24677          UChar  modrm = getUChar(delta);
   24678          UInt   rV    = getVexNvvvv(pfx);
   24679          UInt   rD    = gregOfRexRM(pfx, modrm);
   24680          IRTemp arg32 = newTemp(Ity_I32);
   24681          if (epartIsReg(modrm)) {
   24682             UInt rS = eregOfRexRM(pfx,modrm);
   24683             assign( arg32, getIReg32(rS) );
   24684             delta += 1;
   24685             DIP("vcvtsi2ssl %s,%s,%s\n",
   24686                 nameIReg32(rS), nameXMMReg(rV), nameXMMReg(rD));
   24687          } else {
   24688             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24689             assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
   24690             delta += alen;
   24691             DIP("vcvtsi2ssl %s,%s,%s\n",
   24692                 dis_buf, nameXMMReg(rV), nameXMMReg(rD));
   24693          }
   24694          putXMMRegLane32F( rD, 0,
   24695                            binop(Iop_F64toF32,
   24696                                  mkexpr(rmode),
   24697                                  unop(Iop_I32StoF64, mkexpr(arg32)) ) );
   24698          putXMMRegLane32( rD, 1, getXMMRegLane32( rV, 1 ));
   24699          putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
   24700          putYMMRegLane128( rD, 1, mkV128(0) );
   24701          *uses_vvvv = True;
   24702          goto decode_success;
   24703       }
   24704       break;
   24705    }
   24706 
   case 0x2B:
      /* 0x2B: VMOVNTPS/VMOVNTPD -- non-temporal aligned stores.  The
         non-temporal hint is not modelled here; the IR emitted is a
         plain aligned store that faults on misalignment. */
      /* VMOVNTPD xmm1, m128 = VEX.128.66.0F.WIG 2B /r */
      /* VMOVNTPS xmm1, m128 = VEX.128.0F.WIG 2B /r */
      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
          && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
         UChar  modrm = getUChar(delta);
         UInt   rS    = gregOfRexRM(pfx, modrm);
         IRTemp tS    = newTemp(Ity_V128);
         assign(tS, getXMMReg(rS));
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         gen_SEGV_if_not_16_aligned(addr);
         storeLE(mkexpr(addr), mkexpr(tS));
         /* the 66 prefix picks the 'd' (PD) spelling for printing */
         DIP("vmovntp%c %s,%s\n", have66(pfx) ? 'd' : 's',
             nameXMMReg(rS), dis_buf);
         goto decode_success;
      }
      /* VMOVNTPD ymm1, m256 = VEX.256.66.0F.WIG 2B /r */
      /* VMOVNTPS ymm1, m256 = VEX.256.0F.WIG 2B /r */
      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
          && 1==getVexL(pfx)/*256*/ && !epartIsReg(getUChar(delta))) {
         UChar  modrm = getUChar(delta);
         UInt   rS    = gregOfRexRM(pfx, modrm);
         IRTemp tS    = newTemp(Ity_V256);
         assign(tS, getYMMReg(rS));
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         /* 256-bit form requires 32-byte alignment */
         gen_SEGV_if_not_32_aligned(addr);
         storeLE(mkexpr(addr), mkexpr(tS));
         DIP("vmovntp%c %s,%s\n", have66(pfx) ? 'd' : 's',
             nameYMMReg(rS), dis_buf);
         goto decode_success;
      }
      break;
   24741 
   case 0x2C:
      /* 0x2C: truncating scalar FP -> signed integer conversions.
         F2 => SD source, F3 => SS source; REX.W selects the 32- or
         64-bit destination (final argument is its size in bytes).
         opc is passed through so the helpers can distinguish the
         0x2C and 0x2D variants. */
      /* VCVTTSD2SI xmm1/m32, r32 = VEX.LIG.F2.0F.W0 2C /r */
      if (haveF2no66noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
         delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
         goto decode_success;
      }
      /* VCVTTSD2SI xmm1/m64, r64 = VEX.LIG.F2.0F.W1 2C /r */
      if (haveF2no66noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
         delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
         goto decode_success;
      }
      /* VCVTTSS2SI xmm1/m32, r32 = VEX.LIG.F3.0F.W0 2C /r */
      if (haveF3no66noF2(pfx) && 0==getRexW(pfx)/*W0*/) {
         delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
         goto decode_success;
      }
      /* VCVTTSS2SI xmm1/m64, r64 = VEX.LIG.F3.0F.W1 2C /r */
      if (haveF3no66noF2(pfx) && 1==getRexW(pfx)/*W1*/) {
         delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
         goto decode_success;
      }
      break;
   24764 
   case 0x2D:
      /* 0x2D: non-truncating scalar FP -> signed integer conversions.
         Identical dispatch structure to 0x2C above; the helpers use
         opc to tell the two apart. */
      /* VCVTSD2SI xmm1/m32, r32 = VEX.LIG.F2.0F.W0 2D /r */
      if (haveF2no66noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
         delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
         goto decode_success;
      }
      /* VCVTSD2SI xmm1/m64, r64 = VEX.LIG.F2.0F.W1 2D /r */
      if (haveF2no66noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
         delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
         goto decode_success;
      }
      /* VCVTSS2SI xmm1/m32, r32 = VEX.LIG.F3.0F.W0 2D /r */
      if (haveF3no66noF2(pfx) && 0==getRexW(pfx)/*W0*/) {
         delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
         goto decode_success;
      }
      /* VCVTSS2SI xmm1/m64, r64 = VEX.LIG.F3.0F.W1 2D /r */
      if (haveF3no66noF2(pfx) && 1==getRexW(pfx)/*W1*/) {
         delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
         goto decode_success;
      }
      break;
   24787 
   case 0x2E:
   case 0x2F:
      /* Scalar FP compares: 0x2E = VUCOMISx, 0x2F = VCOMISx.  The 66
         prefix selects the D (double) form, no prefix the S form;
         opc is passed through so the helpers can distinguish the
         ordered and unordered variants. */
      /* VUCOMISD xmm2/m64, xmm1 = VEX.LIG.66.0F.WIG 2E /r */
      /* VCOMISD  xmm2/m64, xmm1 = VEX.LIG.66.0F.WIG 2F /r */
      if (have66noF2noF3(pfx)) {
         delta = dis_COMISD( vbi, pfx, delta, True/*isAvx*/, opc );
         goto decode_success;
      }
      /* VUCOMISS xmm2/m32, xmm1 = VEX.LIG.0F.WIG 2E /r */
      /* VCOMISS xmm2/m32, xmm1  = VEX.LIG.0F.WIG 2F /r */
      if (haveNo66noF2noF3(pfx)) {
         delta = dis_COMISS( vbi, pfx, delta, True/*isAvx*/, opc );
         goto decode_success;
      }
      break;
   24803 
   case 0x50:
      /* 0x50: VMOVMSKPS/VMOVMSKPD -- gather FP lane sign bits into an
         integer register.  Arms selected by mandatory prefix (66 => PD,
         none => PS) and VEX.L; all the work is in the helpers. */
      /* VMOVMSKPD xmm2, r32 = VEX.128.66.0F.WIG 50 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_MOVMSKPD_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VMOVMSKPD ymm2, r32 = VEX.256.66.0F.WIG 50 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_MOVMSKPD_256( vbi, pfx, delta );
         goto decode_success;
      }
      /* VMOVMSKPS xmm2, r32 = VEX.128.0F.WIG 50 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_MOVMSKPS_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VMOVMSKPS ymm2, r32 = VEX.256.0F.WIG 50 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_MOVMSKPS_256( vbi, pfx, delta );
         goto decode_success;
      }
      break;
   24826 
   case 0x51:
      /* 0x51: square root family.  F3 => scalar SS, F2 => scalar SD,
         no prefix => packed PS, 66 => packed PD; VEX.L picks 128 vs
         256 for the packed forms.  Each arm maps straight onto a
         generic AVX helper with the matching Iop_Sqrt* operation. */
      /* VSQRTSS xmm3/m64(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F3.0F.WIG 51 /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32_unary(
                    uses_vvvv, vbi, pfx, delta, "vsqrtss", Iop_Sqrt32F0x4 );
         goto decode_success;
      }
      /* VSQRTPS xmm2/m128(E), xmm1(G) = VEX.NDS.128.0F.WIG 51 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vsqrtps", Iop_Sqrt32Fx4 );
         goto decode_success;
      }
      /* VSQRTPS ymm2/m256(E), ymm1(G) = VEX.NDS.256.0F.WIG 51 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vsqrtps", Iop_Sqrt32Fx8 );
         goto decode_success;
      }
      /* VSQRTSD xmm3/m64(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F2.0F.WIG 51 /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64_unary(
                    uses_vvvv, vbi, pfx, delta, "vsqrtsd", Iop_Sqrt64F0x2 );
         goto decode_success;
      }
      /* VSQRTPD xmm2/m128(E), xmm1(G) = VEX.NDS.128.66.0F.WIG 51 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vsqrtpd", Iop_Sqrt64Fx2 );
         goto decode_success;
      }
      /* VSQRTPD ymm2/m256(E), ymm1(G) = VEX.NDS.256.66.0F.WIG 51 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vsqrtpd", Iop_Sqrt64Fx4 );
         goto decode_success;
      }
      break;
   24865 
   case 0x52:
      /* 0x52: reciprocal square root estimate.  Only single-precision
         variants are decoded here (no PD forms for this opcode). */
      /* VRSQRTSS xmm3/m64(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F3.0F.WIG 52 /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32_unary(
                    uses_vvvv, vbi, pfx, delta, "vrsqrtss",
                    Iop_RSqrtEst32F0x4 );
         goto decode_success;
      }
      /* VRSQRTPS xmm2/m128(E), xmm1(G) = VEX.NDS.128.0F.WIG 52 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vrsqrtps", Iop_RSqrtEst32Fx4 );
         goto decode_success;
      }
      /* VRSQRTPS ymm2/m256(E), ymm1(G) = VEX.NDS.256.0F.WIG 52 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vrsqrtps", Iop_RSqrtEst32Fx8 );
         goto decode_success;
      }
      break;
   24887 
   case 0x53:
      /* 0x53: reciprocal estimate.  As with 0x52, only the
         single-precision variants are decoded for this opcode. */
      /* VRCPSS xmm3/m64(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F3.0F.WIG 53 /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32_unary(
                    uses_vvvv, vbi, pfx, delta, "vrcpss", Iop_RecipEst32F0x4 );
         goto decode_success;
      }
      /* VRCPPS xmm2/m128(E), xmm1(G) = VEX.NDS.128.0F.WIG 53 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vrcpps", Iop_RecipEst32Fx4 );
         goto decode_success;
      }
      /* VRCPPS ymm2/m256(E), ymm1(G) = VEX.NDS.256.0F.WIG 53 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vrcpps", Iop_RecipEst32Fx8 );
         goto decode_success;
      }
      break;
   24908 
   case 0x54:
      /* 0x54: VANDPS/VANDPD -- bitwise AND of packed FP regs.  PS and
         PD forms generate identical IR (plain vector AND); only the
         printed mnemonic differs.  128-bit forms go through the NDS
         helper, 256-bit forms through the AVX256 E,V -> G helper. */
      /* VANDPD r/m, rV, r ::: r = rV & r/m */
      /* VANDPD = VEX.NDS.128.66.0F.WIG 54 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV128 );
         goto decode_success;
      }
      /* VANDPD r/m, rV, r ::: r = rV & r/m */
      /* VANDPD = VEX.NDS.256.66.0F.WIG 54 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV256 );
         goto decode_success;
      }
      /* VANDPS = VEX.NDS.128.0F.WIG 54 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV128 );
         goto decode_success;
      }
      /* VANDPS = VEX.NDS.256.0F.WIG 54 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV256 );
         goto decode_success;
      }
      break;
   24937 
   24938    case 0x55:
   24939       /* VANDNPD r/m, rV, r ::: r = (not rV) & r/m */
   24940       /* VANDNPD = VEX.NDS.128.66.0F.WIG 55 /r */
   24941       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24942          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   24943                     uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV128,
   24944                     NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
   24945          goto decode_success;
   24946       }
   24947       /* VANDNPD = VEX.NDS.256.66.0F.WIG 55 /r */
   24948       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24949          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
   24950                     uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV256,
   24951                     NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
   24952          goto decode_success;
   24953       }
   24954       /* VANDNPS = VEX.NDS.128.0F.WIG 55 /r */
   24955       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24956          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   24957                     uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV128,
   24958                     NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
   24959          goto decode_success;
   24960       }
   24961       /* VANDNPS = VEX.NDS.256.0F.WIG 55 /r */
   24962       if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24963          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
   24964                     uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV256,
   24965                     NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
   24966          goto decode_success;
   24967       }
   24968       break;
   24969 
   case 0x56:
      /* Opcode 0x56: packed-FP bitwise OR.  66 => PD form, no SIMD
         prefix => PS form; both are a plain vector OR. */
      /* VORPD r/m, rV, r ::: r = rV | r/m */
      /* VORPD = VEX.NDS.128.66.0F.WIG 56 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vorpd", Iop_OrV128 );
         goto decode_success;
      }
      /* VORPD r/m, rV, r ::: r = rV | r/m */
      /* VORPD = VEX.NDS.256.66.0F.WIG 56 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vorpd", Iop_OrV256 );
         goto decode_success;
      }
      /* VORPS r/m, rV, r ::: r = rV | r/m */
      /* VORPS = VEX.NDS.128.0F.WIG 56 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vorps", Iop_OrV128 );
         goto decode_success;
      }
      /* VORPS r/m, rV, r ::: r = rV | r/m */
      /* VORPS = VEX.NDS.256.0F.WIG 56 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vorps", Iop_OrV256 );
         goto decode_success;
      }
      break;

   case 0x57:
      /* Opcode 0x57: same prefix selection as 0x56, but bitwise XOR. */
      /* VXORPD r/m, rV, r ::: r = rV ^ r/m */
      /* VXORPD = VEX.NDS.128.66.0F.WIG 57 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vxorpd", Iop_XorV128 );
         goto decode_success;
      }
      /* VXORPD r/m, rV, r ::: r = rV ^ r/m */
      /* VXORPD = VEX.NDS.256.66.0F.WIG 57 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vxorpd", Iop_XorV256 );
         goto decode_success;
      }
      /* VXORPS r/m, rV, r ::: r = rV ^ r/m */
      /* VXORPS = VEX.NDS.128.0F.WIG 57 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vxorps", Iop_XorV128 );
         goto decode_success;
      }
      /* VXORPS r/m, rV, r ::: r = rV ^ r/m */
      /* VXORPS = VEX.NDS.256.0F.WIG 57 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vxorps", Iop_XorV256 );
         goto decode_success;
      }
      break;
   25031 
   case 0x58:
      /* Opcode 0x58: packed/scalar FP add.  The scalar forms
         (F2 => sd, F3 => ss) are LIG, so they are tested first
         without looking at VEX.L. */
      /* VADDSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 58 /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vaddsd", Iop_Add64F0x2 );
         goto decode_success;
      }
      /* VADDSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 58 /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vaddss", Iop_Add32F0x4 );
         goto decode_success;
      }
      /* VADDPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 58 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vaddps", Iop_Add32Fx4 );
         goto decode_success;
      }
      /* VADDPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 58 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vaddps", Iop_Add32Fx8 );
         goto decode_success;
      }
      /* VADDPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 58 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vaddpd", Iop_Add64Fx2 );
         goto decode_success;
      }
      /* VADDPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 58 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vaddpd", Iop_Add64Fx4 );
         goto decode_success;
      }
      break;

   case 0x59:
      /* Opcode 0x59: packed/scalar FP multiply; same prefix layout
         as 0x58. */
      /* VMULSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 59 /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vmulsd", Iop_Mul64F0x2 );
         goto decode_success;
      }
      /* VMULSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 59 /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vmulss", Iop_Mul32F0x4 );
         goto decode_success;
      }
      /* VMULPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 59 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmulps", Iop_Mul32Fx4 );
         goto decode_success;
      }
      /* VMULPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 59 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmulps", Iop_Mul32Fx8 );
         goto decode_success;
      }
      /* VMULPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 59 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmulpd", Iop_Mul64Fx2 );
         goto decode_success;
      }
      /* VMULPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 59 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmulpd", Iop_Mul64Fx4 );
         goto decode_success;
      }
      break;
   25109 
   case 0x5A:
      /* Opcode 0x5A: FP precision conversions.  The packed forms are
         handled by helpers; the scalar forms are decoded inline. */
      /* VCVTPS2PD xmm2/m64, xmm1 = VEX.128.0F.WIG 5A /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTPS2PD_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VCVTPS2PD xmm2/m128, ymm1 = VEX.256.0F.WIG 5A /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTPS2PD_256( vbi, pfx, delta );
         goto decode_success;
      }
      /* VCVTPD2PS xmm2/m128, xmm1 = VEX.128.66.0F.WIG 5A /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTPD2PS_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VCVTPD2PS ymm2/m256, xmm1 = VEX.256.66.0F.WIG 5A /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTPD2PS_256( vbi, pfx, delta );
         goto decode_success;
      }
      /* VCVTSD2SS xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5A /r */
      if (haveF2no66noF3(pfx)) {
         UChar  modrm = getUChar(delta);
         UInt   rV    = getVexNvvvv(pfx);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp f64lo = newTemp(Ity_F64);
         IRTemp rmode = newTemp(Ity_I32);
         /* F64->F32 can round, so the current SSE rounding mode is
            captured and passed to Iop_F64toF32 below. */
         assign( rmode, get_sse_roundingmode() );
         if (epartIsReg(modrm)) {
            UInt rS = eregOfRexRM(pfx,modrm);
            assign(f64lo, getXMMRegLane64F(rS, 0));
            delta += 1;
            DIP("vcvtsd2ss %s,%s,%s\n",
                nameXMMReg(rS), nameXMMReg(rV), nameXMMReg(rD));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign(f64lo, loadLE(Ity_F64, mkexpr(addr)) );
            delta += alen;
            DIP("vcvtsd2ss %s,%s,%s\n",
                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
         }
         /* Lane 0 of the result is the narrowed value; the remaining
            96 bits of the low half are copied from rV, and the upper
            128 bits of the destination YMM register are zeroed. */
         putXMMRegLane32F( rD, 0,
                           binop( Iop_F64toF32, mkexpr(rmode),
                                                mkexpr(f64lo)) );
         putXMMRegLane32( rD, 1, getXMMRegLane32( rV, 1 ));
         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
         putYMMRegLane128( rD, 1, mkV128(0) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VCVTSS2SD xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5A /r */
      if (haveF3no66noF2(pfx)) {
         UChar  modrm = getUChar(delta);
         UInt   rV    = getVexNvvvv(pfx);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp f32lo = newTemp(Ity_F32);
         if (epartIsReg(modrm)) {
            UInt rS = eregOfRexRM(pfx,modrm);
            assign(f32lo, getXMMRegLane32F(rS, 0));
            delta += 1;
            DIP("vcvtss2sd %s,%s,%s\n",
                nameXMMReg(rS), nameXMMReg(rV), nameXMMReg(rD));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign(f32lo, loadLE(Ity_F32, mkexpr(addr)) );
            delta += alen;
            DIP("vcvtss2sd %s,%s,%s\n",
                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
         }
         /* F32->F64 widening is exact, hence no rounding mode here. */
         putXMMRegLane64F( rD, 0,
                           unop( Iop_F32toF64, mkexpr(f32lo)) );
         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
         putYMMRegLane128( rD, 1, mkV128(0) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   25188 
   case 0x5B:
      /* Opcode 0x5B: int<->float conversions.  r2zero=True selects the
         truncating (VCVTT...) form, used for the F3-prefixed variants. */
      /* VCVTPS2DQ xmm2/m128, xmm1 = VEX.128.66.0F.WIG 5B /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTxPS2DQ_128( vbi, pfx, delta,
                                    True/*isAvx*/, False/*!r2zero*/ );
         goto decode_success;
      }
      /* VCVTPS2DQ ymm2/m256, ymm1 = VEX.256.66.0F.WIG 5B /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTxPS2DQ_256( vbi, pfx, delta,
                                    False/*!r2zero*/ );
         goto decode_success;
      }
      /* VCVTTPS2DQ xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 5B /r */
      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTxPS2DQ_128( vbi, pfx, delta,
                                    True/*isAvx*/, True/*r2zero*/ );
         goto decode_success;
      }
      /* VCVTTPS2DQ ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 5B /r */
      if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTxPS2DQ_256( vbi, pfx, delta,
                                    True/*r2zero*/ );
         goto decode_success;
      }
      /* VCVTDQ2PS xmm2/m128, xmm1 = VEX.128.0F.WIG 5B /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTDQ2PS_128 ( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VCVTDQ2PS ymm2/m256, ymm1 = VEX.256.0F.WIG 5B /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTDQ2PS_256 ( vbi, pfx, delta );
         goto decode_success;
      }
      break;
   25225 
   case 0x5C:
      /* Opcode 0x5C: packed/scalar FP subtract; same prefix layout as
         0x58 (F2 => sd, F3 => ss, then PS/PD by VEX.L). */
      /* VSUBSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5C /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vsubsd", Iop_Sub64F0x2 );
         goto decode_success;
      }
      /* VSUBSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5C /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vsubss", Iop_Sub32F0x4 );
         goto decode_success;
      }
      /* VSUBPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5C /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vsubps", Iop_Sub32Fx4 );
         goto decode_success;
      }
      /* VSUBPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5C /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vsubps", Iop_Sub32Fx8 );
         goto decode_success;
      }
      /* VSUBPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5C /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vsubpd", Iop_Sub64Fx2 );
         goto decode_success;
      }
      /* VSUBPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5C /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vsubpd", Iop_Sub64Fx4 );
         goto decode_success;
      }
      break;

   case 0x5D:
      /* Opcode 0x5D: packed/scalar FP minimum; same prefix layout. */
      /* VMINSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5D /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vminsd", Iop_Min64F0x2 );
         goto decode_success;
      }
      /* VMINSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5D /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vminss", Iop_Min32F0x4 );
         goto decode_success;
      }
      /* VMINPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5D /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vminps", Iop_Min32Fx4 );
         goto decode_success;
      }
      /* VMINPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5D /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vminps", Iop_Min32Fx8 );
         goto decode_success;
      }
      /* VMINPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5D /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vminpd", Iop_Min64Fx2 );
         goto decode_success;
      }
      /* VMINPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5D /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vminpd", Iop_Min64Fx4 );
         goto decode_success;
      }
      break;
   25303 
   case 0x5E:
      /* Opcode 0x5E: packed/scalar FP divide; same prefix layout as
         0x58 (F2 => sd, F3 => ss, then PS/PD by VEX.L). */
      /* VDIVSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5E /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vdivsd", Iop_Div64F0x2 );
         goto decode_success;
      }
      /* VDIVSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5E /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vdivss", Iop_Div32F0x4 );
         goto decode_success;
      }
      /* VDIVPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5E /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vdivps", Iop_Div32Fx4 );
         goto decode_success;
      }
      /* VDIVPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5E /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vdivps", Iop_Div32Fx8 );
         goto decode_success;
      }
      /* VDIVPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5E /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vdivpd", Iop_Div64Fx2 );
         goto decode_success;
      }
      /* VDIVPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5E /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vdivpd", Iop_Div64Fx4 );
         goto decode_success;
      }
      break;

   case 0x5F:
      /* Opcode 0x5F: packed/scalar FP maximum; same prefix layout. */
      /* VMAXSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5F /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vmaxsd", Iop_Max64F0x2 );
         goto decode_success;
      }
      /* VMAXSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5F /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vmaxss", Iop_Max32F0x4 );
         goto decode_success;
      }
      /* VMAXPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5F /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmaxps", Iop_Max32Fx4 );
         goto decode_success;
      }
      /* VMAXPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5F /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmaxps", Iop_Max32Fx8 );
         goto decode_success;
      }
      /* VMAXPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5F /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmaxpd", Iop_Max64Fx2 );
         goto decode_success;
      }
      /* VMAXPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5F /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmaxpd", Iop_Max64Fx4 );
         goto decode_success;
      }
      break;
   25381 
   case 0x60:
      /* VPUNPCKLBW r/m, rV, r ::: r = interleave-lo-bytes(rV, r/m) */
      /* VPUNPCKLBW = VEX.NDS.128.66.0F.WIG 60 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpcklbw",
                    Iop_InterleaveLO8x16, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPUNPCKLBW r/m, rV, r ::: r = interleave-lo-bytes(rV, r/m) */
      /* VPUNPCKLBW = VEX.NDS.256.66.0F.WIG 60 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         /* NOTE(review): the 256-bit form goes through a dedicated
            math_VPUNPCKLBW_YMM helper, presumably because the AVX2
            interleave works within each 128-bit lane -- confirm
            against the helper's definition. */
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpunpcklbw",
                    math_VPUNPCKLBW_YMM );
         goto decode_success;
      }
      break;

   case 0x61:
      /* VPUNPCKLWD r/m, rV, r ::: r = interleave-lo-words(rV, r/m) */
      /* VPUNPCKLWD = VEX.NDS.128.66.0F.WIG 61 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpcklwd",
                    Iop_InterleaveLO16x8, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPUNPCKLWD r/m, rV, r ::: r = interleave-lo-words(rV, r/m) */
      /* VPUNPCKLWD = VEX.NDS.256.66.0F.WIG 61 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpunpcklwd",
                    math_VPUNPCKLWD_YMM );
         goto decode_success;
      }
      break;

   case 0x62:
      /* VPUNPCKLDQ r/m, rV, r ::: r = interleave-lo-dwords(rV, r/m) */
      /* VPUNPCKLDQ = VEX.NDS.128.66.0F.WIG 62 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpckldq",
                    Iop_InterleaveLO32x4, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPUNPCKLDQ r/m, rV, r ::: r = interleave-lo-dwords(rV, r/m) */
      /* VPUNPCKLDQ = VEX.NDS.256.66.0F.WIG 62 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpunpckldq",
                    math_VPUNPCKLDQ_YMM );
         goto decode_success;
      }
      break;
   25441 
   case 0x63:
      /* Opcode 0x63: pack signed 16-bit lanes to signed 8-bit lanes
         with signed saturation (per the Iop name). */
      /* VPACKSSWB r/m, rV, r ::: r = QNarrowBin16Sto8Sx16(rV, r/m) */
      /* VPACKSSWB = VEX.NDS.128.66.0F.WIG 63 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpacksswb",
                    Iop_QNarrowBin16Sto8Sx16, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPACKSSWB r/m, rV, r ::: r = QNarrowBin16Sto8Sx16(rV, r/m) */
      /* VPACKSSWB = VEX.NDS.256.66.0F.WIG 63 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpacksswb",
                    math_VPACKSSWB_YMM );
         goto decode_success;
      }
      break;

   case 0x64:
      /* VPCMPGTB r/m, rV, r ::: r = rV `>s-by-8s` r/m */
      /* VPCMPGTB = VEX.NDS.128.66.0F.WIG 64 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpgtb", Iop_CmpGT8Sx16 );
         goto decode_success;
      }
      /* VPCMPGTB r/m, rV, r ::: r = rV `>s-by-8s` r/m */
      /* VPCMPGTB = VEX.NDS.256.66.0F.WIG 64 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpgtb", Iop_CmpGT8Sx32 );
         goto decode_success;
      }
      break;

   case 0x65:
      /* VPCMPGTW r/m, rV, r ::: r = rV `>s-by-16s` r/m */
      /* VPCMPGTW = VEX.NDS.128.66.0F.WIG 65 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpgtw", Iop_CmpGT16Sx8 );
         goto decode_success;
      }
      /* VPCMPGTW r/m, rV, r ::: r = rV `>s-by-16s` r/m */
      /* VPCMPGTW = VEX.NDS.256.66.0F.WIG 65 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpgtw", Iop_CmpGT16Sx16 );
         goto decode_success;
      }
      break;

   case 0x66:
      /* VPCMPGTD r/m, rV, r ::: r = rV `>s-by-32s` r/m */
      /* VPCMPGTD = VEX.NDS.128.66.0F.WIG 66 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpgtd", Iop_CmpGT32Sx4 );
         goto decode_success;
      }
      /* VPCMPGTD r/m, rV, r ::: r = rV `>s-by-32s` r/m */
      /* VPCMPGTD = VEX.NDS.256.66.0F.WIG 66 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpgtd", Iop_CmpGT32Sx8 );
         goto decode_success;
      }
      break;

   case 0x67:
      /* Opcode 0x67: pack signed 16-bit lanes to unsigned 8-bit lanes
         with unsigned saturation (per the Iop name). */
      /* VPACKUSWB r/m, rV, r ::: r = QNarrowBin16Sto8Ux16(rV, r/m) */
      /* VPACKUSWB = VEX.NDS.128.66.0F.WIG 67 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpackuswb",
                    Iop_QNarrowBin16Sto8Ux16, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPACKUSWB r/m, rV, r ::: r = QNarrowBin16Sto8Ux16(rV, r/m) */
      /* VPACKUSWB = VEX.NDS.256.66.0F.WIG 67 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpackuswb",
                    math_VPACKUSWB_YMM );
         goto decode_success;
      }
      break;
   25532 
   case 0x68:
      /* VPUNPCKHBW r/m, rV, r ::: r = interleave-hi-bytes(rV, r/m) */
      /* VPUNPCKHBW = VEX.NDS.128.66.0F.WIG 68 /r
         (the 66 prefix is mandatory -- it is what have66noF2noF3 tests;
         the previous comment omitted it) */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpckhbw",
                    Iop_InterleaveHI8x16, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPUNPCKHBW r/m, rV, r ::: r = interleave-hi-bytes(rV, r/m) */
      /* VPUNPCKHBW = VEX.NDS.256.66.0F.WIG 68 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpunpckhbw",
                    math_VPUNPCKHBW_YMM );
         goto decode_success;
      }
      break;
   25552 
   case 0x69:
      /* VPUNPCKHWD r/m, rV, r ::: r = interleave-hi-words(rV, r/m) */
      /* VPUNPCKHWD = VEX.NDS.128.66.0F.WIG 69 /r
         (66 prefix added to the encoding comment: it is required, as
         checked by have66noF2noF3 below) */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpckhwd",
                    Iop_InterleaveHI16x8, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPUNPCKHWD r/m, rV, r ::: r = interleave-hi-words(rV, r/m) */
      /* VPUNPCKHWD = VEX.NDS.256.66.0F.WIG 69 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpunpckhwd",
                    math_VPUNPCKHWD_YMM );
         goto decode_success;
      }
      break;
   25572 
   case 0x6A:
      /* opc 0x6A: VPUNPCKHDQ -- interleave the high 32-bit lanes of the
         two sources (Iop_InterleaveHI32x4); args swapped so the helper
         sees them in the order the Iop expects. */
      /* VPUNPCKHDQ r/m, rV, r ::: r = interleave-hi-dwords(rV, r/m) */
      /* VPUNPCKHDQ = VEX.NDS.128.66.0F.WIG 6A /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpckhdq",
                    Iop_InterleaveHI32x4, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPUNPCKHDQ r/m, rV, r ::: r = interleave-hi-dwords(rV, r/m) */
      /* VPUNPCKHDQ = VEX.NDS.256.66.0F.WIG 6A /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpunpckhdq",
                    math_VPUNPCKHDQ_YMM );
         goto decode_success;
      }
      break;
   25592 
   case 0x6B:
      /* opc 0x6B: VPACKSSDW -- narrow signed 32-bit lanes to signed
         16-bit lanes with saturation (Iop_QNarrowBin32Sto16Sx8). */
      /* VPACKSSDW r/m, rV, r ::: r = QNarrowBin32Sto16Sx8(rV, r/m) */
      /* VPACKSSDW = VEX.NDS.128.66.0F.WIG 6B /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpackssdw",
                    Iop_QNarrowBin32Sto16Sx8, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPACKSSDW r/m, rV, r ::: r = QNarrowBin32Sto16Sx8(rV, r/m) */
      /* VPACKSSDW = VEX.NDS.256.66.0F.WIG 6B /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpackssdw",
                    math_VPACKSSDW_YMM );
         goto decode_success;
      }
      break;
   25612 
   case 0x6C:
      /* VPUNPCKLQDQ r/m, rV, r ::: r = interleave-lo-64bitses(rV, r/m) */
      /* VPUNPCKLQDQ = VEX.NDS.128.66.0F.WIG 6C /r
         (66 prefix added to the encoding comment: it is required, as
         checked by have66noF2noF3 below) */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpcklqdq",
                    Iop_InterleaveLO64x2, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPUNPCKLQDQ r/m, rV, r ::: r = interleave-lo-64bitses(rV, r/m) */
      /* VPUNPCKLQDQ = VEX.NDS.256.66.0F.WIG 6C /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpunpcklqdq",
                    math_VPUNPCKLQDQ_YMM );
         goto decode_success;
      }
      break;
   25632 
   case 0x6D:
      /* VPUNPCKHQDQ r/m, rV, r ::: r = interleave-hi-64bitses(rV, r/m) */
      /* VPUNPCKHQDQ = VEX.NDS.128.66.0F.WIG 6D /r
         (66 prefix added to the encoding comment: it is required, as
         checked by have66noF2noF3 below) */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpckhqdq",
                    Iop_InterleaveHI64x2, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPUNPCKHQDQ r/m, rV, r ::: r = interleave-hi-64bitses(rV, r/m) */
      /* VPUNPCKHQDQ = VEX.NDS.256.66.0F.WIG 6D /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpunpckhqdq",
                    math_VPUNPCKHQDQ_YMM );
         goto decode_success;
      }
      break;
   25652 
   case 0x6E:
      /* opc 0x6E: scalar GPR/memory -> low xmm lane move.  REX.W
         selects the width: W0 = 32-bit VMOVD, W1 = 64-bit VMOVQ.
         putYMMRegLoAndZU writes the low 128 bits and, per its name,
         zeroes the upper YMM lane (AVX zero-upper semantics). */
      /* VMOVD r32/m32, xmm1 = VEX.128.66.0F.W0 6E */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         vassert(sz == 2); /* even tho we are transferring 4, not 2. */
         UChar modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta += 1;
            /* Iop_32UtoV128 zero-extends the 32-bit value into a V128. */
            putYMMRegLoAndZU(
               gregOfRexRM(pfx,modrm),
               unop( Iop_32UtoV128, getIReg32(eregOfRexRM(pfx,modrm)) )
            );
            DIP("vmovd %s, %s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
        } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            putYMMRegLoAndZU(
               gregOfRexRM(pfx,modrm),
               unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)))
                             );
            DIP("vmovd %s, %s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
         }
         goto decode_success;
      }
      /* VMOVQ r64/m64, xmm1 = VEX.128.66.0F.W1 6E */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
         vassert(sz == 2); /* even tho we are transferring 8, not 2. */
         UChar modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta += 1;
            putYMMRegLoAndZU(
               gregOfRexRM(pfx,modrm),
               unop( Iop_64UtoV128, getIReg64(eregOfRexRM(pfx,modrm)) )
            );
            DIP("vmovq %s, %s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
        } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            putYMMRegLoAndZU(
               gregOfRexRM(pfx,modrm),
               unop( Iop_64UtoV128,loadLE(Ity_I64, mkexpr(addr)))
                             );
            DIP("vmovq %s, %s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
         }
         goto decode_success;
      }
      break;
   25705 
   case 0x6F:
      /* opc 0x6F: aligned/unaligned vector loads.  The 66 prefix means
         VMOVDQA (aligned -- a #GP-style fault is raised on misaligned
         memory operands), F3 means VMOVDQU (no alignment check). */
      /* VMOVDQA ymm2/m256, ymm1 = VEX.256.66.0F.WIG 6F */
      /* VMOVDQU ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 6F */
      if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
          && 1==getVexL(pfx)/*256*/) {
         UChar  modrm = getUChar(delta);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp tD    = newTemp(Ity_V256);
         Bool   isA   = have66noF2noF3(pfx);
         HChar  ch    = isA ? 'a' : 'u';
         if (epartIsReg(modrm)) {
            UInt rS = eregOfRexRM(pfx, modrm);
            delta += 1;
            assign(tD, getYMMReg(rS));
            DIP("vmovdq%c %s,%s\n", ch, nameYMMReg(rS), nameYMMReg(rD));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            /* Alignment fault only for the 'A' (aligned) form. */
            if (isA)
               gen_SEGV_if_not_32_aligned(addr);
            assign(tD, loadLE(Ity_V256, mkexpr(addr)));
            DIP("vmovdq%c %s,%s\n", ch, dis_buf, nameYMMReg(rD));
         }
         putYMMReg(rD, mkexpr(tD));
         goto decode_success;
      }
      /* VMOVDQA xmm2/m128, xmm1 = VEX.128.66.0F.WIG 6F */
      /* VMOVDQU xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 6F */
      if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
          && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp tD    = newTemp(Ity_V128);
         Bool   isA   = have66noF2noF3(pfx);
         HChar  ch    = isA ? 'a' : 'u';
         if (epartIsReg(modrm)) {
            UInt rS = eregOfRexRM(pfx, modrm);
            delta += 1;
            assign(tD, getXMMReg(rS));
            DIP("vmovdq%c %s,%s\n", ch, nameXMMReg(rS), nameXMMReg(rD));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            if (isA)
               gen_SEGV_if_not_16_aligned(addr);
            assign(tD, loadLE(Ity_V128, mkexpr(addr)));
            DIP("vmovdq%c %s,%s\n", ch, dis_buf, nameXMMReg(rD));
         }
         /* 128-bit form zeroes the upper YMM lane (AVX semantics). */
         putYMMRegLoAndZU(rD, mkexpr(tD));
         goto decode_success;
      }
      break;
   25758 
   case 0x70:
      /* opc 0x70: shuffles with an imm8 selector.  The SIMD prefix
         selects the variant: 66 = VPSHUFD (dwords), F2 = VPSHUFLW
         (low words), F3 = VPSHUFHW (high words). */
      /* VPSHUFD imm8, xmm2/m128, xmm1 = VEX.128.66.0F.WIG 70 /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PSHUFD_32x4( vbi, pfx, delta, True/*writesYmm*/);
         goto decode_success;
      }
      /* VPSHUFD imm8, ymm2/m256, ymm1 = VEX.256.66.0F.WIG 70 /r ib */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PSHUFD_32x8( vbi, pfx, delta);
         goto decode_success;
      }
      /* VPSHUFLW imm8, xmm2/m128, xmm1 = VEX.128.F2.0F.WIG 70 /r ib */
      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PSHUFxW_128( vbi, pfx, delta,
                                  True/*isAvx*/, False/*!xIsH*/ );
         goto decode_success;
      }
      /* VPSHUFLW imm8, ymm2/m256, ymm1 = VEX.256.F2.0F.WIG 70 /r ib */
      if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PSHUFxW_256( vbi, pfx, delta, False/*!xIsH*/ );
         goto decode_success;
      }
      /* VPSHUFHW imm8, xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 70 /r ib */
      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PSHUFxW_128( vbi, pfx, delta,
                                  True/*isAvx*/, True/*xIsH*/ );
         goto decode_success;
      }
      /* VPSHUFHW imm8, ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 70 /r ib */
      if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PSHUFxW_256( vbi, pfx, delta, True/*xIsH*/ );
         goto decode_success;
      }
      break;
   25793 
   case 0x71:
      /* opc 0x71: 16-bit-lane shift-by-immediate group.  The reg field
         of the modrm byte (gregLO3ofRM) selects the operation:
         /2 = SRL, /4 = SRA, /6 = SLL.  The modrm must encode a
         register (epartIsReg); other /n values fall through to
         decode failure. */
      /* VPSRLW imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 71 /2 ib */
      /* VPSRAW imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 71 /4 ib */
      /* VPSLLW imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 71 /6 ib */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/
          && epartIsReg(getUChar(delta))) {
         if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpsrlw", Iop_ShrN16x8 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpsraw", Iop_SarN16x8 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpsllw", Iop_ShlN16x8 );
            *uses_vvvv = True;
            goto decode_success;
         }
         /* else fall through */
      }
      /* VPSRLW imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 71 /2 ib */
      /* VPSRAW imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 71 /4 ib */
      /* VPSLLW imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 71 /6 ib */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/
          && epartIsReg(getUChar(delta))) {
         if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
                                                "vpsrlw", Iop_ShrN16x16 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
                                                "vpsraw", Iop_SarN16x16 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
                                                "vpsllw", Iop_ShlN16x16 );
            *uses_vvvv = True;
            goto decode_success;
         }
         /* else fall through */
      }
      break;
   25848 
   case 0x72:
      /* opc 0x72: 32-bit-lane shift-by-immediate group; same modrm-reg
         dispatch as opc 0x71: /2 = SRL, /4 = SRA, /6 = SLL. */
      /* VPSRLD imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 72 /2 ib */
      /* VPSRAD imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 72 /4 ib */
      /* VPSLLD imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 72 /6 ib */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/
          && epartIsReg(getUChar(delta))) {
         if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpsrld", Iop_ShrN32x4 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpsrad", Iop_SarN32x4 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpslld", Iop_ShlN32x4 );
            *uses_vvvv = True;
            goto decode_success;
         }
         /* else fall through */
      }
      /* VPSRLD imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 72 /2 ib */
      /* VPSRAD imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 72 /4 ib */
      /* VPSLLD imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 72 /6 ib */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/
          && epartIsReg(getUChar(delta))) {
         if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
                                                "vpsrld", Iop_ShrN32x8 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
                                                "vpsrad", Iop_SarN32x8 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
                                                "vpslld", Iop_ShlN32x8 );
            *uses_vvvv = True;
            goto decode_success;
         }
         /* else fall through */
      }
      break;
   25903 
   case 0x73:
      /* opc 0x73: 64-bit-lane shifts plus whole-register byte shifts,
         selected by the modrm reg field: /2 = VPSRLQ, /3 = VPSRLDQ
         (byte shift right), /6 = VPSLLQ, /7 = VPSLLDQ (byte shift
         left).  The byte shifts are decoded inline (imm8 at delta+1);
         the qword shifts go through the shared shift helper. */
      /* VPSRLDQ imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /3 ib */
      /* VPSLLDQ imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /7 ib */
      /* VPSRLQ  imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /2 ib */
      /* VPSLLQ  imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /6 ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && epartIsReg(getUChar(delta))) {
         Int    rS   = eregOfRexRM(pfx,getUChar(delta));
         Int    rD   = getVexNvvvv(pfx);
         IRTemp vecS = newTemp(Ity_V128);
         if (gregLO3ofRM(getUChar(delta)) == 3) {
            Int imm = (Int)getUChar(delta+1);
            DIP("vpsrldq $%d,%s,%s\n", imm, nameXMMReg(rS), nameXMMReg(rD));
            delta += 2;
            assign( vecS, getXMMReg(rS) );
            putYMMRegLoAndZU(rD, mkexpr(math_PSRLDQ( vecS, imm )));
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 7) {
            Int imm = (Int)getUChar(delta+1);
            DIP("vpslldq $%d,%s,%s\n", imm, nameXMMReg(rS), nameXMMReg(rD));
            delta += 2;
            assign( vecS, getXMMReg(rS) );
            putYMMRegLoAndZU(rD, mkexpr(math_PSLLDQ( vecS, imm )));
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 2) {
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpsrlq", Iop_ShrN64x2 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 6) {
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpsllq", Iop_ShlN64x2 );
            *uses_vvvv = True;
            goto decode_success;
         }
         /* else fall through */
      }
      /* VPSRLDQ imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /3 ib */
      /* VPSLLDQ imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /7 ib */
      /* VPSRLQ  imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /2 ib */
      /* VPSLLQ  imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /6 ib */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && epartIsReg(getUChar(delta))) {
         Int    rS   = eregOfRexRM(pfx,getUChar(delta));
         Int    rD   = getVexNvvvv(pfx);
         if (gregLO3ofRM(getUChar(delta)) == 3) {
            /* 256-bit byte shift: applied to each 128-bit lane
               independently, matching AVX2 VPSRLDQ semantics. */
            IRTemp vecS0 = newTemp(Ity_V128);
            IRTemp vecS1 = newTemp(Ity_V128);
            Int imm = (Int)getUChar(delta+1);
            DIP("vpsrldq $%d,%s,%s\n", imm, nameYMMReg(rS), nameYMMReg(rD));
            delta += 2;
            assign( vecS0, getYMMRegLane128(rS, 0));
            assign( vecS1, getYMMRegLane128(rS, 1));
            putYMMRegLane128(rD, 0, mkexpr(math_PSRLDQ( vecS0, imm )));
            putYMMRegLane128(rD, 1, mkexpr(math_PSRLDQ( vecS1, imm )));
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 7) {
            IRTemp vecS0 = newTemp(Ity_V128);
            IRTemp vecS1 = newTemp(Ity_V128);
            Int imm = (Int)getUChar(delta+1);
            DIP("vpslldq $%d,%s,%s\n", imm, nameYMMReg(rS), nameYMMReg(rD));
            delta += 2;
            assign( vecS0, getYMMRegLane128(rS, 0));
            assign( vecS1, getYMMRegLane128(rS, 1));
            putYMMRegLane128(rD, 0, mkexpr(math_PSLLDQ( vecS0, imm )));
            putYMMRegLane128(rD, 1, mkexpr(math_PSLLDQ( vecS1, imm )));
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 2) {
            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
                                                "vpsrlq", Iop_ShrN64x4 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 6) {
            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
                                                "vpsllq", Iop_ShlN64x4 );
            *uses_vvvv = True;
            goto decode_success;
         }
         /* else fall through */
      }
      break;
   25995 
   25996    case 0x74:
   25997       /* VPCMPEQB r/m, rV, r ::: r = rV `eq-by-8s` r/m */
   25998       /* VPCMPEQB = VEX.NDS.128.66.0F.WIG 74 /r */
   25999       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26000          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   26001                     uses_vvvv, vbi, pfx, delta, "vpcmpeqb", Iop_CmpEQ8x16 );
   26002          goto decode_success;
   26003       }
   26004       /* VPCMPEQB r/m, rV, r ::: r = rV `eq-by-8s` r/m */
   26005       /* VPCMPEQB = VEX.NDS.256.66.0F.WIG 74 /r */
   26006       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26007          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   26008                     uses_vvvv, vbi, pfx, delta, "vpcmpeqb", Iop_CmpEQ8x32 );
   26009          goto decode_success;
   26010       }
   26011       break;
   26012 
   26013    case 0x75:
   26014       /* VPCMPEQW r/m, rV, r ::: r = rV `eq-by-16s` r/m */
   26015       /* VPCMPEQW = VEX.NDS.128.66.0F.WIG 75 /r */
   26016       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26017          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   26018                     uses_vvvv, vbi, pfx, delta, "vpcmpeqw", Iop_CmpEQ16x8 );
   26019          goto decode_success;
   26020       }
   26021       /* VPCMPEQW r/m, rV, r ::: r = rV `eq-by-16s` r/m */
   26022       /* VPCMPEQW = VEX.NDS.256.66.0F.WIG 75 /r */
   26023       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26024          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   26025                     uses_vvvv, vbi, pfx, delta, "vpcmpeqw", Iop_CmpEQ16x16 );
   26026          goto decode_success;
   26027       }
   26028       break;
   26029 
   26030    case 0x76:
   26031       /* VPCMPEQD r/m, rV, r ::: r = rV `eq-by-32s` r/m */
   26032       /* VPCMPEQD = VEX.NDS.128.66.0F.WIG 76 /r */
   26033       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26034          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   26035                     uses_vvvv, vbi, pfx, delta, "vpcmpeqd", Iop_CmpEQ32x4 );
   26036          goto decode_success;
   26037       }
   26038       /* VPCMPEQD r/m, rV, r ::: r = rV `eq-by-32s` r/m */
   26039       /* VPCMPEQD = VEX.NDS.256.66.0F.WIG 76 /r */
   26040       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26041          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   26042                     uses_vvvv, vbi, pfx, delta, "vpcmpeqd", Iop_CmpEQ32x8 );
   26043          goto decode_success;
   26044       }
   26045       break;
   26046 
   case 0x77:
      /* opc 0x77: VEX.L selects VZEROUPPER (L=0, clear bits 255:128 of
         every ymm register) vs VZEROALL (L=1, clear all 256 bits of
         every ymm register).  Both loop over all 16 vector registers. */
      /* VZEROUPPER = VEX.128.0F.WIG 77 */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Int i;
         IRTemp zero128 = newTemp(Ity_V128);
         assign(zero128, mkV128(0));
         for (i = 0; i < 16; i++) {
            /* Lane 1 == the upper 128 bits; the low lane is untouched. */
            putYMMRegLane128(i, 1, mkexpr(zero128));
         }
         DIP("vzeroupper\n");
         goto decode_success;
      }
      /* VZEROALL = VEX.256.0F.WIG 77 */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         Int i;
         IRTemp zero128 = newTemp(Ity_V128);
         assign(zero128, mkV128(0));
         for (i = 0; i < 16; i++) {
            /* Writes zero to the low lane and zeroes the upper lane. */
            putYMMRegLoAndZU(i, mkexpr(zero128));
         }
         DIP("vzeroall\n");
         goto decode_success;
      }
      break;
   26071 
   26072    case 0x7C:
   26073    case 0x7D:
   26074       /* VHADDPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.F2.0F.WIG 7C /r */
   26075       /* VHSUBPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.F2.0F.WIG 7D /r */
   26076       if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26077          IRTemp sV     = newTemp(Ity_V128);
   26078          IRTemp dV     = newTemp(Ity_V128);
   26079          Bool   isAdd  = opc == 0x7C;
   26080          const HChar* str = isAdd ? "add" : "sub";
   26081          UChar modrm   = getUChar(delta);
   26082          UInt   rG     = gregOfRexRM(pfx,modrm);
   26083          UInt   rV     = getVexNvvvv(pfx);
   26084          if (epartIsReg(modrm)) {
   26085             UInt rE = eregOfRexRM(pfx,modrm);
   26086             assign( sV, getXMMReg(rE) );
   26087             DIP("vh%spd %s,%s,%s\n", str, nameXMMReg(rE),
   26088                 nameXMMReg(rV), nameXMMReg(rG));
   26089             delta += 1;
   26090          } else {
   26091             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   26092             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   26093             DIP("vh%spd %s,%s,%s\n", str, dis_buf,
   26094                 nameXMMReg(rV), nameXMMReg(rG));
   26095             delta += alen;
   26096          }
   26097          assign( dV, getXMMReg(rV) );
   26098          putYMMRegLoAndZU( rG, mkexpr( math_HADDPS_128 ( dV, sV, isAdd ) ) );
   26099          *uses_vvvv = True;
   26100          goto decode_success;
   26101       }
   26102       /* VHADDPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.F2.0F.WIG 7C /r */
   26103       /* VHSUBPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.F2.0F.WIG 7D /r */
   26104       if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26105          IRTemp sV     = newTemp(Ity_V256);
   26106          IRTemp dV     = newTemp(Ity_V256);
   26107          IRTemp s1, s0, d1, d0;
   26108          Bool   isAdd  = opc == 0x7C;
   26109          const HChar* str = isAdd ? "add" : "sub";
   26110          UChar modrm   = getUChar(delta);
   26111          UInt   rG     = gregOfRexRM(pfx,modrm);
   26112          UInt   rV     = getVexNvvvv(pfx);
   26113          s1 = s0 = d1 = d0 = IRTemp_INVALID;
   26114          if (epartIsReg(modrm)) {
   26115             UInt rE = eregOfRexRM(pfx,modrm);
   26116             assign( sV, getYMMReg(rE) );
   26117             DIP("vh%spd %s,%s,%s\n", str, nameYMMReg(rE),
   26118                 nameYMMReg(rV), nameYMMReg(rG));
   26119             delta += 1;
   26120          } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
            /* NOTE(review): this fragment prints "vh%spd" but computes via
               math_HADDPS_128 below -- one of the two looks like a
               copy/paste slip; confirm against the 128-bit PS case. */
            DIP("vh%spd %s,%s,%s\n", str, dis_buf,
                nameYMMReg(rV), nameYMMReg(rG));
            delta += alen;
         }
         assign( dV, getYMMReg(rV) );
         /* Do the 256-bit horizontal op as two independent 128-bit
            halves, then glue the results back together. */
         breakupV256toV128s( dV, &d1, &d0 );
         breakupV256toV128s( sV, &s1, &s0 );
         putYMMReg( rG, binop(Iop_V128HLtoV256,
                              mkexpr( math_HADDPS_128 ( d1, s1, isAdd ) ),
                              mkexpr( math_HADDPS_128 ( d0, s0, isAdd ) ) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VHADDPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 7C /r */
      /* VHSUBPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 7D /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         IRTemp sV     = newTemp(Ity_V128);
         IRTemp dV     = newTemp(Ity_V128);
         Bool   isAdd  = opc == 0x7C;   /* 7C = hadd, 7D = hsub */
         const HChar* str = isAdd ? "add" : "sub";
         UChar modrm   = getUChar(delta);
         UInt   rG     = gregOfRexRM(pfx,modrm);
         UInt   rV     = getVexNvvvv(pfx);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( sV, getXMMReg(rE) );
            DIP("vh%spd %s,%s,%s\n", str, nameXMMReg(rE),
                nameXMMReg(rV), nameXMMReg(rG));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            DIP("vh%spd %s,%s,%s\n", str, dis_buf,
                nameXMMReg(rV), nameXMMReg(rG));
            delta += alen;
         }
         assign( dV, getXMMReg(rV) );
         /* Write the 128-bit result and zero the upper YMM lane, as
            required for VEX.128 encoded ops. */
         putYMMRegLoAndZU( rG, mkexpr( math_HADDPD_128 ( dV, sV, isAdd ) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VHADDPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 7C /r */
      /* VHSUBPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 7D /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         IRTemp sV     = newTemp(Ity_V256);
         IRTemp dV     = newTemp(Ity_V256);
         IRTemp s1, s0, d1, d0;
         Bool   isAdd  = opc == 0x7C;   /* 7C = hadd, 7D = hsub */
         const HChar* str = isAdd ? "add" : "sub";
         UChar modrm   = getUChar(delta);
         UInt   rG     = gregOfRexRM(pfx,modrm);
         UInt   rV     = getVexNvvvv(pfx);
         s1 = s0 = d1 = d0 = IRTemp_INVALID;
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( sV, getYMMReg(rE) );
            DIP("vh%spd %s,%s,%s\n", str, nameYMMReg(rE),
                nameYMMReg(rV), nameYMMReg(rG));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
            DIP("vh%spd %s,%s,%s\n", str, dis_buf,
                nameYMMReg(rV), nameYMMReg(rG));
            delta += alen;
         }
         assign( dV, getYMMReg(rV) );
         /* 256-bit op == two independent 128-bit ops on the halves. */
         breakupV256toV128s( dV, &d1, &d0 );
         breakupV256toV128s( sV, &s1, &s0 );
         putYMMReg( rG, binop(Iop_V128HLtoV256,
                              mkexpr( math_HADDPD_128 ( d1, s1, isAdd ) ),
                              mkexpr( math_HADDPD_128 ( d0, s0, isAdd ) ) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   26199 
   case 0x7E:
      /* Note the Intel docs don't make sense for this.  I think they
         are wrong.  They seem to imply it is a store when in fact I
         think it is a load.  Also it's unclear whether this is W0, W1
         or WIG. */
      /* VMOVQ xmm2/m64, xmm1 = VEX.128.F3.0F.W0 7E /r */
      /* Load form: move 64 bits from E (reg-lo-half or mem) into the
         low lane of G, zeroing bits 255:64. */
      if (haveF3no66noF2(pfx)
          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         vassert(sz == 4); /* even tho we are transferring 8, not 4. */
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            putXMMRegLane64( rG, 0, getXMMRegLane64( rE, 0 ));
            DIP("vmovq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            putXMMRegLane64( rG, 0, loadLE(Ity_I64, mkexpr(addr)) );
            DIP("vmovq %s,%s\n", dis_buf, nameXMMReg(rG));
            delta += alen;
         }
         /* zero bits 255:64 */
         putXMMRegLane64( rG, 1, mkU64(0) );
         putYMMRegLane128( rG, 1, mkV128(0) );
         goto decode_success;
      }
      /* VMOVQ xmm1, r64 = VEX.128.66.0F.W1 7E /r (reg case only) */
      /* Moves from G to E, so is a store-form insn */
      /* Intel docs list this in the VMOVD entry for some reason. */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            DIP("vmovq %s,%s\n", nameXMMReg(rG), nameIReg64(rE));
            putIReg64(rE, getXMMRegLane64(rG, 0));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            storeLE( mkexpr(addr), getXMMRegLane64(rG, 0) );
            DIP("vmovq %s,%s\n", dis_buf, nameXMMReg(rG));
            delta += alen;
         }
         goto decode_success;
      }
      /* VMOVD xmm1, m32/r32 = VEX.128.66.0F.W0 7E /r (reg case only) */
      /* Moves from G to E, so is a store-form insn */
      /* Same as the W1 form above, but transfers 32 bits. */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            DIP("vmovd %s,%s\n", nameXMMReg(rG), nameIReg32(rE));
            putIReg32(rE, getXMMRegLane32(rG, 0));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            storeLE( mkexpr(addr), getXMMRegLane32(rG, 0) );
            DIP("vmovd %s,%s\n", dis_buf, nameXMMReg(rG));
            delta += alen;
         }
         goto decode_success;
      }
      break;
   26267 
   case 0x7F:
      /* Store-form (G to E) vector moves.  The 'a' (aligned) variant
         raises SEGV if a memory destination is not suitably aligned;
         the 'u' variant has no alignment requirement. */
      /* VMOVDQA ymm1, ymm2/m256 = VEX.256.66.0F.WIG 7F */
      /* VMOVDQU ymm1, ymm2/m256 = VEX.256.F3.0F.WIG 7F */
      if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
          && 1==getVexL(pfx)/*256*/) {
         UChar  modrm = getUChar(delta);
         UInt   rS    = gregOfRexRM(pfx, modrm);
         IRTemp tS    = newTemp(Ity_V256);
         Bool   isA   = have66noF2noF3(pfx);   /* 66 => aligned form */
         HChar  ch    = isA ? 'a' : 'u';
         assign(tS, getYMMReg(rS));
         if (epartIsReg(modrm)) {
            UInt rD = eregOfRexRM(pfx, modrm);
            delta += 1;
            putYMMReg(rD, mkexpr(tS));
            DIP("vmovdq%c %s,%s\n", ch, nameYMMReg(rS), nameYMMReg(rD));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            if (isA)
               gen_SEGV_if_not_32_aligned(addr);
            storeLE(mkexpr(addr), mkexpr(tS));
            DIP("vmovdq%c %s,%s\n", ch, nameYMMReg(rS), dis_buf);
         }
         goto decode_success;
      }
      /* VMOVDQA xmm1, xmm2/m128 = VEX.128.66.0F.WIG 7F */
      /* VMOVDQU xmm1, xmm2/m128 = VEX.128.F3.0F.WIG 7F */
      if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
          && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   rS    = gregOfRexRM(pfx, modrm);
         IRTemp tS    = newTemp(Ity_V128);
         Bool   isA   = have66noF2noF3(pfx);   /* 66 => aligned form */
         HChar  ch    = isA ? 'a' : 'u';
         assign(tS, getXMMReg(rS));
         if (epartIsReg(modrm)) {
            UInt rD = eregOfRexRM(pfx, modrm);
            delta += 1;
            putYMMRegLoAndZU(rD, mkexpr(tS));
            DIP("vmovdq%c %s,%s\n", ch, nameXMMReg(rS), nameXMMReg(rD));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            if (isA)
               gen_SEGV_if_not_16_aligned(addr);
            storeLE(mkexpr(addr), mkexpr(tS));
            DIP("vmovdq%c %s,%s\n", ch, nameXMMReg(rS), dis_buf);
         }
         goto decode_success;
      }
      break;
   26320 
   case 0xAE:
      /* Group-encoded: the reg field of the modrm byte (gregLO3ofRM)
         selects the sub-opcode -- /3 = VSTMXCSR, /2 = VLDMXCSR.  Both
         are memory-only forms. */
      /* VSTMXCSR m32 = VEX.LZ.0F.WIG AE /3 */
      if (haveNo66noF2noF3(pfx)
          && 0==getVexL(pfx)/*LZ*/
          && 0==getRexW(pfx) /* be paranoid -- Intel docs don't require this */
          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 3
          && sz == 4) {
         delta = dis_STMXCSR(vbi, pfx, delta, True/*isAvx*/);
         goto decode_success;
      }
      /* VLDMXCSR m32 = VEX.LZ.0F.WIG AE /2 */
      if (haveNo66noF2noF3(pfx)
          && 0==getVexL(pfx)/*LZ*/
          && 0==getRexW(pfx) /* be paranoid -- Intel docs don't require this */
          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 2
          && sz == 4) {
         delta = dis_LDMXCSR(vbi, pfx, delta, True/*isAvx*/);
         goto decode_success;
      }
      break;
   26341 
   case 0xC2:
      /* All six forms below share the same idiom: the helper returns
         delta unchanged if it could not decode (e.g. an unsupported
         immediate), in which case we fall through to try the next
         candidate / fail the decode. */
      /* VCMPSD xmm3/m64(E=argL), xmm2(V=argR), xmm1(G) */
      /* = VEX.NDS.LIG.F2.0F.WIG C2 /r ib */
      if (haveF2no66noF3(pfx)) {
         Long delta0 = delta;
         delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
                                          "vcmpsd", False/*!all_lanes*/,
                                          8/*sz*/);
         if (delta > delta0) goto decode_success;
         /* else fall through -- decoding has failed */
      }
      /* VCMPSS xmm3/m32(E=argL), xmm2(V=argR), xmm1(G) */
      /* = VEX.NDS.LIG.F3.0F.WIG C2 /r ib */
      if (haveF3no66noF2(pfx)) {
         Long delta0 = delta;
         delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
                                          "vcmpss", False/*!all_lanes*/,
                                          4/*sz*/);
         if (delta > delta0) goto decode_success;
         /* else fall through -- decoding has failed */
      }
      /* VCMPPD xmm3/m128(E=argL), xmm2(V=argR), xmm1(G) */
      /* = VEX.NDS.128.66.0F.WIG C2 /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Long delta0 = delta;
         delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
                                          "vcmppd", True/*all_lanes*/,
                                          8/*sz*/);
         if (delta > delta0) goto decode_success;
         /* else fall through -- decoding has failed */
      }
      /* VCMPPD ymm3/m256(E=argL), ymm2(V=argR), ymm1(G) */
      /* = VEX.NDS.256.66.0F.WIG C2 /r ib */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         Long delta0 = delta;
         delta = dis_AVX256_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
                                          "vcmppd", 8/*sz*/);
         if (delta > delta0) goto decode_success;
         /* else fall through -- decoding has failed */
      }
      /* VCMPPS xmm3/m128(E=argL), xmm2(V=argR), xmm1(G) */
      /* = VEX.NDS.128.0F.WIG C2 /r ib */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Long delta0 = delta;
         delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
                                          "vcmpps", True/*all_lanes*/,
                                          4/*sz*/);
         if (delta > delta0) goto decode_success;
         /* else fall through -- decoding has failed */
      }
      /* VCMPPS ymm3/m256(E=argL), ymm2(V=argR), ymm1(G) */
      /* = VEX.NDS.256.0F.WIG C2 /r ib */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         Long delta0 = delta;
         delta = dis_AVX256_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
                                          "vcmpps", 4/*sz*/);
         if (delta > delta0) goto decode_success;
         /* else fall through -- decoding has failed */
      }
      break;
   26402 
   case 0xC4:
      /* VPINSRW r32/m16, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG C4 /r ib */
      /* Insert a 16-bit value into the lane of xmm2 selected by the low
         3 bits of the immediate; result goes to xmm1 (upper YMM lane
         zeroed). */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         Int    imm8;
         IRTemp new16 = newTemp(Ity_I16);

         if ( epartIsReg( modrm ) ) {
            /* & 7: only the low 3 immediate bits select the lane. */
            imm8 = (Int)(getUChar(delta+1) & 7);
            assign( new16, unop(Iop_32to16,
                                getIReg32(eregOfRexRM(pfx,modrm))) );
            delta += 1+1;
            DIP( "vpinsrw $%d,%s,%s\n", imm8,
                 nameIReg32( eregOfRexRM(pfx, modrm) ), nameXMMReg(rG) );
         } else {
            /* Final arg 1 to disAMode: one immediate byte follows the
               address mode, so displacement fixup must allow for it. */
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            imm8 = (Int)(getUChar(delta+alen) & 7);
            assign( new16, loadLE( Ity_I16, mkexpr(addr) ));
            delta += alen+1;
            DIP( "vpinsrw $%d,%s,%s\n",
                 imm8, dis_buf, nameXMMReg(rG) );
         }

         IRTemp src_vec = newTemp(Ity_V128);
         assign(src_vec, getXMMReg( rV ));
         IRTemp res_vec = math_PINSRW_128( src_vec, new16, imm8 );
         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0xC5:
      /* VPEXTRW imm8, xmm1, reg32 = VEX.128.66.0F.W0 C5 /r ib */
      /* Register-source-only form; the helper leaves delta unchanged
         if it cannot decode, in which case we fail the decode. */
      if (have66noF2noF3(pfx)
         && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         Long delta0 = delta;
         delta = dis_PEXTRW_128_EregOnly_toG( vbi, pfx, delta,
                                              True/*isAvx*/ );
         if (delta > delta0) goto decode_success;
         /* else fall through -- decoding has failed */
      }
      break;
   26448 
   case 0xC6:
      /* VSHUFPS imm8, xmm3/m128, xmm2, xmm1 */
      /* = VEX.NDS.128.0F.WIG C6 /r ib */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Int    imm8 = 0;
         IRTemp eV   = newTemp(Ity_V128);
         IRTemp vV   = newTemp(Ity_V128);
         UInt  modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         UInt  rV    = getVexNvvvv(pfx);
         assign( vV, getXMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getXMMReg(rE) );
            imm8 = (Int)getUChar(delta+1);
            delta += 1+1;
            DIP("vshufps $%d,%s,%s,%s\n",
                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
         } else {
            /* Final arg 1 to disAMode: one immediate byte follows the
               address mode. */
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
            imm8 = (Int)getUChar(delta+alen);
            delta += 1+alen;
            DIP("vshufps $%d,%s,%s,%s\n",
                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
         }
         IRTemp res = math_SHUFPS_128( eV, vV, imm8 );
         putYMMRegLoAndZU( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VSHUFPS imm8, ymm3/m256, ymm2, ymm1 */
      /* = VEX.NDS.256.0F.WIG C6 /r ib */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         Int    imm8 = 0;
         IRTemp eV   = newTemp(Ity_V256);
         IRTemp vV   = newTemp(Ity_V256);
         UInt  modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         UInt  rV    = getVexNvvvv(pfx);
         assign( vV, getYMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getYMMReg(rE) );
            imm8 = (Int)getUChar(delta+1);
            delta += 1+1;
            DIP("vshufps $%d,%s,%s,%s\n",
                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
            imm8 = (Int)getUChar(delta+alen);
            delta += 1+alen;
            DIP("vshufps $%d,%s,%s,%s\n",
                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
         }
         IRTemp res = math_SHUFPS_256( eV, vV, imm8 );
         putYMMReg( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VSHUFPD imm8, xmm3/m128, xmm2, xmm1 */
      /* = VEX.NDS.128.66.0F.WIG C6 /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Int    imm8 = 0;
         IRTemp eV   = newTemp(Ity_V128);
         IRTemp vV   = newTemp(Ity_V128);
         UInt  modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         UInt  rV    = getVexNvvvv(pfx);
         assign( vV, getXMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getXMMReg(rE) );
            imm8 = (Int)getUChar(delta+1);
            delta += 1+1;
            DIP("vshufpd $%d,%s,%s,%s\n",
                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
            imm8 = (Int)getUChar(delta+alen);
            delta += 1+alen;
            DIP("vshufpd $%d,%s,%s,%s\n",
                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
         }
         IRTemp res = math_SHUFPD_128( eV, vV, imm8 );
         putYMMRegLoAndZU( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VSHUFPD imm8, ymm3/m256, ymm2, ymm1 */
      /* = VEX.NDS.256.66.0F.WIG C6 /r ib */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         Int    imm8 = 0;
         IRTemp eV   = newTemp(Ity_V256);
         IRTemp vV   = newTemp(Ity_V256);
         UInt  modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         UInt  rV    = getVexNvvvv(pfx);
         assign( vV, getYMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getYMMReg(rE) );
            imm8 = (Int)getUChar(delta+1);
            delta += 1+1;
            DIP("vshufpd $%d,%s,%s,%s\n",
                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
            imm8 = (Int)getUChar(delta+alen);
            delta += 1+alen;
            DIP("vshufpd $%d,%s,%s,%s\n",
                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
         }
         IRTemp res = math_SHUFPD_256( eV, vV, imm8 );
         putYMMReg( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   26571 
   case 0xD0:
      /* All four VADDSUB forms delegate to the generic NDS helpers,
         which handle operand fetch, disassembly printing and delta
         update; they also set *uses_vvvv. */
      /* VADDSUBPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D0 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vaddsubpd", math_ADDSUBPD_128 );
         goto decode_success;
      }
      /* VADDSUBPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D0 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vaddsubpd", math_ADDSUBPD_256 );
         goto decode_success;
      }
      /* VADDSUBPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.F2.0F.WIG D0 /r */
      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vaddsubps", math_ADDSUBPS_128 );
         goto decode_success;
      }
      /* VADDSUBPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.F2.0F.WIG D0 /r */
      if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vaddsubps", math_ADDSUBPS_256 );
         goto decode_success;
      }
      break;
   26602 
   case 0xD1:
      /* Logical right shifts; the shift count comes from the low 64
         bits of the E operand (xmm3/m128 even in the 256-bit forms). */
      /* VPSRLW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D1 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
                                        "vpsrlw", Iop_ShrN16x8 );
         *uses_vvvv = True;
         goto decode_success;

      }
      /* VPSRLW xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D1 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
                                        "vpsrlw", Iop_ShrN16x16 );
         *uses_vvvv = True;
         goto decode_success;

      }
      break;

   case 0xD2:
      /* VPSRLD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D2 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
                                        "vpsrld", Iop_ShrN32x4 );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPSRLD xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D2 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
                                        "vpsrld", Iop_ShrN32x8 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0xD3:
      /* VPSRLQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D3 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
                                        "vpsrlq", Iop_ShrN64x2 );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPSRLQ xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D3 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
                                        "vpsrlq", Iop_ShrN64x4 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   26655 
   case 0xD4:
      /* VPADDQ r/m, rV, r ::: r = rV + r/m */
      /* VPADDQ = VEX.NDS.128.66.0F.WIG D4 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpaddq", Iop_Add64x2 );
         goto decode_success;
      }
      /* VPADDQ r/m, rV, r ::: r = rV + r/m */
      /* VPADDQ = VEX.NDS.256.66.0F.WIG D4 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpaddq", Iop_Add64x4 );
         goto decode_success;
      }
      break;

   case 0xD5:
      /* Low halves of the 16x16-bit products. */
      /* VPMULLW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D5 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpmullw", Iop_Mul16x8 );
         goto decode_success;
      }
      /* VPMULLW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D5 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpmullw", Iop_Mul16x16 );
         goto decode_success;
      }
      break;
   26687 
   case 0xD6:
      /* I can't even find any Intel docs for this one. */
      /* Basically: 66 0F D6 = MOVQ -- move 64 bits from G (lo half
         xmm) to E (mem or lo half xmm).  Looks like L==0(128), W==0
         (WIG, maybe?) */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0==getRexW(pfx)/*this might be redundant, dunno*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         if (epartIsReg(modrm)) {
            /* fall through, awaiting test case */
            /* dst: lo half copied, hi half zeroed */
            /* Reg-reg form deliberately undecoded until a real-world
               test case is seen; falling through fails the decode. */
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            storeLE( mkexpr(addr), getXMMRegLane64( rG, 0 ));
            DIP("vmovq %s,%s\n", nameXMMReg(rG), dis_buf );
            delta += alen;
            goto decode_success;
         }
      }
      break;

   case 0xD7:
      /* Build a bitmask in a GPR from the top bit of each byte lane. */
      /* VEX.128.66.0F.WIG D7 /r = VPMOVMSKB xmm1, r32 */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVMSKB_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VEX.256.66.0F.WIG D7 /r = VPMOVMSKB ymm1, r32 */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVMSKB_256( vbi, pfx, delta );
         goto decode_success;
      }
      break;
   26722 
   case 0xD8:
      /* Unsigned saturating subtract, byte lanes. */
      /* VPSUBUSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D8 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpsubusb", Iop_QSub8Ux16 );
         goto decode_success;
      }
      /* VPSUBUSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D8 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpsubusb", Iop_QSub8Ux32 );
         goto decode_success;
      }
      break;

   case 0xD9:
      /* Unsigned saturating subtract, 16-bit lanes. */
      /* VPSUBUSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D9 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpsubusw", Iop_QSub16Ux8 );
         goto decode_success;
      }
      /* VPSUBUSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D9 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpsubusw", Iop_QSub16Ux16 );
         goto decode_success;
      }
      break;

   case 0xDA:
      /* Unsigned minimum, byte lanes. */
      /* VPMINUB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DA /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpminub", Iop_Min8Ux16 );
         goto decode_success;
      }
      /* VPMINUB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DA /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpminub", Iop_Min8Ux32 );
         goto decode_success;
      }
      break;
   26767 
   case 0xDB:
      /* VPAND r/m, rV, r ::: r = rV & r/m */
      /* VEX.NDS.128.66.0F.WIG DB /r = VPAND xmm3/m128, xmm2, xmm1 */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpand", Iop_AndV128 );
         goto decode_success;
      }
      /* VPAND r/m, rV, r ::: r = rV & r/m */
      /* VEX.NDS.256.66.0F.WIG DB /r = VPAND ymm3/m256, ymm2, ymm1 */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpand", Iop_AndV256 );
         goto decode_success;
      }
      break;

   case 0xDC:
      /* Unsigned saturating add, byte lanes. */
      /* VPADDUSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DC /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpaddusb", Iop_QAdd8Ux16 );
         goto decode_success;
      }
      /* VPADDUSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DC /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpaddusb", Iop_QAdd8Ux32 );
         goto decode_success;
      }
      break;

   case 0xDD:
      /* Unsigned saturating add, 16-bit lanes. */
      /* VPADDUSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DD /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpaddusw", Iop_QAdd16Ux8 );
         goto decode_success;
      }
      /* VPADDUSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DD /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpaddusw", Iop_QAdd16Ux16 );
         goto decode_success;
      }
      break;
   26814 
   26815    case 0xDE:
   26816       /* VPMAXUB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DE /r */
   26817       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26818          delta = dis_AVX128_E_V_to_G(
   26819                     uses_vvvv, vbi, pfx, delta, "vpmaxub", Iop_Max8Ux16 );
   26820          goto decode_success;
   26821       }
   26822       /* VPMAXUB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DE /r */
   26823       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26824          delta = dis_AVX256_E_V_to_G(
   26825                     uses_vvvv, vbi, pfx, delta, "vpmaxub", Iop_Max8Ux32 );
   26826          goto decode_success;
   26827       }
   26828       break;
   26829 
   26830    case 0xDF:
      /* VPANDN r/m, rV, r ::: r = ~rV & r/m
         (per Intel SDM: DEST = NOT(SRC1) AND SRC2, SRC1 = vvvv) */
   26832       /* VEX.NDS.128.66.0F.WIG DF /r = VPANDN xmm3/m128, xmm2, xmm1 */
   26833       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26834          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   26835                     uses_vvvv, vbi, pfx, delta, "vpandn", Iop_AndV128,
   26836                     NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
   26837          goto decode_success;
   26838       }
      /* VPANDN r/m, rV, r ::: r = ~rV & r/m
         (per Intel SDM: DEST = NOT(SRC1) AND SRC2, SRC1 = vvvv) */
   26840       /* VEX.NDS.256.66.0F.WIG DF /r = VPANDN ymm3/m256, ymm2, ymm1 */
   26841       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26842          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
   26843                     uses_vvvv, vbi, pfx, delta, "vpandn", Iop_AndV256,
   26844                     NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
   26845          goto decode_success;
   26846       }
   26847       break;
   26848 
   26849    case 0xE0:
   26850       /* VPAVGB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E0 /r */
   26851       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26852          delta = dis_AVX128_E_V_to_G(
   26853                     uses_vvvv, vbi, pfx, delta, "vpavgb", Iop_Avg8Ux16 );
   26854          goto decode_success;
   26855       }
   26856       /* VPAVGB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E0 /r */
   26857       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26858          delta = dis_AVX256_E_V_to_G(
   26859                     uses_vvvv, vbi, pfx, delta, "vpavgb", Iop_Avg8Ux32 );
   26860          goto decode_success;
   26861       }
   26862       break;
   26863 
   26864    case 0xE1:
   26865       /* VPSRAW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E1 /r */
   26866       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26867          delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
   26868                                         "vpsraw", Iop_SarN16x8 );
   26869          *uses_vvvv = True;
   26870          goto decode_success;
   26871       }
   26872       /* VPSRAW xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E1 /r */
   26873       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26874          delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
   26875                                         "vpsraw", Iop_SarN16x16 );
   26876          *uses_vvvv = True;
   26877          goto decode_success;
   26878       }
   26879       break;
   26880 
   26881    case 0xE2:
   26882       /* VPSRAD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E2 /r */
   26883       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26884          delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
   26885                                         "vpsrad", Iop_SarN32x4 );
   26886          *uses_vvvv = True;
   26887          goto decode_success;
   26888       }
   26889       /* VPSRAD xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E2 /r */
   26890       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26891          delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
   26892                                         "vpsrad", Iop_SarN32x8 );
   26893          *uses_vvvv = True;
   26894          goto decode_success;
   26895       }
   26896       break;
   26897 
   26898    case 0xE3:
   26899       /* VPAVGW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E3 /r */
   26900       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26901          delta = dis_AVX128_E_V_to_G(
   26902                     uses_vvvv, vbi, pfx, delta, "vpavgw", Iop_Avg16Ux8 );
   26903          goto decode_success;
   26904       }
   26905       /* VPAVGW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E3 /r */
   26906       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26907          delta = dis_AVX256_E_V_to_G(
   26908                     uses_vvvv, vbi, pfx, delta, "vpavgw", Iop_Avg16Ux16 );
   26909          goto decode_success;
   26910       }
   26911       break;
   26912 
   26913    case 0xE4:
   26914       /* VPMULHUW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E4 /r */
   26915       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26916          delta = dis_AVX128_E_V_to_G(
   26917                     uses_vvvv, vbi, pfx, delta, "vpmulhuw", Iop_MulHi16Ux8 );
   26918          goto decode_success;
   26919       }
   26920       /* VPMULHUW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E4 /r */
   26921       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26922          delta = dis_AVX256_E_V_to_G(
   26923                     uses_vvvv, vbi, pfx, delta, "vpmulhuw", Iop_MulHi16Ux16 );
   26924          goto decode_success;
   26925       }
   26926       break;
   26927 
   26928    case 0xE5:
   26929       /* VPMULHW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E5 /r */
   26930       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26931          delta = dis_AVX128_E_V_to_G(
   26932                     uses_vvvv, vbi, pfx, delta, "vpmulhw", Iop_MulHi16Sx8 );
   26933          goto decode_success;
   26934       }
   26935       /* VPMULHW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E5 /r */
   26936       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26937          delta = dis_AVX256_E_V_to_G(
   26938                     uses_vvvv, vbi, pfx, delta, "vpmulhw", Iop_MulHi16Sx16 );
   26939          goto decode_success;
   26940       }
   26941       break;
   26942 
   26943    case 0xE6:
   26944       /* VCVTDQ2PD xmm2/m64, xmm1 = VEX.128.F3.0F.WIG E6 /r */
   26945       if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
   26946          delta = dis_CVTDQ2PD_128(vbi, pfx, delta, True/*isAvx*/);
   26947          goto decode_success;
   26948       }
   26949       /* VCVTDQ2PD xmm2/m128, ymm1 = VEX.256.F3.0F.WIG E6 /r */
   26950       if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
   26951          delta = dis_CVTDQ2PD_256(vbi, pfx, delta);
   26952          goto decode_success;
   26953       }
   26954       /* VCVTTPD2DQ xmm2/m128, xmm1 = VEX.128.66.0F.WIG E6 /r */
   26955       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26956          delta = dis_CVTxPD2DQ_128(vbi, pfx, delta, True/*isAvx*/,
   26957                                    True/*r2zero*/);
   26958          goto decode_success;
   26959       }
   26960       /* VCVTTPD2DQ ymm2/m256, xmm1 = VEX.256.66.0F.WIG E6 /r */
   26961       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26962          delta = dis_CVTxPD2DQ_256(vbi, pfx, delta, True/*r2zero*/);
   26963          goto decode_success;
   26964       }
   26965       /* VCVTPD2DQ xmm2/m128, xmm1 = VEX.128.F2.0F.WIG E6 /r */
   26966       if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26967          delta = dis_CVTxPD2DQ_128(vbi, pfx, delta, True/*isAvx*/,
   26968                                    False/*!r2zero*/);
   26969          goto decode_success;
   26970       }
   26971       /* VCVTPD2DQ ymm2/m256, xmm1 = VEX.256.F2.0F.WIG E6 /r */
   26972       if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26973          delta = dis_CVTxPD2DQ_256(vbi, pfx, delta, False/*!r2zero*/);
   26974          goto decode_success;
   26975       }
   26976       break;
   26977 
   26978    case 0xE7:
   26979       /* VMOVNTDQ xmm1, m128 = VEX.128.66.0F.WIG E7 /r */
   26980       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26981          UChar modrm = getUChar(delta);
   26982          UInt rG     = gregOfRexRM(pfx,modrm);
   26983          if (!epartIsReg(modrm)) {
   26984             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   26985             gen_SEGV_if_not_16_aligned( addr );
   26986             storeLE( mkexpr(addr), getXMMReg(rG) );
   26987             DIP("vmovntdq %s,%s\n", dis_buf, nameXMMReg(rG));
   26988             delta += alen;
   26989             goto decode_success;
   26990          }
   26991          /* else fall through */
   26992       }
   26993       /* VMOVNTDQ ymm1, m256 = VEX.256.66.0F.WIG E7 /r */
   26994       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26995          UChar modrm = getUChar(delta);
   26996          UInt rG     = gregOfRexRM(pfx,modrm);
   26997          if (!epartIsReg(modrm)) {
   26998             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   26999             gen_SEGV_if_not_32_aligned( addr );
   27000             storeLE( mkexpr(addr), getYMMReg(rG) );
   27001             DIP("vmovntdq %s,%s\n", dis_buf, nameYMMReg(rG));
   27002             delta += alen;
   27003             goto decode_success;
   27004          }
   27005          /* else fall through */
   27006       }
   27007       break;
   27008 
   27009    case 0xE8:
   27010       /* VPSUBSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E8 /r */
   27011       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27012          delta = dis_AVX128_E_V_to_G(
   27013                     uses_vvvv, vbi, pfx, delta, "vpsubsb", Iop_QSub8Sx16 );
   27014          goto decode_success;
   27015       }
   27016       /* VPSUBSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E8 /r */
   27017       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27018          delta = dis_AVX256_E_V_to_G(
   27019                     uses_vvvv, vbi, pfx, delta, "vpsubsb", Iop_QSub8Sx32 );
   27020          goto decode_success;
   27021       }
   27022       break;
   27023 
   27024    case 0xE9:
   27025       /* VPSUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E9 /r */
   27026       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27027          delta = dis_AVX128_E_V_to_G(
   27028                     uses_vvvv, vbi, pfx, delta, "vpsubsw", Iop_QSub16Sx8 );
   27029          goto decode_success;
   27030       }
   27031       /* VPSUBSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E9 /r */
   27032       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27033          delta = dis_AVX256_E_V_to_G(
   27034                     uses_vvvv, vbi, pfx, delta, "vpsubsw", Iop_QSub16Sx16 );
   27035          goto decode_success;
   27036       }
   27037       break;
   27038 
   27039    case 0xEA:
   27040       /* VPMINSW r/m, rV, r ::: r = min-signed16s(rV, r/m) */
   27041       /* VPMINSW = VEX.NDS.128.66.0F.WIG EA /r */
   27042       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27043          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   27044                     uses_vvvv, vbi, pfx, delta, "vpminsw", Iop_Min16Sx8 );
   27045          goto decode_success;
   27046       }
   27047       /* VPMINSW r/m, rV, r ::: r = min-signed16s(rV, r/m) */
   27048       /* VPMINSW = VEX.NDS.256.66.0F.WIG EA /r */
   27049       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27050          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   27051                     uses_vvvv, vbi, pfx, delta, "vpminsw", Iop_Min16Sx16 );
   27052          goto decode_success;
   27053       }
   27054       break;
   27055 
   27056    case 0xEB:
   27057       /* VPOR r/m, rV, r ::: r = rV | r/m */
   27058       /* VPOR = VEX.NDS.128.66.0F.WIG EB /r */
   27059       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27060          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   27061                     uses_vvvv, vbi, pfx, delta, "vpor", Iop_OrV128 );
   27062          goto decode_success;
   27063       }
   27064       /* VPOR r/m, rV, r ::: r = rV | r/m */
   27065       /* VPOR = VEX.NDS.256.66.0F.WIG EB /r */
   27066       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27067          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   27068                     uses_vvvv, vbi, pfx, delta, "vpor", Iop_OrV256 );
   27069          goto decode_success;
   27070       }
   27071       break;
   27072 
   27073    case 0xEC:
   27074       /* VPADDSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG EC /r */
   27075       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27076          delta = dis_AVX128_E_V_to_G(
   27077                     uses_vvvv, vbi, pfx, delta, "vpaddsb", Iop_QAdd8Sx16 );
   27078          goto decode_success;
   27079       }
   27080       /* VPADDSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG EC /r */
   27081       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27082          delta = dis_AVX256_E_V_to_G(
   27083                     uses_vvvv, vbi, pfx, delta, "vpaddsb", Iop_QAdd8Sx32 );
   27084          goto decode_success;
   27085       }
   27086       break;
   27087 
   27088    case 0xED:
   27089       /* VPADDSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG ED /r */
   27090       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27091          delta = dis_AVX128_E_V_to_G(
   27092                     uses_vvvv, vbi, pfx, delta, "vpaddsw", Iop_QAdd16Sx8 );
   27093          goto decode_success;
   27094       }
   27095       /* VPADDSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG ED /r */
   27096       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27097          delta = dis_AVX256_E_V_to_G(
   27098                     uses_vvvv, vbi, pfx, delta, "vpaddsw", Iop_QAdd16Sx16 );
   27099          goto decode_success;
   27100       }
   27101       break;
   27102 
   27103    case 0xEE:
   27104       /* VPMAXSW r/m, rV, r ::: r = max-signed16s(rV, r/m) */
   27105       /* VPMAXSW = VEX.NDS.128.66.0F.WIG EE /r */
   27106       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27107          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   27108                     uses_vvvv, vbi, pfx, delta, "vpmaxsw", Iop_Max16Sx8 );
   27109          goto decode_success;
   27110       }
   27111       /* VPMAXSW r/m, rV, r ::: r = max-signed16s(rV, r/m) */
   27112       /* VPMAXSW = VEX.NDS.256.66.0F.WIG EE /r */
   27113       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27114          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   27115                     uses_vvvv, vbi, pfx, delta, "vpmaxsw", Iop_Max16Sx16 );
   27116          goto decode_success;
   27117       }
   27118       break;
   27119 
   27120    case 0xEF:
   27121       /* VPXOR r/m, rV, r ::: r = rV ^ r/m */
   27122       /* VPXOR = VEX.NDS.128.66.0F.WIG EF /r */
   27123       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27124          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   27125                     uses_vvvv, vbi, pfx, delta, "vpxor", Iop_XorV128 );
   27126          goto decode_success;
   27127       }
   27128       /* VPXOR r/m, rV, r ::: r = rV ^ r/m */
   27129       /* VPXOR = VEX.NDS.256.66.0F.WIG EF /r */
   27130       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27131          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   27132                     uses_vvvv, vbi, pfx, delta, "vpxor", Iop_XorV256 );
   27133          goto decode_success;
   27134       }
   27135       break;
   27136 
   27137    case 0xF0:
   27138       /* VLDDQU m256, ymm1 = VEX.256.F2.0F.WIG F0 /r */
   27139       if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27140          UChar  modrm = getUChar(delta);
   27141          UInt   rD    = gregOfRexRM(pfx, modrm);
   27142          IRTemp tD    = newTemp(Ity_V256);
   27143          if (epartIsReg(modrm)) break;
   27144          addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   27145          delta += alen;
   27146          assign(tD, loadLE(Ity_V256, mkexpr(addr)));
   27147          DIP("vlddqu %s,%s\n", dis_buf, nameYMMReg(rD));
   27148          putYMMReg(rD, mkexpr(tD));
   27149          goto decode_success;
   27150       }
   27151       /* VLDDQU m128, xmm1 = VEX.128.F2.0F.WIG F0 /r */
   27152       if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27153          UChar  modrm = getUChar(delta);
   27154          UInt   rD    = gregOfRexRM(pfx, modrm);
   27155          IRTemp tD    = newTemp(Ity_V128);
   27156          if (epartIsReg(modrm)) break;
   27157          addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   27158          delta += alen;
   27159          assign(tD, loadLE(Ity_V128, mkexpr(addr)));
   27160          DIP("vlddqu %s,%s\n", dis_buf, nameXMMReg(rD));
   27161          putYMMRegLoAndZU(rD, mkexpr(tD));
   27162          goto decode_success;
   27163       }
   27164       break;
   27165 
   27166    case 0xF1:
   27167       /* VPSLLW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F1 /r */
   27168       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27169          delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
   27170                                         "vpsllw", Iop_ShlN16x8 );
   27171          *uses_vvvv = True;
   27172          goto decode_success;
   27173 
   27174       }
   27175       /* VPSLLW xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F1 /r */
   27176       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27177          delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
   27178                                         "vpsllw", Iop_ShlN16x16 );
   27179          *uses_vvvv = True;
   27180          goto decode_success;
   27181 
   27182       }
   27183       break;
   27184 
   27185    case 0xF2:
   27186       /* VPSLLD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F2 /r */
   27187       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27188          delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
   27189                                         "vpslld", Iop_ShlN32x4 );
   27190          *uses_vvvv = True;
   27191          goto decode_success;
   27192       }
   27193       /* VPSLLD xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F2 /r */
   27194       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27195          delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
   27196                                         "vpslld", Iop_ShlN32x8 );
   27197          *uses_vvvv = True;
   27198          goto decode_success;
   27199       }
   27200       break;
   27201 
   27202    case 0xF3:
   27203       /* VPSLLQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F3 /r */
   27204       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27205          delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
   27206                                         "vpsllq", Iop_ShlN64x2 );
   27207          *uses_vvvv = True;
   27208          goto decode_success;
   27209       }
   27210       /* VPSLLQ xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F3 /r */
   27211       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27212          delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
   27213                                         "vpsllq", Iop_ShlN64x4 );
   27214          *uses_vvvv = True;
   27215          goto decode_success;
   27216       }
   27217       break;
   27218 
   27219    case 0xF4:
   27220       /* VPMULUDQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F4 /r */
   27221       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27222          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
   27223                     uses_vvvv, vbi, pfx, delta,
   27224                     "vpmuludq", math_PMULUDQ_128 );
   27225          goto decode_success;
   27226       }
   27227       /* VPMULUDQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F4 /r */
   27228       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27229          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   27230                     uses_vvvv, vbi, pfx, delta,
   27231                     "vpmuludq", math_PMULUDQ_256 );
   27232          goto decode_success;
   27233       }
   27234       break;
   27235 
   27236    case 0xF5:
   27237       /* VPMADDWD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F5 /r */
   27238       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27239          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
   27240                     uses_vvvv, vbi, pfx, delta,
   27241                     "vpmaddwd", math_PMADDWD_128 );
   27242          goto decode_success;
   27243       }
   27244       /* VPMADDWD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F5 /r */
   27245       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27246          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   27247                     uses_vvvv, vbi, pfx, delta,
   27248                     "vpmaddwd", math_PMADDWD_256 );
   27249          goto decode_success;
   27250       }
   27251       break;
   27252 
   27253    case 0xF6:
   27254       /* VPSADBW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F6 /r */
   27255       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27256          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
   27257                     uses_vvvv, vbi, pfx, delta,
   27258                     "vpsadbw", math_PSADBW_128 );
   27259          goto decode_success;
   27260       }
   27261       /* VPSADBW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F6 /r */
   27262       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27263          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   27264                     uses_vvvv, vbi, pfx, delta,
   27265                     "vpsadbw", math_PSADBW_256 );
   27266          goto decode_success;
   27267       }
   27268       break;
   27269 
   27270    case 0xF7:
   27271       /* VMASKMOVDQU xmm2, xmm1 = VEX.128.66.0F.WIG F7 /r */
   27272       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
   27273           && epartIsReg(getUChar(delta))) {
   27274          delta = dis_MASKMOVDQU( vbi, pfx, delta, True/*isAvx*/ );
   27275          goto decode_success;
   27276       }
   27277       break;
   27278 
   27279    case 0xF8:
   27280       /* VPSUBB r/m, rV, r ::: r = rV - r/m */
   27281       /* VPSUBB = VEX.NDS.128.66.0F.WIG F8 /r */
   27282       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27283          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   27284                     uses_vvvv, vbi, pfx, delta, "vpsubb", Iop_Sub8x16 );
   27285          goto decode_success;
   27286       }
   27287       /* VPSUBB r/m, rV, r ::: r = rV - r/m */
   27288       /* VPSUBB = VEX.NDS.256.66.0F.WIG F8 /r */
   27289       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27290          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   27291                     uses_vvvv, vbi, pfx, delta, "vpsubb", Iop_Sub8x32 );
   27292          goto decode_success;
   27293       }
   27294       break;
   27295 
   27296    case 0xF9:
   27297       /* VPSUBW r/m, rV, r ::: r = rV - r/m */
   27298       /* VPSUBW = VEX.NDS.128.66.0F.WIG F9 /r */
   27299       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27300          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   27301                     uses_vvvv, vbi, pfx, delta, "vpsubw", Iop_Sub16x8 );
   27302          goto decode_success;
   27303       }
   27304       /* VPSUBW r/m, rV, r ::: r = rV - r/m */
   27305       /* VPSUBW = VEX.NDS.256.66.0F.WIG F9 /r */
   27306       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27307          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   27308                     uses_vvvv, vbi, pfx, delta, "vpsubw", Iop_Sub16x16 );
   27309          goto decode_success;
   27310       }
   27311       break;
   27312 
   27313    case 0xFA:
   27314       /* VPSUBD r/m, rV, r ::: r = rV - r/m */
   27315       /* VPSUBD = VEX.NDS.128.66.0F.WIG FA /r */
   27316       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27317          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   27318                     uses_vvvv, vbi, pfx, delta, "vpsubd", Iop_Sub32x4 );
   27319          goto decode_success;
   27320       }
   27321       /* VPSUBD r/m, rV, r ::: r = rV - r/m */
   27322       /* VPSUBD = VEX.NDS.256.66.0F.WIG FA /r */
   27323       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27324          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   27325                     uses_vvvv, vbi, pfx, delta, "vpsubd", Iop_Sub32x8 );
   27326          goto decode_success;
   27327       }
   27328       break;
   27329 
   27330    case 0xFB:
   27331       /* VPSUBQ r/m, rV, r ::: r = rV - r/m */
   27332       /* VPSUBQ = VEX.NDS.128.66.0F.WIG FB /r */
   27333       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27334          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   27335                     uses_vvvv, vbi, pfx, delta, "vpsubq", Iop_Sub64x2 );
   27336          goto decode_success;
   27337       }
   27338       /* VPSUBQ r/m, rV, r ::: r = rV - r/m */
   27339       /* VPSUBQ = VEX.NDS.256.66.0F.WIG FB /r */
   27340       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27341          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   27342                     uses_vvvv, vbi, pfx, delta, "vpsubq", Iop_Sub64x4 );
   27343          goto decode_success;
   27344       }
   27345       break;
   27346 
   27347    case 0xFC:
   27348       /* VPADDB r/m, rV, r ::: r = rV + r/m */
   27349       /* VPADDB = VEX.NDS.128.66.0F.WIG FC /r */
   27350       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27351          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   27352                     uses_vvvv, vbi, pfx, delta, "vpaddb", Iop_Add8x16 );
   27353          goto decode_success;
   27354       }
   27355       /* VPADDB r/m, rV, r ::: r = rV + r/m */
   27356       /* VPADDB = VEX.NDS.256.66.0F.WIG FC /r */
   27357       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27358          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   27359                     uses_vvvv, vbi, pfx, delta, "vpaddb", Iop_Add8x32 );
   27360          goto decode_success;
   27361       }
   27362       break;
   27363 
   27364    case 0xFD:
   27365       /* VPADDW r/m, rV, r ::: r = rV + r/m */
   27366       /* VPADDW = VEX.NDS.128.66.0F.WIG FD /r */
   27367       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27368          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   27369                     uses_vvvv, vbi, pfx, delta, "vpaddw", Iop_Add16x8 );
   27370          goto decode_success;
   27371       }
   27372       /* VPADDW r/m, rV, r ::: r = rV + r/m */
   27373       /* VPADDW = VEX.NDS.256.66.0F.WIG FD /r */
   27374       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27375          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   27376                     uses_vvvv, vbi, pfx, delta, "vpaddw", Iop_Add16x16 );
   27377          goto decode_success;
   27378       }
   27379       break;
   27380 
   27381    case 0xFE:
   27382       /* VPADDD r/m, rV, r ::: r = rV + r/m */
   27383       /* VPADDD = VEX.NDS.128.66.0F.WIG FE /r */
   27384       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27385          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   27386                     uses_vvvv, vbi, pfx, delta, "vpaddd", Iop_Add32x4 );
   27387          goto decode_success;
   27388       }
   27389       /* VPADDD r/m, rV, r ::: r = rV + r/m */
   27390       /* VPADDD = VEX.NDS.256.66.0F.WIG FE /r */
   27391       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27392          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   27393                     uses_vvvv, vbi, pfx, delta, "vpaddd", Iop_Add32x8 );
   27394          goto decode_success;
   27395       }
   27396       break;
   27397 
   27398    default:
   27399       break;
   27400 
   27401    }
   27402 
   27403   //decode_failure:
   27404    return deltaIN;
   27405 
   27406   decode_success:
   27407    return delta;
   27408 }
   27409 
   27410 
   27411 /*------------------------------------------------------------*/
   27412 /*---                                                      ---*/
   27413 /*--- Top-level post-escape decoders: dis_ESC_0F38__VEX    ---*/
   27414 /*---                                                      ---*/
   27415 /*------------------------------------------------------------*/
   27416 
   27417 static IRTemp math_PERMILPS_VAR_128 ( IRTemp dataV, IRTemp ctrlV )
   27418 {
   27419    /* In the control vector, zero out all but the bottom two bits of
   27420       each 32-bit lane. */
   27421    IRExpr* cv1 = binop(Iop_ShrN32x4,
   27422                        binop(Iop_ShlN32x4, mkexpr(ctrlV), mkU8(30)),
   27423                        mkU8(30));
   27424    /* And use the resulting cleaned-up control vector as steering
   27425       in a Perm operation. */
   27426    IRTemp res = newTemp(Ity_V128);
   27427    assign(res, binop(Iop_Perm32x4, mkexpr(dataV), cv1));
   27428    return res;
   27429 }
   27430 
   27431 static IRTemp math_PERMILPS_VAR_256 ( IRTemp dataV, IRTemp ctrlV )
   27432 {
   27433    IRTemp dHi, dLo, cHi, cLo;
   27434    dHi = dLo = cHi = cLo = IRTemp_INVALID;
   27435    breakupV256toV128s( dataV, &dHi, &dLo );
   27436    breakupV256toV128s( ctrlV, &cHi, &cLo );
   27437    IRTemp rHi = math_PERMILPS_VAR_128( dHi, cHi );
   27438    IRTemp rLo = math_PERMILPS_VAR_128( dLo, cLo );
   27439    IRTemp res = newTemp(Ity_V256);
   27440    assign(res, binop(Iop_V128HLtoV256, mkexpr(rHi), mkexpr(rLo)));
   27441    return res;
   27442 }
   27443 
   27444 static IRTemp math_PERMILPD_VAR_128 ( IRTemp dataV, IRTemp ctrlV )
   27445 {
   27446    /* No cleverness here .. */
   27447    IRTemp dHi, dLo, cHi, cLo;
   27448    dHi = dLo = cHi = cLo = IRTemp_INVALID;
   27449    breakupV128to64s( dataV, &dHi, &dLo );
   27450    breakupV128to64s( ctrlV, &cHi, &cLo );
   27451    IRExpr* rHi
   27452       = IRExpr_ITE( unop(Iop_64to1,
   27453                          binop(Iop_Shr64, mkexpr(cHi), mkU8(1))),
   27454                     mkexpr(dHi), mkexpr(dLo) );
   27455    IRExpr* rLo
   27456       = IRExpr_ITE( unop(Iop_64to1,
   27457                          binop(Iop_Shr64, mkexpr(cLo), mkU8(1))),
   27458                     mkexpr(dHi), mkexpr(dLo) );
   27459    IRTemp res = newTemp(Ity_V128);
   27460    assign(res, binop(Iop_64HLtoV128, rHi, rLo));
   27461    return res;
   27462 }
   27463 
   27464 static IRTemp math_PERMILPD_VAR_256 ( IRTemp dataV, IRTemp ctrlV )
   27465 {
   27466    IRTemp dHi, dLo, cHi, cLo;
   27467    dHi = dLo = cHi = cLo = IRTemp_INVALID;
   27468    breakupV256toV128s( dataV, &dHi, &dLo );
   27469    breakupV256toV128s( ctrlV, &cHi, &cLo );
   27470    IRTemp rHi = math_PERMILPD_VAR_128( dHi, cHi );
   27471    IRTemp rLo = math_PERMILPD_VAR_128( dLo, cLo );
   27472    IRTemp res = newTemp(Ity_V256);
   27473    assign(res, binop(Iop_V128HLtoV256, mkexpr(rHi), mkexpr(rLo)));
   27474    return res;
   27475 }
   27476 
   27477 static IRTemp math_VPERMD ( IRTemp ctrlV, IRTemp dataV )
   27478 {
   27479    /* In the control vector, zero out all but the bottom three bits of
   27480       each 32-bit lane. */
   27481    IRExpr* cv1 = binop(Iop_ShrN32x8,
   27482                        binop(Iop_ShlN32x8, mkexpr(ctrlV), mkU8(29)),
   27483                        mkU8(29));
   27484    /* And use the resulting cleaned-up control vector as steering
   27485       in a Perm operation. */
   27486    IRTemp res = newTemp(Ity_V256);
   27487    assign(res, binop(Iop_Perm32x8, mkexpr(dataV), cv1));
   27488    return res;
   27489 }
   27490 
   27491 static Long dis_SHIFTX ( /*OUT*/Bool* uses_vvvv,
   27492                          const VexAbiInfo* vbi, Prefix pfx, Long delta,
   27493                          const HChar* opname, IROp op8 )
   27494 {
   27495    HChar   dis_buf[50];
   27496    Int     alen;
   27497    Int     size = getRexW(pfx) ? 8 : 4;
   27498    IRType  ty   = szToITy(size);
   27499    IRTemp  src  = newTemp(ty);
   27500    IRTemp  amt  = newTemp(ty);
   27501    UChar   rm   = getUChar(delta);
   27502 
   27503    assign( amt, getIRegV(size,pfx) );
   27504    if (epartIsReg(rm)) {
   27505       assign( src, getIRegE(size,pfx,rm) );
   27506       DIP("%s %s,%s,%s\n", opname, nameIRegV(size,pfx),
   27507                            nameIRegE(size,pfx,rm), nameIRegG(size,pfx,rm));
   27508       delta++;
   27509    } else {
   27510       IRTemp addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   27511       assign( src, loadLE(ty, mkexpr(addr)) );
   27512       DIP("%s %s,%s,%s\n", opname, nameIRegV(size,pfx), dis_buf,
   27513                            nameIRegG(size,pfx,rm));
   27514       delta += alen;
   27515    }
   27516 
   27517    putIRegG( size, pfx, rm,
   27518              binop(mkSizedOp(ty,op8), mkexpr(src),
   27519                    narrowTo(Ity_I8, binop(mkSizedOp(ty,Iop_And8), mkexpr(amt),
   27520                                           mkU(ty,8*size-1)))) );
   27521    /* Flags aren't modified.  */
   27522    *uses_vvvv = True;
   27523    return delta;
   27524 }
   27525 
   27526 
/* Translate the VEX-encoded FMA3 family (VFMADD/VFMSUB/VFNMADD/
   VFNMSUB/VFMADDSUB/VFMSUBADD, 132/213/231 forms, scalar and packed,
   F32 and F64, 128- and 256-bit).  |opc| is the opcode byte: its low
   nibble selects the operation variant and its high nibble the
   operand ordering.  Returns the updated instruction offset. */
static Long dis_FMA ( const VexAbiInfo* vbi, Prefix pfx, Long delta, UChar opc )
{
   UChar  modrm   = getUChar(delta);
   UInt   rG      = gregOfRexRM(pfx, modrm);
   UInt   rV      = getVexNvvvv(pfx);
   /* Scalar forms are the odd opcodes in the 0x99..0xBF range
      (low nibble > 7 and odd). */
   Bool   scalar  = (opc & 0xF) > 7 && (opc & 1);
   IRType ty      = getRexW(pfx) ? Ity_F64 : Ity_F32;
   IRType vty     = scalar ? ty : getVexL(pfx) ? Ity_V256 : Ity_V128;
   IRTemp vX      = newTemp(vty);
   IRTemp vY      = newTemp(vty);
   IRTemp vZ      = newTemp(vty);
   /* Per-lane operand expressions; at most 8 lanes (V256 of F32). */
   IRExpr *x[8], *y[8], *z[8];
   IRTemp addr    = IRTemp_INVALID;
   HChar  dis_buf[50];
   Int    alen    = 0;
   const HChar *name;
   const HChar *suffix;
   const HChar *order;
   Bool   negateRes   = False;
   Bool   negateZeven = False;
   Bool   negateZodd  = False;
   Int    i, j;
   Int    count;
   /* 64-bit slicing ops: [0..3] pull the four quads of a V256,
      [4..5] the two quads of a V128; |j| below indexes into this. */
   static IROp ops[] = { Iop_V256to64_0, Iop_V256to64_1,
                         Iop_V256to64_2, Iop_V256to64_3,
                         Iop_V128to64, Iop_V128HIto64 };

   /* Decode the operation variant from the low nibble.  Subtraction
      is implemented by negating z lanes; the "n" (negate-result)
      variants additionally negate the final product-sum. */
   switch (opc & 0xF) {
   case 0x6:
      /* VFMADDSUB: even lanes subtract z, odd lanes add z. */
      name = "addsub";
      negateZeven = True;
      break;
   case 0x7:
      /* VFMSUBADD: even lanes add z, odd lanes subtract z. */
      name = "subadd";
      negateZodd = True;
      break;
   case 0x8:
   case 0x9:
      name = "add";
      break;
   case 0xA:
   case 0xB:
      name = "sub";
      negateZeven = True;
      negateZodd = True;
      break;
   case 0xC:
   case 0xD:
      /* VFNMADD: -(x*y) + z == -(x*y - z). */
      name = "add";
      negateRes = True;
      negateZeven = True;
      negateZodd = True;
      break;
   case 0xE:
   case 0xF:
      /* VFNMSUB: -(x*y) - z == -(x*y + z). */
      name = "sub";
      negateRes = True;
      break;
   default:
      vpanic("dis_FMA(amd64)");
      break;
   }
   /* High nibble selects the operand ordering (132/213/231). */
   switch (opc & 0xF0) {
   case 0x90: order = "132"; break;
   case 0xA0: order = "213"; break;
   case 0xB0: order = "231"; break;
   default: vpanic("dis_FMA(amd64)"); break;
   }
   if (scalar)
      suffix = ty == Ity_F64 ? "sd" : "ss";
   else
      suffix = ty == Ity_F64 ? "pd" : "ps";

   /* Fetch the two register operands: X = dst (G), Z = vvvv (V). */
   if (scalar) {
      assign( vX, ty == Ity_F64
                  ? getXMMRegLane64F(rG, 0) : getXMMRegLane32F(rG, 0) );
      assign( vZ, ty == Ity_F64
                  ? getXMMRegLane64F(rV, 0) : getXMMRegLane32F(rV, 0) );
   } else {
      assign( vX, vty == Ity_V256 ? getYMMReg(rG) : getXMMReg(rG) );
      assign( vZ, vty == Ity_V256 ? getYMMReg(rV) : getXMMReg(rV) );
   }

   /* Y is the r/m operand (register or memory). */
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx, modrm);
      delta += 1;
      if (scalar)
         assign( vY, ty == Ity_F64
                     ? getXMMRegLane64F(rE, 0) : getXMMRegLane32F(rE, 0) );
      else
         assign( vY, vty == Ity_V256 ? getYMMReg(rE) : getXMMReg(rE) );
      if (vty == Ity_V256) {
         DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "",
             name, order, suffix, nameYMMReg(rE), nameYMMReg(rV),
             nameYMMReg(rG));
      } else {
         DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "",
             name, order, suffix, nameXMMReg(rE), nameXMMReg(rV),
             nameXMMReg(rG));
      }
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      delta += alen;
      assign(vY, loadLE(vty, mkexpr(addr)));
      if (vty == Ity_V256) {
         DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "",
             name, order, suffix, dis_buf, nameYMMReg(rV),
             nameYMMReg(rG));
      } else {
         DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "",
             name, order, suffix, dis_buf, nameXMMReg(rV),
             nameXMMReg(rG));
      }
   }

   /* vX/vY/vZ now in 132 order.  If it is different order, swap the
      arguments.  After this, the result is always computed as
      vX * vY + (+/-)vZ:
        213: dst = V*dst + E   231: dst = V*E + dst  */
   if ((opc & 0xF0) != 0x90) {
      IRTemp tem = vX;
      if ((opc & 0xF0) == 0xA0) {
         vX = vZ;
         vZ = vY;
         vY = tem;
      } else {
         vX = vZ;
         vZ = tem;
      }
   }

   /* Break the operands into per-lane 32- or 64-bit expressions. */
   if (scalar) {
      count = 1;
      x[0] = mkexpr(vX);
      y[0] = mkexpr(vY);
      z[0] = mkexpr(vZ);
   } else if (ty == Ity_F32) {
      /* F32 lanes: slice out 64-bit chunks, then split each into its
         low and high 32-bit halves.  j offsets into ops[] to select
         the V256 (j==0) or V128 (j==4) slicing ops. */
      count = vty == Ity_V256 ? 8 : 4;
      j = vty == Ity_V256 ? 0 : 4;
      for (i = 0; i < count; i += 2) {
         IRTemp tem = newTemp(Ity_I64);
         assign(tem, unop(ops[i / 2 + j], mkexpr(vX)));
         x[i] = unop(Iop_64to32, mkexpr(tem));
         x[i + 1] = unop(Iop_64HIto32, mkexpr(tem));
         tem = newTemp(Ity_I64);
         assign(tem, unop(ops[i / 2 + j], mkexpr(vY)));
         y[i] = unop(Iop_64to32, mkexpr(tem));
         y[i + 1] = unop(Iop_64HIto32, mkexpr(tem));
         tem = newTemp(Ity_I64);
         assign(tem, unop(ops[i / 2 + j], mkexpr(vZ)));
         z[i] = unop(Iop_64to32, mkexpr(tem));
         z[i + 1] = unop(Iop_64HIto32, mkexpr(tem));
      }
   } else {
      /* F64 lanes: one 64-bit slice per lane. */
      count = vty == Ity_V256 ? 4 : 2;
      j = vty == Ity_V256 ? 0 : 4;
      for (i = 0; i < count; i++) {
         x[i] = unop(ops[i + j], mkexpr(vX));
         y[i] = unop(ops[i + j], mkexpr(vY));
         z[i] = unop(ops[i + j], mkexpr(vZ));
      }
   }
   /* Packed lanes were sliced as integers; reinterpret as floats.
      (Scalar lanes were fetched as F32/F64 directly.) */
   if (!scalar)
      for (i = 0; i < count; i++) {
         IROp op = ty == Ity_F64
                   ? Iop_ReinterpI64asF64 : Iop_ReinterpI32asF32;
         x[i] = unop(op, x[i]);
         y[i] = unop(op, y[i]);
         z[i] = unop(op, z[i]);
      }
   /* Per lane: optionally negate z, fuse-multiply-add, optionally
      negate the result, and write the lane back to the dst register.
      Note the rounding mode is a fake (default) one, not MXCSR.RC. */
   for (i = 0; i < count; i++) {
      if ((i & 1) ? negateZodd : negateZeven)
         z[i] = unop(ty == Ity_F64 ? Iop_NegF64 : Iop_NegF32, z[i]);
      x[i] = IRExpr_Qop(ty == Ity_F64 ? Iop_MAddF64 : Iop_MAddF32,
                        get_FAKE_roundingmode(), x[i], y[i], z[i]);
      if (negateRes)
         x[i] = unop(ty == Ity_F64 ? Iop_NegF64 : Iop_NegF32, x[i]);
      if (ty == Ity_F64)
         putYMMRegLane64F( rG, i, x[i] );
      else
         putYMMRegLane32F( rG, i, x[i] );
   }
   /* VEX.128 (and scalar) forms zero the upper 128 bits of the dst. */
   if (vty != Ity_V256)
      putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}
   27712 
   27713 
   27714 /* Masked load or masked store. */
/* Translate the VMASKMOVPS/VMASKMOVPD/VPMASKMOVD/VPMASKMOVQ family.
   A lane is transferred iff the most significant bit of the
   corresponding lane of the mask register (VEX.vvvv) is set.
   |ty| is the lane type (Ity_I32 or Ity_I64), |isYMM| selects
   256- vs 128-bit width and |isLoad| the direction (memory->reg vs
   reg->memory).  Returns the updated instruction offset. */
static ULong dis_VMASKMOV ( Bool *uses_vvvv, const VexAbiInfo* vbi,
                            Prefix pfx, Long delta,
                            const HChar* opname, Bool isYMM, IRType ty,
                            Bool isLoad )
{
   HChar   dis_buf[50];
   Int     alen, i;
   IRTemp  addr;
   UChar   modrm = getUChar(delta);
   UInt    rG    = gregOfRexRM(pfx,modrm);
   UInt    rV    = getVexNvvvv(pfx);

   /* The r/m operand is always memory for these insns. */
   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   delta += alen;

   /**/ if (isLoad && isYMM) {
      DIP("%s %s,%s,%s\n", opname, dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
   }
   else if (isLoad && !isYMM) {
      DIP("%s %s,%s,%s\n", opname, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
   }

   else if (!isLoad && isYMM) {
      DIP("%s %s,%s,%s\n", opname, nameYMMReg(rG), nameYMMReg(rV), dis_buf );
   }
   else {
      vassert(!isLoad && !isYMM);
      DIP("%s %s,%s,%s\n", opname, nameXMMReg(rG), nameXMMReg(rV), dis_buf );
   }

   vassert(ty == Ity_I32 || ty == Ity_I64);
   Bool laneIs32 = ty == Ity_I32;

   /* Number of lanes to process: width / lane size. */
   Int nLanes = (isYMM ? 2 : 1) * (laneIs32 ? 4 : 2);

   for (i = 0; i < nLanes; i++) {
      IRExpr* shAmt = laneIs32 ? mkU8(31)    : mkU8(63);
      IRExpr* one   = laneIs32 ? mkU32(1)    : mkU64(1);
      IROp    opSHR = laneIs32 ? Iop_Shr32   : Iop_Shr64;
      IROp    opEQ  = laneIs32 ? Iop_CmpEQ32 : Iop_CmpEQ64;
      IRExpr* lane  = (laneIs32 ? getYMMRegLane32 : getYMMRegLane64)( rV, i );

      /* cond <- MSB of the mask lane ((lane >> (width-1)) == 1). */
      IRTemp  cond = newTemp(Ity_I1);
      assign(cond, binop(opEQ, binop(opSHR, lane, shAmt), one));

      IRTemp  data = newTemp(ty);
      /* Effective address of this lane within the memory operand. */
      IRExpr* ea   = binop(Iop_Add64, mkexpr(addr),
                                      mkU64(i * (laneIs32 ? 4 : 8)));
      if (isLoad) {
         /* Guarded load: if cond, load the lane; otherwise use the
            alternative value 0, so unselected dst lanes are zeroed. */
         stmt(
            IRStmt_LoadG(
               Iend_LE, laneIs32 ? ILGop_Ident32 : ILGop_Ident64,
               data, ea, laneIs32 ? mkU32(0) : mkU64(0), mkexpr(cond)
         ));
         (laneIs32 ? putYMMRegLane32 : putYMMRegLane64)( rG, i, mkexpr(data) );
      } else {
         /* Guarded store: write the source lane only if cond. */
         assign(data, (laneIs32 ? getYMMRegLane32 : getYMMRegLane64)( rG, i ));
         stmt( IRStmt_StoreG(Iend_LE, ea, mkexpr(data), mkexpr(cond)) );
      }
   }

   /* 128-bit loads zero the upper half of the dst YMM register. */
   if (isLoad && !isYMM)
      putYMMRegLane128( rG, 1, mkV128(0) );

   *uses_vvvv = True;
   return delta;
}
   27782 
   27783 
   27784 /* Gather.  */
/* Translate the AVX2 VGATHER family (VPGATHERDD/QD/DQ/QQ,
   VGATHERDPS/QPS/DPD/QPD).  |isVM64x| says whether the index vector
   holds 64-bit (VSIB.Q) or 32-bit (VSIB.D) indices; |ty| is the
   element type (Ity_I32 or Ity_I64); |isYMM| selects vector width.
   Returns the updated offset; returns |delta| unchanged to signal a
   decode failure. */
static ULong dis_VGATHER ( Bool *uses_vvvv, const VexAbiInfo* vbi,
                           Prefix pfx, Long delta,
                           const HChar* opname, Bool isYMM,
                           Bool isVM64x, IRType ty )
{
   HChar  dis_buf[50];
   Int    alen, i, vscale, count1, count2;
   IRTemp addr;
   UChar  modrm = getUChar(delta);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   UInt   rV    = getVexNvvvv(pfx);
   UInt   rI;
   /* dst/index widths depend on the element-size/index-size combo:
      e.g. 32-bit elements with 64-bit indices only fill half the
      destination lanes. */
   IRType dstTy = (isYMM && (ty == Ity_I64 || !isVM64x)) ? Ity_V256 : Ity_V128;
   IRType idxTy = (isYMM && (ty == Ity_I32 || isVM64x)) ? Ity_V256 : Ity_V128;
   IRTemp cond;
   addr = disAVSIBMode ( &alen, vbi, pfx, delta, dis_buf, &rI,
                         idxTy, &vscale );
   /* Reject malformed addressing, and the cases where dst, mask and
      index registers are not all distinct (invalid per the ISA). */
   if (addr == IRTemp_INVALID || rI == rG || rI == rV || rG == rV)
      return delta;
   if (dstTy == Ity_V256) {
      DIP("%s %s,%s,%s\n", opname, nameYMMReg(rV), dis_buf, nameYMMReg(rG) );
   } else {
      DIP("%s %s,%s,%s\n", opname, nameXMMReg(rV), dis_buf, nameXMMReg(rG) );
   }
   delta += alen;

   /* count1 = number of mask elements to process;
      count2 = number of elements actually gathered (half of count1
      for 32-bit data with 64-bit indices). */
   if (ty == Ity_I32) {
      count1 = isYMM ? 8 : 4;
      count2 = isVM64x ? count1 / 2 : count1;
   } else {
      count1 = count2 = isYMM ? 4 : 2;
   }

   /* First update the mask register to copies of the sign bit.  Only
      the MSB of each mask element is significant, so replicate it
      across the whole element before gathering. */
   if (ty == Ity_I32) {
      if (isYMM)
         putYMMReg( rV, binop(Iop_SarN32x8, getYMMReg( rV ), mkU8(31)) );
      else
         putYMMRegLoAndZU( rV, binop(Iop_SarN32x4, getXMMReg( rV ), mkU8(31)) );
   } else {
      for (i = 0; i < count1; i++) {
         putYMMRegLane64( rV, i, binop(Iop_Sar64, getYMMRegLane64( rV, i ),
                                       mkU8(63)) );
      }
   }

   /* Next gather the individual elements.  If any fault occurs, the
      corresponding mask element will be set and the loop stops.  */
   for (i = 0; i < count2; i++) {
      IRExpr *expr, *addr_expr;
      /* cond <- mask element is negative, i.e. its MSB was set. */
      cond = newTemp(Ity_I1);
      assign( cond,
              binop(ty == Ity_I32 ? Iop_CmpLT32S : Iop_CmpLT64S,
                    ty == Ity_I32 ? getYMMRegLane32( rV, i )
                                  : getYMMRegLane64( rV, i ),
                    mkU(ty, 0)) );
      /* Current dst lane value, kept if the element is masked off. */
      expr = ty == Ity_I32 ? getYMMRegLane32( rG, i )
                           : getYMMRegLane64( rG, i );
      /* Build the element address: sign-extended index, scaled, plus
         the base from the VSIB mode. */
      addr_expr = isVM64x ? getYMMRegLane64( rI, i )
                          : unop(Iop_32Sto64, getYMMRegLane32( rI, i ));
      switch (vscale) {
         case 2: addr_expr = binop(Iop_Shl64, addr_expr, mkU8(1)); break;
         case 4: addr_expr = binop(Iop_Shl64, addr_expr, mkU8(2)); break;
         case 8: addr_expr = binop(Iop_Shl64, addr_expr, mkU8(3)); break;
         default: break;
      }
      addr_expr = binop(Iop_Add64, mkexpr(addr), addr_expr);
      addr_expr = handleAddrOverrides(vbi, pfx, addr_expr);
      /* The loadLE below is unconditional, so for inactive elements
         substitute RSP as a (presumably mapped) dummy address to
         avoid spurious faults; the ITE after it discards the value. */
      addr_expr = IRExpr_ITE(mkexpr(cond), addr_expr, getIReg64(R_RSP));
      expr = IRExpr_ITE(mkexpr(cond), loadLE(ty, addr_expr), expr);
      /* Write the gathered (or retained) value and clear the mask
         element, mirroring the architectural per-element progress. */
      if (ty == Ity_I32) {
         putYMMRegLane32( rG, i, expr );
         putYMMRegLane32( rV, i, mkU32(0) );
      } else {
         putYMMRegLane64( rG, i, expr);
         putYMMRegLane64( rV, i, mkU64(0) );
      }
   }

   /* Zero whatever upper portions of dst and mask were not written by
      the loop above (128-bit forms, and the half-filled
      32-bit-data/64-bit-index combinations). */
   if (!isYMM || (ty == Ity_I32 && isVM64x)) {
      if (ty == Ity_I64 || isYMM)
         putYMMRegLane128( rV, 1, mkV128(0) );
      else if (ty == Ity_I32 && count2 == 2) {
         putYMMRegLane64( rV, 1, mkU64(0) );
         putYMMRegLane64( rG, 1, mkU64(0) );
      }
      putYMMRegLane128( rG, 1, mkV128(0) );
   }

   *uses_vvvv = True;
   return delta;
}
   27877 
   27878 
   27879 __attribute__((noinline))
   27880 static
   27881 Long dis_ESC_0F38__VEX (
   27882         /*MB_OUT*/DisResult* dres,
   27883         /*OUT*/   Bool*      uses_vvvv,
   27884         Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
   27885         Bool         resteerCisOk,
   27886         void*        callback_opaque,
   27887         const VexArchInfo* archinfo,
   27888         const VexAbiInfo*  vbi,
   27889         Prefix pfx, Int sz, Long deltaIN
   27890      )
   27891 {
   27892    IRTemp addr  = IRTemp_INVALID;
   27893    Int    alen  = 0;
   27894    HChar  dis_buf[50];
   27895    Long   delta = deltaIN;
   27896    UChar  opc   = getUChar(delta);
   27897    delta++;
   27898    *uses_vvvv = False;
   27899 
   27900    switch (opc) {
   27901 
   27902    case 0x00:
   27903       /* VPSHUFB r/m, rV, r ::: r = shuf(rV, r/m) */
   27904       /* VPSHUFB = VEX.NDS.128.66.0F38.WIG 00 /r */
   27905       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27906          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
   27907                     uses_vvvv, vbi, pfx, delta, "vpshufb", math_PSHUFB_XMM );
   27908          goto decode_success;
   27909       }
   27910       /* VPSHUFB r/m, rV, r ::: r = shuf(rV, r/m) */
   27911       /* VPSHUFB = VEX.NDS.256.66.0F38.WIG 00 /r */
   27912       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27913          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   27914                     uses_vvvv, vbi, pfx, delta, "vpshufb", math_PSHUFB_YMM );
   27915          goto decode_success;
   27916       }
   27917       break;
   27918 
   27919    case 0x01:
   27920    case 0x02:
   27921    case 0x03:
   27922       /* VPHADDW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 01 /r */
   27923       /* VPHADDD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 02 /r */
   27924       /* VPHADDSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 03 /r */
   27925       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27926          delta = dis_PHADD_128( vbi, pfx, delta, True/*isAvx*/, opc );
   27927          *uses_vvvv = True;
   27928          goto decode_success;
   27929       }
   27930       /* VPHADDW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 01 /r */
   27931       /* VPHADDD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 02 /r */
   27932       /* VPHADDSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 03 /r */
   27933       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27934          delta = dis_PHADD_256( vbi, pfx, delta, opc );
   27935          *uses_vvvv = True;
   27936          goto decode_success;
   27937       }
   27938       break;
   27939 
   27940    case 0x04:
   27941       /* VPMADDUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 04 /r */
   27942       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27943          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
   27944                     uses_vvvv, vbi, pfx, delta, "vpmaddubsw",
   27945                     math_PMADDUBSW_128 );
   27946          goto decode_success;
   27947       }
   27948       /* VPMADDUBSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 04 /r */
   27949       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27950          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   27951                     uses_vvvv, vbi, pfx, delta, "vpmaddubsw",
   27952                     math_PMADDUBSW_256 );
   27953          goto decode_success;
   27954       }
   27955       break;
   27956 
   27957    case 0x05:
   27958    case 0x06:
   27959    case 0x07:
   27960       /* VPHSUBW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 05 /r */
   27961       /* VPHSUBD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 06 /r */
   27962       /* VPHSUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 07 /r */
   27963       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27964          delta = dis_PHADD_128( vbi, pfx, delta, True/*isAvx*/, opc );
   27965          *uses_vvvv = True;
   27966          goto decode_success;
   27967       }
   27968       /* VPHSUBW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 05 /r */
   27969       /* VPHSUBD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 06 /r */
   27970       /* VPHSUBSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 07 /r */
   27971       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27972          delta = dis_PHADD_256( vbi, pfx, delta, opc );
   27973          *uses_vvvv = True;
   27974          goto decode_success;
   27975       }
   27976       break;
   27977 
   27978    case 0x08:
   27979    case 0x09:
   27980    case 0x0A:
   27981       /* VPSIGNB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 08 /r */
   27982       /* VPSIGNW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 09 /r */
   27983       /* VPSIGND xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 0A /r */
   27984       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27985          IRTemp sV      = newTemp(Ity_V128);
   27986          IRTemp dV      = newTemp(Ity_V128);
   27987          IRTemp sHi, sLo, dHi, dLo;
   27988          sHi = sLo = dHi = dLo = IRTemp_INVALID;
   27989          HChar  ch      = '?';
   27990          Int    laneszB = 0;
   27991          UChar  modrm   = getUChar(delta);
   27992          UInt   rG      = gregOfRexRM(pfx,modrm);
   27993          UInt   rV      = getVexNvvvv(pfx);
   27994 
   27995          switch (opc) {
   27996             case 0x08: laneszB = 1; ch = 'b'; break;
   27997             case 0x09: laneszB = 2; ch = 'w'; break;
   27998             case 0x0A: laneszB = 4; ch = 'd'; break;
   27999             default: vassert(0);
   28000          }
   28001 
   28002          assign( dV, getXMMReg(rV) );
   28003 
   28004          if (epartIsReg(modrm)) {
   28005             UInt rE = eregOfRexRM(pfx,modrm);
   28006             assign( sV, getXMMReg(rE) );
   28007             delta += 1;
   28008             DIP("vpsign%c %s,%s,%s\n", ch, nameXMMReg(rE),
   28009                 nameXMMReg(rV), nameXMMReg(rG));
   28010          } else {
   28011             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   28012             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   28013             delta += alen;
   28014             DIP("vpsign%c %s,%s,%s\n", ch, dis_buf,
   28015                 nameXMMReg(rV), nameXMMReg(rG));
   28016          }
   28017 
   28018          breakupV128to64s( dV, &dHi, &dLo );
   28019          breakupV128to64s( sV, &sHi, &sLo );
   28020 
   28021          putYMMRegLoAndZU(
   28022             rG,
   28023             binop(Iop_64HLtoV128,
   28024                   dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
   28025                   dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
   28026             )
   28027          );
   28028          *uses_vvvv = True;
   28029          goto decode_success;
   28030       }
   28031       /* VPSIGNB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 08 /r */
   28032       /* VPSIGNW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 09 /r */
   28033       /* VPSIGND ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 0A /r */
   28034       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   28035          IRTemp sV      = newTemp(Ity_V256);
   28036          IRTemp dV      = newTemp(Ity_V256);
   28037          IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   28038          s3 = s2 = s1 = s0 = IRTemp_INVALID;
   28039          d3 = d2 = d1 = d0 = IRTemp_INVALID;
   28040          UChar  ch      = '?';
   28041          Int    laneszB = 0;
   28042          UChar  modrm   = getUChar(delta);
   28043          UInt   rG      = gregOfRexRM(pfx,modrm);
   28044          UInt   rV      = getVexNvvvv(pfx);
   28045 
   28046          switch (opc) {
   28047             case 0x08: laneszB = 1; ch = 'b'; break;
   28048             case 0x09: laneszB = 2; ch = 'w'; break;
   28049             case 0x0A: laneszB = 4; ch = 'd'; break;
   28050             default: vassert(0);
   28051          }
   28052 
   28053          assign( dV, getYMMReg(rV) );
   28054 
   28055          if (epartIsReg(modrm)) {
   28056             UInt rE = eregOfRexRM(pfx,modrm);
   28057             assign( sV, getYMMReg(rE) );
   28058             delta += 1;
   28059             DIP("vpsign%c %s,%s,%s\n", ch, nameYMMReg(rE),
   28060                 nameYMMReg(rV), nameYMMReg(rG));
   28061          } else {
   28062             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   28063             assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
   28064             delta += alen;
   28065             DIP("vpsign%c %s,%s,%s\n", ch, dis_buf,
   28066                 nameYMMReg(rV), nameYMMReg(rG));
   28067          }
   28068 
   28069          breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
   28070          breakupV256to64s( sV, &s3, &s2, &s1, &s0 );
   28071 
   28072          putYMMReg(
   28073             rG,
   28074             binop( Iop_V128HLtoV256,
   28075                    binop(Iop_64HLtoV128,
   28076                          dis_PSIGN_helper( mkexpr(s3), mkexpr(d3), laneszB ),
   28077                          dis_PSIGN_helper( mkexpr(s2), mkexpr(d2), laneszB )
   28078                    ),
   28079                    binop(Iop_64HLtoV128,
   28080                          dis_PSIGN_helper( mkexpr(s1), mkexpr(d1), laneszB ),
   28081                          dis_PSIGN_helper( mkexpr(s0), mkexpr(d0), laneszB )
   28082                    )
   28083             )
   28084          );
   28085          *uses_vvvv = True;
   28086          goto decode_success;
   28087       }
   28088       break;
   28089 
   28090    case 0x0B:
   28091       /* VPMULHRSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 0B /r */
   28092       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   28093          IRTemp sV      = newTemp(Ity_V128);
   28094          IRTemp dV      = newTemp(Ity_V128);
   28095          IRTemp sHi, sLo, dHi, dLo;
   28096          sHi = sLo = dHi = dLo = IRTemp_INVALID;
   28097          UChar  modrm   = getUChar(delta);
   28098          UInt   rG      = gregOfRexRM(pfx,modrm);
   28099          UInt   rV      = getVexNvvvv(pfx);
   28100 
   28101          assign( dV, getXMMReg(rV) );
   28102 
   28103          if (epartIsReg(modrm)) {
   28104             UInt rE = eregOfRexRM(pfx,modrm);
   28105             assign( sV, getXMMReg(rE) );
   28106             delta += 1;
   28107             DIP("vpmulhrsw %s,%s,%s\n", nameXMMReg(rE),
   28108                 nameXMMReg(rV), nameXMMReg(rG));
   28109          } else {
   28110             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   28111             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   28112             delta += alen;
   28113             DIP("vpmulhrsw %s,%s,%s\n", dis_buf,
   28114                 nameXMMReg(rV), nameXMMReg(rG));
   28115          }
   28116 
   28117          breakupV128to64s( dV, &dHi, &dLo );
   28118          breakupV128to64s( sV, &sHi, &sLo );
   28119 
   28120          putYMMRegLoAndZU(
   28121             rG,
   28122             binop(Iop_64HLtoV128,
   28123                   dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
   28124                   dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
   28125             )
   28126          );
   28127          *uses_vvvv = True;
   28128          goto decode_success;
   28129       }
   28130       /* VPMULHRSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 0B /r */
   28131       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   28132          IRTemp sV      = newTemp(Ity_V256);
   28133          IRTemp dV      = newTemp(Ity_V256);
   28134          IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   28135          s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   28136          UChar  modrm   = getUChar(delta);
   28137          UInt   rG      = gregOfRexRM(pfx,modrm);
   28138          UInt   rV      = getVexNvvvv(pfx);
   28139 
   28140          assign( dV, getYMMReg(rV) );
   28141 
   28142          if (epartIsReg(modrm)) {
   28143             UInt rE = eregOfRexRM(pfx,modrm);
   28144             assign( sV, getYMMReg(rE) );
   28145             delta += 1;
   28146             DIP("vpmulhrsw %s,%s,%s\n", nameYMMReg(rE),
   28147                 nameYMMReg(rV), nameYMMReg(rG));
   28148          } else {
   28149             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   28150             assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
   28151             delta += alen;
   28152             DIP("vpmulhrsw %s,%s,%s\n", dis_buf,
   28153                 nameYMMReg(rV), nameYMMReg(rG));
   28154          }
   28155 
   28156          breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
   28157          breakupV256to64s( sV, &s3, &s2, &s1, &s0 );
   28158 
   28159          putYMMReg(
   28160             rG,
   28161             binop(Iop_V128HLtoV256,
   28162                   binop(Iop_64HLtoV128,
   28163                         dis_PMULHRSW_helper( mkexpr(s3), mkexpr(d3) ),
   28164                         dis_PMULHRSW_helper( mkexpr(s2), mkexpr(d2) ) ),
   28165                   binop(Iop_64HLtoV128,
   28166                         dis_PMULHRSW_helper( mkexpr(s1), mkexpr(d1) ),
   28167                         dis_PMULHRSW_helper( mkexpr(s0), mkexpr(d0) ) )
   28168             )
   28169          );
   28170          *uses_vvvv = True;
   28171          goto decode_success;
   28172       }
   28173       break;
   28174 
   case 0x0C:
      /* Variable-control VPERMILPS: permute the 32-bit lanes of the
         vvvv register (rV) using per-lane selectors taken from the
         low bits of each 32-bit lane of xmm3/m128 (ctrlV). */
      /* VPERMILPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 0C /r */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp ctrlV = newTemp(Ity_V128);
         if (epartIsReg(modrm)) {
            /* Control vector comes from a register operand. */
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            DIP("vpermilps %s,%s,%s\n",
                nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
            assign(ctrlV, getXMMReg(rE));
         } else {
            /* Control vector comes from memory; disAMode sets alen. */
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            DIP("vpermilps %s,%s,%s\n",
                dis_buf, nameXMMReg(rV), nameXMMReg(rG));
            assign(ctrlV, loadLE(Ity_V128, mkexpr(addr)));
         }
         IRTemp dataV = newTemp(Ity_V128);
         assign(dataV, getXMMReg(rV));
         IRTemp resV = math_PERMILPS_VAR_128(dataV, ctrlV);
         /* 128-bit AVX ops zero the upper half of the YMM register. */
         putYMMRegLoAndZU(rG, mkexpr(resV));
         *uses_vvvv = True;
         goto decode_success;
      }
      /* 256-bit form: same operation applied per 128-bit half. */
      /* VPERMILPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 0C /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp ctrlV = newTemp(Ity_V256);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            DIP("vpermilps %s,%s,%s\n",
                nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
            assign(ctrlV, getYMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            DIP("vpermilps %s,%s,%s\n",
                dis_buf, nameYMMReg(rV), nameYMMReg(rG));
            assign(ctrlV, loadLE(Ity_V256, mkexpr(addr)));
         }
         IRTemp dataV = newTemp(Ity_V256);
         assign(dataV, getYMMReg(rV));
         IRTemp resV = math_PERMILPS_VAR_256(dataV, ctrlV);
         putYMMReg(rG, mkexpr(resV));
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   28231 
   case 0x0D:
      /* Variable-control VPERMILPD: permute the 64-bit lanes of the
         vvvv register (rV) using selector bits from each 64-bit lane
         of xmm3/m128 (ctrlV).  Structure mirrors the 0x0C (VPERMILPS)
         case, but on 64-bit granularity. */
      /* VPERMILPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 0D /r */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp ctrlV = newTemp(Ity_V128);
         if (epartIsReg(modrm)) {
            /* Register-sourced control vector. */
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            DIP("vpermilpd %s,%s,%s\n",
                nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
            assign(ctrlV, getXMMReg(rE));
         } else {
            /* Memory-sourced control vector. */
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            DIP("vpermilpd %s,%s,%s\n",
                dis_buf, nameXMMReg(rV), nameXMMReg(rG));
            assign(ctrlV, loadLE(Ity_V128, mkexpr(addr)));
         }
         IRTemp dataV = newTemp(Ity_V128);
         assign(dataV, getXMMReg(rV));
         IRTemp resV = math_PERMILPD_VAR_128(dataV, ctrlV);
         /* 128-bit AVX ops zero the upper half of the YMM register. */
         putYMMRegLoAndZU(rG, mkexpr(resV));
         *uses_vvvv = True;
         goto decode_success;
      }
      /* 256-bit form. */
      /* VPERMILPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 0D /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp ctrlV = newTemp(Ity_V256);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            DIP("vpermilpd %s,%s,%s\n",
                nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
            assign(ctrlV, getYMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            DIP("vpermilpd %s,%s,%s\n",
                dis_buf, nameYMMReg(rV), nameYMMReg(rG));
            assign(ctrlV, loadLE(Ity_V256, mkexpr(addr)));
         }
         IRTemp dataV = newTemp(Ity_V256);
         assign(dataV, getYMMReg(rV));
         IRTemp resV = math_PERMILPD_VAR_256(dataV, ctrlV);
         putYMMReg(rG, mkexpr(resV));
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   28288 
   case 0x0E:
      /* VTESTPS: flag-setting test; the final argument selects the
         lane width for the sign-bit test (32 for PS). */
      /* VTESTPS xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 0E /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_xTESTy_128( vbi, pfx, delta, True/*isAvx*/, 32 );
         goto decode_success;
      }
      /* VTESTPS ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 0E /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_xTESTy_256( vbi, pfx, delta, 32 );
         goto decode_success;
      }
      break;

   case 0x0F:
      /* VTESTPD: as 0x0E but on 64-bit lanes (final argument 64). */
      /* VTESTPD xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 0F /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_xTESTy_128( vbi, pfx, delta, True/*isAvx*/, 64 );
         goto decode_success;
      }
      /* VTESTPD ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 0F /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_xTESTy_256( vbi, pfx, delta, 64 );
         goto decode_success;
      }
      break;
   28314 
   case 0x16:
      /* VPERMPS reuses math_VPERMD: both permute 32-bit lanes of the
         source across the full 256 bits, so the IR is identical. */
      /* VPERMPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 16 /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpermps", math_VPERMD );
         goto decode_success;
      }
      break;

   case 0x17:
      /* VPTEST: same helper as VTESTPS/PD but with width 0, meaning a
         whole-vector (bitwise) test rather than sign-bit lanes. */
      /* VPTEST xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 17 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_xTESTy_128( vbi, pfx, delta, True/*isAvx*/, 0 );
         goto decode_success;
      }
      /* VPTEST ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 17 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_xTESTy_256( vbi, pfx, delta, 0 );
         goto decode_success;
      }
      break;
   28337 
   case 0x18:
      /* VBROADCASTSS: replicate one 32-bit value to every lane of the
         destination.  Four decode variants: {m32, xmm2} source crossed
         with {xmm, ymm} destination.  The replication is built by
         doubling 32->64 with Iop_32HLto64, then widening. */
      /* VBROADCASTSS m32, xmm1 = VEX.128.66.0F38.WIG 18 /r */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/
          && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         DIP("vbroadcastss %s,%s\n", dis_buf, nameXMMReg(rG));
         IRTemp t32 = newTemp(Ity_I32);
         assign(t32, loadLE(Ity_I32, mkexpr(addr)));
         IRTemp t64 = newTemp(Ity_I64);
         /* t64 = t32:t32, then res = t64:t64 -> four copies. */
         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
         IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
         putYMMRegLoAndZU(rG, res);
         goto decode_success;
      }
      /* VBROADCASTSS m32, ymm1 = VEX.256.66.0F38.WIG 18 /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/
          && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         DIP("vbroadcastss %s,%s\n", dis_buf, nameYMMReg(rG));
         IRTemp t32 = newTemp(Ity_I32);
         assign(t32, loadLE(Ity_I32, mkexpr(addr)));
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
         /* Eight copies: four 64-bit halves, each already t32:t32. */
         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
                                                  mkexpr(t64), mkexpr(t64));
         putYMMReg(rG, res);
         goto decode_success;
      }
      /* Register-source forms broadcast lane 0 of xmm2 (AVX2). */
      /* VBROADCASTSS xmm2, xmm1 = VEX.128.66.0F38.WIG 18 /r */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/
          && epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         UInt  rE    = eregOfRexRM(pfx, modrm);
         DIP("vbroadcastss %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
         IRTemp t32 = newTemp(Ity_I32);
         assign(t32, getXMMRegLane32(rE, 0));
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
         IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
         putYMMRegLoAndZU(rG, res);
         delta++;   /* consume the ModRM byte */
         goto decode_success;
      }
      /* VBROADCASTSS xmm2, ymm1 = VEX.256.66.0F38.WIG 18 /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/
          && epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         UInt  rE    = eregOfRexRM(pfx, modrm);
         DIP("vbroadcastss %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
         IRTemp t32 = newTemp(Ity_I32);
         assign(t32, getXMMRegLane32(rE, 0));
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
                                                  mkexpr(t64), mkexpr(t64));
         putYMMReg(rG, res);
         delta++;   /* consume the ModRM byte */
         goto decode_success;
      }
      break;
   28410 
   case 0x19:
      /* VBROADCASTSD: replicate one 64-bit value into all four 64-bit
         lanes of ymm1.  Source is m64, or lane 0 of xmm2 (AVX2). */
      /* VBROADCASTSD m64, ymm1 = VEX.256.66.0F38.WIG 19 /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/
          && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         DIP("vbroadcastsd %s,%s\n", dis_buf, nameYMMReg(rG));
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, loadLE(Ity_I64, mkexpr(addr)));
         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
                                                  mkexpr(t64), mkexpr(t64));
         putYMMReg(rG, res);
         goto decode_success;
      }
      /* VBROADCASTSD xmm2, ymm1 = VEX.256.66.0F38.WIG 19 /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/
          && epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         UInt  rE    = eregOfRexRM(pfx, modrm);
         DIP("vbroadcastsd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, getXMMRegLane64(rE, 0));
         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
                                                  mkexpr(t64), mkexpr(t64));
         putYMMReg(rG, res);
         delta++;   /* consume the ModRM byte */
         goto decode_success;
      }
      break;
   28445 
   case 0x1A:
      /* VBROADCASTF128: load 128 bits from memory and duplicate them
         into both halves of ymm1.  Memory-source only. */
      /* VBROADCASTF128 m128, ymm1 = VEX.256.66.0F38.WIG 1A /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/
          && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         DIP("vbroadcastf128 %s,%s\n", dis_buf, nameYMMReg(rG));
         IRTemp t128 = newTemp(Ity_V128);
         assign(t128, loadLE(Ity_V128, mkexpr(addr)));
         putYMMReg( rG, binop(Iop_V128HLtoV256, mkexpr(t128), mkexpr(t128)) );
         goto decode_success;
      }
      break;
   28462 
   case 0x1C:
      /* VPABS{B,W,D}: lane-wise absolute value.  All three cases just
         dispatch to the unary E-to-G helpers with a math_PABS_* worker
         parameterised by lane size (pap1=8-bit, pap2=16-bit,
         pap4=32-bit). */
      /* VPABSB xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 1C /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary(
                    uses_vvvv, vbi, pfx, delta,
                    "vpabsb", math_PABS_XMM_pap1 );
         goto decode_success;
      }
      /* VPABSB ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 1C /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary(
                    uses_vvvv, vbi, pfx, delta,
                    "vpabsb", math_PABS_YMM_pap1 );
         goto decode_success;
      }
      break;

   case 0x1D:
      /* VPABSW xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 1D /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary(
                    uses_vvvv, vbi, pfx, delta,
                    "vpabsw", math_PABS_XMM_pap2 );
         goto decode_success;
      }
      /* VPABSW ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 1D /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary(
                    uses_vvvv, vbi, pfx, delta,
                    "vpabsw", math_PABS_YMM_pap2 );
         goto decode_success;
      }
      break;

   case 0x1E:
      /* VPABSD xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 1E /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary(
                    uses_vvvv, vbi, pfx, delta,
                    "vpabsd", math_PABS_XMM_pap4 );
         goto decode_success;
      }
      /* VPABSD ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 1E /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary(
                    uses_vvvv, vbi, pfx, delta,
                    "vpabsd", math_PABS_YMM_pap4 );
         goto decode_success;
      }
      break;
   28513 
   case 0x20:
      /* VPMOVSX family (0x20..0x25): sign-extending widen-loads,
         sharing helpers with the VPMOVZX family (0x30..0x35) via the
         xIsZ flag, which here is False to select sign extension. */
      /* VPMOVSXBW xmm2/m64, xmm1 */
      /* VPMOVSXBW = VEX.128.66.0F38.WIG 20 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVxXBW_128( vbi, pfx, delta,
                                   True/*isAvx*/, False/*!xIsZ*/ );
         goto decode_success;
      }
      /* VPMOVSXBW xmm2/m128, ymm1 */
      /* VPMOVSXBW = VEX.256.66.0F38.WIG 20 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVxXBW_256( vbi, pfx, delta, False/*!xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x21:
      /* VPMOVSXBD xmm2/m32, xmm1 */
      /* VPMOVSXBD = VEX.128.66.0F38.WIG 21 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVxXBD_128( vbi, pfx, delta,
                                   True/*isAvx*/, False/*!xIsZ*/ );
         goto decode_success;
      }
      /* VPMOVSXBD xmm2/m64, ymm1 */
      /* VPMOVSXBD = VEX.256.66.0F38.WIG 21 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVxXBD_256( vbi, pfx, delta, False/*!xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x22:
      /* Byte->quad widening has sign-specific helpers (no xIsZ flag). */
      /* VPMOVSXBQ xmm2/m16, xmm1 */
      /* VPMOVSXBQ = VEX.128.66.0F38.WIG 22 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVSXBQ_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VPMOVSXBQ xmm2/m32, ymm1 */
      /* VPMOVSXBQ = VEX.256.66.0F38.WIG 22 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVSXBQ_256( vbi, pfx, delta );
         goto decode_success;
      }
      break;

   case 0x23:
      /* VPMOVSXWD xmm2/m64, xmm1 = VEX.128.66.0F38.WIG 23 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVxXWD_128( vbi, pfx, delta,
                                   True/*isAvx*/, False/*!xIsZ*/ );
         goto decode_success;
      }
      /* VPMOVSXWD xmm2/m128, ymm1 = VEX.256.66.0F38.WIG 23 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVxXWD_256( vbi, pfx, delta, False/*!xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x24:
      /* VPMOVSXWQ xmm2/m32, xmm1 = VEX.128.66.0F38.WIG 24 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVSXWQ_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VPMOVSXWQ xmm2/m64, ymm1 = VEX.256.66.0F38.WIG 24 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVSXWQ_256( vbi, pfx, delta );
         goto decode_success;
      }
      break;

   case 0x25:
      /* VPMOVSXDQ xmm2/m64, xmm1 = VEX.128.66.0F38.WIG 25 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
                                   True/*isAvx*/, False/*!xIsZ*/ );
         goto decode_success;
      }
      /* VPMOVSXDQ xmm2/m128, ymm1 = VEX.256.66.0F38.WIG 25 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVxXDQ_256( vbi, pfx, delta, False/*!xIsZ*/ );
         goto decode_success;
      }
      break;
   28601 
   case 0x28:
      /* VPMULDQ: signed multiply of the even 32-bit lanes giving
         64-bit products; delegated to the math_PMULDQ_* workers. */
      /* VPMULDQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 28 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vpmuldq", math_PMULDQ_128 );
         goto decode_success;
      }
      /* VPMULDQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 28 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vpmuldq", math_PMULDQ_256 );
         goto decode_success;
      }
      break;

   case 0x29:
      /* VPCMPEQQ: simple lane-wise binop, expressed directly with the
         CmpEQ64 IR op. */
      /* VPCMPEQQ r/m, rV, r ::: r = rV `eq-by-64s` r/m */
      /* VPCMPEQQ = VEX.NDS.128.66.0F38.WIG 29 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpeqq", Iop_CmpEQ64x2 );
         goto decode_success;
      }
      /* VPCMPEQQ r/m, rV, r ::: r = rV `eq-by-64s` r/m */
      /* VPCMPEQQ = VEX.NDS.256.66.0F38.WIG 29 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpeqq", Iop_CmpEQ64x4 );
         goto decode_success;
      }
      break;
   28635 
   case 0x2A:
      /* VMOVNTDQA: non-temporal aligned vector load, memory-source
         only.  The non-temporal hint is not modelled; the load is
         translated as a plain load, but the architectural alignment
         requirement is enforced with a SEGV check. */
      /* VMOVNTDQA m128, xmm1 = VEX.128.66.0F38.WIG 2A /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && !epartIsReg(getUChar(delta))) {
         UChar  modrm = getUChar(delta);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp tD    = newTemp(Ity_V128);
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         /* m128 operand must be 16-aligned. */
         gen_SEGV_if_not_16_aligned(addr);
         assign(tD, loadLE(Ity_V128, mkexpr(addr)));
         DIP("vmovntdqa %s,%s\n", dis_buf, nameXMMReg(rD));
         putYMMRegLoAndZU(rD, mkexpr(tD));
         goto decode_success;
      }
      /* VMOVNTDQA m256, ymm1 = VEX.256.66.0F38.WIG 2A /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && !epartIsReg(getUChar(delta))) {
         UChar  modrm = getUChar(delta);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp tD    = newTemp(Ity_V256);
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         /* m256 operand must be 32-aligned. */
         gen_SEGV_if_not_32_aligned(addr);
         assign(tD, loadLE(Ity_V256, mkexpr(addr)));
         DIP("vmovntdqa %s,%s\n", dis_buf, nameYMMReg(rD));
         putYMMReg(rD, mkexpr(tD));
         goto decode_success;
      }
      break;
   28666 
   case 0x2B:
      /* VPACKUSDW: narrow signed 32-bit lanes to unsigned-saturated
         16-bit lanes.  The 128-bit form swaps the args (swapArgs=True)
         so the IR narrowing op sees them in the order it expects. */
      /* VPACKUSDW r/m, rV, r ::: r = QNarrowBin32Sto16Ux8(rV, r/m) */
      /* VPACKUSDW = VEX.NDS.128.66.0F38.WIG 2B /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpackusdw",
                    Iop_QNarrowBin32Sto16Ux8, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPACKUSDW r/m, rV, r ::: r = QNarrowBin32Sto16Ux8(rV, r/m) */
      /* VPACKUSDW = VEX.NDS.256.66.0F38.WIG 2B /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpackusdw",
                    math_VPACKUSDW_YMM );
         goto decode_success;
      }
      break;
   28686 
   case 0x2C:
      /* VMASKMOV family (0x2C..0x2F): masked vector load/store.  All
         eight variants dispatch to dis_VMASKMOV, parameterised by
         YMM-ness, lane type (I32 for PS / I64 for PD) and direction.
         Opcodes 0x2C/0x2D are loads, 0x2E/0x2F are stores; the memory
         operand is mandatory (register E forms are invalid). */
      /* VMASKMOVPS m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 2C /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0==getRexW(pfx)/*W0*/
          && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
                               /*!isYMM*/False, Ity_I32, /*isLoad*/True );
         goto decode_success;
      }
      /* VMASKMOVPS m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 2C /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 0==getRexW(pfx)/*W0*/
          && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
                               /*isYMM*/True, Ity_I32, /*isLoad*/True );
         goto decode_success;
      }
      break;

   case 0x2D:
      /* VMASKMOVPD m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 2D /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0==getRexW(pfx)/*W0*/
          && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
                               /*!isYMM*/False, Ity_I64, /*isLoad*/True );
         goto decode_success;
      }
      /* VMASKMOVPD m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 2D /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 0==getRexW(pfx)/*W0*/
          && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
                               /*isYMM*/True, Ity_I64, /*isLoad*/True );
         goto decode_success;
      }
      break;

   case 0x2E:
      /* Store direction: register -> memory. */
      /* VMASKMOVPS xmm1, xmm2, m128 = VEX.NDS.128.66.0F38.W0 2E /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0==getRexW(pfx)/*W0*/
          && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
                               /*!isYMM*/False, Ity_I32, /*!isLoad*/False );
         goto decode_success;
      }
      /* VMASKMOVPS ymm1, ymm2, m256 = VEX.NDS.256.66.0F38.W0 2E /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 0==getRexW(pfx)/*W0*/
          && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
                               /*isYMM*/True, Ity_I32, /*!isLoad*/False );
         goto decode_success;
      }
      break;

   case 0x2F:
      /* VMASKMOVPD xmm1, xmm2, m128 = VEX.NDS.128.66.0F38.W0 2F /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0==getRexW(pfx)/*W0*/
          && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
                               /*!isYMM*/False, Ity_I64, /*!isLoad*/False );
         goto decode_success;
      }
      /* VMASKMOVPD ymm1, ymm2, m256 = VEX.NDS.256.66.0F38.W0 2F /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 0==getRexW(pfx)/*W0*/
          && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
                               /*isYMM*/True, Ity_I64, /*!isLoad*/False );
         goto decode_success;
      }
      break;
   28762 
   case 0x30:
      /* VPMOVZX family (0x30..0x35): zero-extending widen-loads,
         mirroring the VPMOVSX family (0x20..0x25); here the shared
         helpers are called with xIsZ=True to select zero extension. */
      /* VPMOVZXBW xmm2/m64, xmm1 */
      /* VPMOVZXBW = VEX.128.66.0F38.WIG 30 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVxXBW_128( vbi, pfx, delta,
                                   True/*isAvx*/, True/*xIsZ*/ );
         goto decode_success;
      }
      /* VPMOVZXBW xmm2/m128, ymm1 */
      /* VPMOVZXBW = VEX.256.66.0F38.WIG 30 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVxXBW_256( vbi, pfx, delta, True/*xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x31:
      /* VPMOVZXBD xmm2/m32, xmm1 */
      /* VPMOVZXBD = VEX.128.66.0F38.WIG 31 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVxXBD_128( vbi, pfx, delta,
                                   True/*isAvx*/, True/*xIsZ*/ );
         goto decode_success;
      }
      /* VPMOVZXBD xmm2/m64, ymm1 */
      /* VPMOVZXBD = VEX.256.66.0F38.WIG 31 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVxXBD_256( vbi, pfx, delta, True/*xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x32:
      /* Byte->quad widening has zero-specific helpers (no xIsZ flag). */
      /* VPMOVZXBQ xmm2/m16, xmm1 */
      /* VPMOVZXBQ = VEX.128.66.0F38.WIG 32 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVZXBQ_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VPMOVZXBQ xmm2/m32, ymm1 */
      /* VPMOVZXBQ = VEX.256.66.0F38.WIG 32 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVZXBQ_256( vbi, pfx, delta );
         goto decode_success;
      }
      break;

   case 0x33:
      /* VPMOVZXWD xmm2/m64, xmm1 */
      /* VPMOVZXWD = VEX.128.66.0F38.WIG 33 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVxXWD_128( vbi, pfx, delta,
                                   True/*isAvx*/, True/*xIsZ*/ );
         goto decode_success;
      }
      /* VPMOVZXWD xmm2/m128, ymm1 */
      /* VPMOVZXWD = VEX.256.66.0F38.WIG 33 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVxXWD_256( vbi, pfx, delta, True/*xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x34:
      /* VPMOVZXWQ xmm2/m32, xmm1 = VEX.128.66.0F38.WIG 34 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVZXWQ_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VPMOVZXWQ xmm2/m64, ymm1 = VEX.256.66.0F38.WIG 34 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVZXWQ_256( vbi, pfx, delta );
         goto decode_success;
      }
      break;

   case 0x35:
      /* VPMOVZXDQ xmm2/m64, xmm1 = VEX.128.66.0F38.WIG 35 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
                                   True/*isAvx*/, True/*xIsZ*/ );
         goto decode_success;
      }
      /* VPMOVZXDQ xmm2/m128, ymm1 = VEX.256.66.0F38.WIG 35 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVxXDQ_256( vbi, pfx, delta, True/*xIsZ*/ );
         goto decode_success;
      }
      break;
   28852 
   case 0x36:
      /* VPERMD: full-width 256-bit permute of 32-bit lanes; shares
         math_VPERMD with VPERMPS (opcode 0x16), which performs the
         same data movement. */
      /* VPERMD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 36 /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpermd", math_VPERMD );
         goto decode_success;
      }
      break;

   case 0x37:
      /* VPCMPGTQ: signed greater-than compare on 64-bit lanes,
         expressed directly with the CmpGT64S IR op. */
      /* VPCMPGTQ r/m, rV, r ::: r = rV `>s-by-64s` r/m */
      /* VPCMPGTQ = VEX.NDS.128.66.0F38.WIG 37 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpgtq", Iop_CmpGT64Sx2 );
         goto decode_success;
      }
      /* VPCMPGTQ r/m, rV, r ::: r = rV `>s-by-64s` r/m */
      /* VPCMPGTQ = VEX.NDS.256.66.0F38.WIG 37 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpgtq", Iop_CmpGT64Sx4 );
         goto decode_success;
      }
      break;
   28879 
   case 0x38:
      /* Cases 0x38..0x3F: the SSE4.1 integer min/max family in its
         AVX/AVX2 3-operand (NDS) forms.  Every case follows the same
         shape: the VEX.L=0 (xmm) variant dispatches to the 128-bit
         simple helper and the VEX.L=1 (ymm) variant to the 256-bit
         one, with only the mnemonic string and IROp differing. */
      /* VPMINSB r/m, rV, r ::: r = min-signed-8s(rV, r/m) */
      /* VPMINSB = VEX.NDS.128.66.0F38.WIG 38 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpminsb", Iop_Min8Sx16 );
         goto decode_success;
      }
      /* VPMINSB r/m, rV, r ::: r = min-signed-8s(rV, r/m) */
      /* VPMINSB = VEX.NDS.256.66.0F38.WIG 38 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpminsb", Iop_Min8Sx32 );
         goto decode_success;
      }
      break;

   case 0x39:
      /* Signed 32-bit minimum. */
      /* VPMINSD r/m, rV, r ::: r = min-signed-32s(rV, r/m) */
      /* VPMINSD = VEX.NDS.128.66.0F38.WIG 39 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpminsd", Iop_Min32Sx4 );
         goto decode_success;
      }
      /* VPMINSD r/m, rV, r ::: r = min-signed-32s(rV, r/m) */
      /* VPMINSD = VEX.NDS.256.66.0F38.WIG 39 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpminsd", Iop_Min32Sx8 );
         goto decode_success;
      }
      break;

   case 0x3A:
      /* Unsigned 16-bit minimum. */
      /* VPMINUW r/m, rV, r ::: r = min-unsigned-16s(rV, r/m) */
      /* VPMINUW = VEX.NDS.128.66.0F38.WIG 3A /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpminuw", Iop_Min16Ux8 );
         goto decode_success;
      }
      /* VPMINUW r/m, rV, r ::: r = min-unsigned-16s(rV, r/m) */
      /* VPMINUW = VEX.NDS.256.66.0F38.WIG 3A /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpminuw", Iop_Min16Ux16 );
         goto decode_success;
      }
      break;

   case 0x3B:
      /* Unsigned 32-bit minimum. */
      /* VPMINUD r/m, rV, r ::: r = min-unsigned-32s(rV, r/m) */
      /* VPMINUD = VEX.NDS.128.66.0F38.WIG 3B /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpminud", Iop_Min32Ux4 );
         goto decode_success;
      }
      /* VPMINUD r/m, rV, r ::: r = min-unsigned-32s(rV, r/m) */
      /* VPMINUD = VEX.NDS.256.66.0F38.WIG 3B /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpminud", Iop_Min32Ux8 );
         goto decode_success;
      }
      break;

   case 0x3C:
      /* Signed 8-bit maximum. */
      /* VPMAXSB r/m, rV, r ::: r = max-signed-8s(rV, r/m) */
      /* VPMAXSB = VEX.NDS.128.66.0F38.WIG 3C /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpmaxsb", Iop_Max8Sx16 );
         goto decode_success;
      }
      /* VPMAXSB r/m, rV, r ::: r = max-signed-8s(rV, r/m) */
      /* VPMAXSB = VEX.NDS.256.66.0F38.WIG 3C /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpmaxsb", Iop_Max8Sx32 );
         goto decode_success;
      }
      break;

   case 0x3D:
      /* Signed 32-bit maximum. */
      /* VPMAXSD r/m, rV, r ::: r = max-signed-32s(rV, r/m) */
      /* VPMAXSD = VEX.NDS.128.66.0F38.WIG 3D /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpmaxsd", Iop_Max32Sx4 );
         goto decode_success;
      }
      /* VPMAXSD r/m, rV, r ::: r = max-signed-32s(rV, r/m) */
      /* VPMAXSD = VEX.NDS.256.66.0F38.WIG 3D /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpmaxsd", Iop_Max32Sx8 );
         goto decode_success;
      }
      break;

   case 0x3E:
      /* Unsigned 16-bit maximum. */
      /* VPMAXUW r/m, rV, r ::: r = max-unsigned-16s(rV, r/m) */
      /* VPMAXUW = VEX.NDS.128.66.0F38.WIG 3E /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpmaxuw", Iop_Max16Ux8 );
         goto decode_success;
      }
      /* VPMAXUW r/m, rV, r ::: r = max-unsigned-16s(rV, r/m) */
      /* VPMAXUW = VEX.NDS.256.66.0F38.WIG 3E /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpmaxuw", Iop_Max16Ux16 );
         goto decode_success;
      }
      break;

   case 0x3F:
      /* Unsigned 32-bit maximum. */
      /* VPMAXUD r/m, rV, r ::: r = max-unsigned-32s(rV, r/m) */
      /* VPMAXUD = VEX.NDS.128.66.0F38.WIG 3F /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpmaxud", Iop_Max32Ux4 );
         goto decode_success;
      }
      /* VPMAXUD r/m, rV, r ::: r = max-unsigned-32s(rV, r/m) */
      /* VPMAXUD = VEX.NDS.256.66.0F38.WIG 3F /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpmaxud", Iop_Max32Ux8 );
         goto decode_success;
      }
      break;
   29015 
   case 0x40:
      /* 32-bit lane-wise low multiply (result truncated to 32 bits). */
      /* VPMULLD r/m, rV, r ::: r = mul-32s(rV, r/m) */
      /* VPMULLD = VEX.NDS.128.66.0F38.WIG 40 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpmulld", Iop_Mul32x4 );
         goto decode_success;
      }
      /* VPMULLD r/m, rV, r ::: r = mul-32s(rV, r/m) */
      /* VPMULLD = VEX.NDS.256.66.0F38.WIG 40 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpmulld", Iop_Mul32x8 );
         goto decode_success;
      }
      break;

   case 0x41:
      /* Horizontal minimum of unsigned words; the instruction is
         defined for 128 bits only, so there is no L=1 arm here. */
      /* VPHMINPOSUW xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 41 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PHMINPOSUW_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      break;
   29040 
   case 0x45:
      /* AVX2 per-lane variable shifts.  Unlike the cases above, the
         L=0/L=1 split happens inside dis_AVX_var_shiftV_byE (last
         argument), so each arm here keys only on REX.W, which selects
         the 32- vs 64-bit element size. */
      /* VPSRLVD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 45 /r */
      /* VPSRLVD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 45 /r */
      if (have66noF2noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
         delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsrlvd",
                                         Iop_Shr32, 1==getVexL(pfx) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPSRLVQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W1 45 /r */
      /* VPSRLVQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W1 45 /r */
      if (have66noF2noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
         delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsrlvq",
                                         Iop_Shr64, 1==getVexL(pfx) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0x46:
      /* Variable arithmetic right shift.  Only the dword (W0) form
         exists: AVX2 has no VPSRAVQ, so there is deliberately no W1
         arm in this case. */
      /* VPSRAVD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 46 /r */
      /* VPSRAVD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 46 /r */
      if (have66noF2noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
         delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsravd",
                                         Iop_Sar32, 1==getVexL(pfx) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0x47:
      /* Variable left shift; same structure as case 0x45. */
      /* VPSLLVD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 47 /r */
      /* VPSLLVD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 47 /r */
      if (have66noF2noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
         delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsllvd",
                                         Iop_Shl32, 1==getVexL(pfx) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPSLLVQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W1 47 /r */
      /* VPSLLVQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W1 47 /r */
      if (have66noF2noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
         delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsllvq",
                                         Iop_Shl64, 1==getVexL(pfx) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   29089 
   case 0x58:
      /* Broadcast one 32-bit element to all lanes of the destination.
         The scalar comes either from lane 0 of an xmm register or from
         a 32-bit memory load; replication is built by pairing the
         value into 64 bits and then widening to V128 (or V256). */
      /* VPBROADCASTD xmm2/m32, xmm1 = VEX.128.66.0F38.W0 58 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0==getRexW(pfx)/*W0*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         IRTemp t32 = newTemp(Ity_I32);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta++;
            DIP("vpbroadcastd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
            assign(t32, getXMMRegLane32(rE, 0));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            DIP("vpbroadcastd %s,%s\n", dis_buf, nameXMMReg(rG));
            assign(t32, loadLE(Ity_I32, mkexpr(addr)));
         }
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
         IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
         /* 128-bit form zeroes the upper half of the ymm register. */
         putYMMRegLoAndZU(rG, res);
         goto decode_success;
      }
      /* VPBROADCASTD xmm2/m32, ymm1 = VEX.256.66.0F38.W0 58 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 0==getRexW(pfx)/*W0*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         IRTemp t32 = newTemp(Ity_I32);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta++;
            DIP("vpbroadcastd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
            assign(t32, getXMMRegLane32(rE, 0));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            DIP("vpbroadcastd %s,%s\n", dis_buf, nameYMMReg(rG));
            assign(t32, loadLE(Ity_I32, mkexpr(addr)));
         }
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
         /* Four copies of the doubled 64-bit value fill the ymm. */
         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
                                                  mkexpr(t64), mkexpr(t64));
         putYMMReg(rG, res);
         goto decode_success;
      }
      break;
   29139 
   case 0x59:
      /* Broadcast one 64-bit element to all lanes; same structure as
         case 0x58 but the scalar is already 64 bits wide, so no
         32->64 pairing step is needed. */
      /* VPBROADCASTQ xmm2/m64, xmm1 = VEX.128.66.0F38.W0 59 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0==getRexW(pfx)/*W0*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         IRTemp t64 = newTemp(Ity_I64);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta++;
            DIP("vpbroadcastq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
            assign(t64, getXMMRegLane64(rE, 0));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            DIP("vpbroadcastq %s,%s\n", dis_buf, nameXMMReg(rG));
            assign(t64, loadLE(Ity_I64, mkexpr(addr)));
         }
         IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
         /* 128-bit form zeroes the upper half of the ymm register. */
         putYMMRegLoAndZU(rG, res);
         goto decode_success;
      }
      /* VPBROADCASTQ xmm2/m64, ymm1 = VEX.256.66.0F38.W0 59 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 0==getRexW(pfx)/*W0*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         IRTemp t64 = newTemp(Ity_I64);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta++;
            DIP("vpbroadcastq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
            assign(t64, getXMMRegLane64(rE, 0));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            DIP("vpbroadcastq %s,%s\n", dis_buf, nameYMMReg(rG));
            assign(t64, loadLE(Ity_I64, mkexpr(addr)));
         }
         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
                                                  mkexpr(t64), mkexpr(t64));
         putYMMReg(rG, res);
         goto decode_success;
      }
      break;
   29185 
   case 0x5A:
      /* Broadcast a whole 128-bit value into both halves of a ymm.
         Memory-source only (epartIsReg must be false) and 256-bit
         only, matching the instruction's defined encoding. */
      /* VBROADCASTI128 m128, ymm1 = VEX.256.66.0F38.WIG 5A /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/
          && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         DIP("vbroadcasti128 %s,%s\n", dis_buf, nameYMMReg(rG));
         IRTemp t128 = newTemp(Ity_V128);
         assign(t128, loadLE(Ity_V128, mkexpr(addr)));
         putYMMReg( rG, binop(Iop_V128HLtoV256, mkexpr(t128), mkexpr(t128)) );
         goto decode_success;
      }
      break;
   29202 
   case 0x78:
      /* Broadcast one byte to all lanes.  The byte is replicated by
         successive doubling: 8 -> 16 -> 32 -> 64 bits, then the 64-bit
         value is paired up to V128 (or quadrupled to V256). */
      /* VPBROADCASTB xmm2/m8, xmm1 = VEX.128.66.0F38.W0 78 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0==getRexW(pfx)/*W0*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         IRTemp t8   = newTemp(Ity_I8);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta++;
            DIP("vpbroadcastb %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
            /* Register source: take the low byte of lane 0. */
            assign(t8, unop(Iop_32to8, getXMMRegLane32(rE, 0)));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            DIP("vpbroadcastb %s,%s\n", dis_buf, nameXMMReg(rG));
            assign(t8, loadLE(Ity_I8, mkexpr(addr)));
         }
         IRTemp t16 = newTemp(Ity_I16);
         assign(t16, binop(Iop_8HLto16, mkexpr(t8), mkexpr(t8)));
         IRTemp t32 = newTemp(Ity_I32);
         assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16)));
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
         IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
         /* 128-bit form zeroes the upper half of the ymm register. */
         putYMMRegLoAndZU(rG, res);
         goto decode_success;
      }
      /* VPBROADCASTB xmm2/m8, ymm1 = VEX.256.66.0F38.W0 78 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 0==getRexW(pfx)/*W0*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         IRTemp t8   = newTemp(Ity_I8);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta++;
            DIP("vpbroadcastb %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
            assign(t8, unop(Iop_32to8, getXMMRegLane32(rE, 0)));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            DIP("vpbroadcastb %s,%s\n", dis_buf, nameYMMReg(rG));
            assign(t8, loadLE(Ity_I8, mkexpr(addr)));
         }
         IRTemp t16 = newTemp(Ity_I16);
         assign(t16, binop(Iop_8HLto16, mkexpr(t8), mkexpr(t8)));
         IRTemp t32 = newTemp(Ity_I32);
         assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16)));
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
                                                  mkexpr(t64), mkexpr(t64));
         putYMMReg(rG, res);
         goto decode_success;
      }
      break;
   29260 
   case 0x79:
      /* Broadcast one 16-bit word to all lanes; same doubling scheme
         as case 0x78, starting one level up (16 -> 32 -> 64 bits). */
      /* VPBROADCASTW xmm2/m16, xmm1 = VEX.128.66.0F38.W0 79 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0==getRexW(pfx)/*W0*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         IRTemp t16  = newTemp(Ity_I16);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta++;
            DIP("vpbroadcastw %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
            /* Register source: take the low word of lane 0. */
            assign(t16, unop(Iop_32to16, getXMMRegLane32(rE, 0)));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            DIP("vpbroadcastw %s,%s\n", dis_buf, nameXMMReg(rG));
            assign(t16, loadLE(Ity_I16, mkexpr(addr)));
         }
         IRTemp t32 = newTemp(Ity_I32);
         assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16)));
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
         IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
         /* 128-bit form zeroes the upper half of the ymm register. */
         putYMMRegLoAndZU(rG, res);
         goto decode_success;
      }
      /* VPBROADCASTW xmm2/m16, ymm1 = VEX.256.66.0F38.W0 79 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 0==getRexW(pfx)/*W0*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         IRTemp t16  = newTemp(Ity_I16);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta++;
            DIP("vpbroadcastw %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
            assign(t16, unop(Iop_32to16, getXMMRegLane32(rE, 0)));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            DIP("vpbroadcastw %s,%s\n", dis_buf, nameYMMReg(rG));
            assign(t16, loadLE(Ity_I16, mkexpr(addr)));
         }
         IRTemp t32 = newTemp(Ity_I32);
         assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16)));
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
                                                  mkexpr(t64), mkexpr(t64));
         putYMMReg(rG, res);
         goto decode_success;
      }
      break;
   29314 
   case 0x8C:
      /* Masked vector loads.  All four variants require a memory
         operand (!epartIsReg); the width (xmm/ymm) comes from VEX.L
         and the element size (dword/qword) from REX.W.  The shared
         dis_VMASKMOV helper takes isYMM / element type / isLoad. */
      /* VPMASKMOVD m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 8C /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
                               /*!isYMM*/False, Ity_I32, /*isLoad*/True );
         goto decode_success;
      }
      /* VPMASKMOVD m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 8C /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
                               /*isYMM*/True, Ity_I32, /*isLoad*/True );
         goto decode_success;
      }
      /* VPMASKMOVQ m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W1 8C /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
                               /*!isYMM*/False, Ity_I64, /*isLoad*/True );
         goto decode_success;
      }
      /* VPMASKMOVQ m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W1 8C /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
                               /*isYMM*/True, Ity_I64, /*isLoad*/True );
         goto decode_success;
      }
      break;

   case 0x8E:
      /* Masked vector stores; mirror image of case 0x8C with
         isLoad=False. */
      /* VPMASKMOVD xmm1, xmm2, m128 = VEX.NDS.128.66.0F38.W0 8E /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
                               /*!isYMM*/False, Ity_I32, /*!isLoad*/False );
         goto decode_success;
      }
      /* VPMASKMOVD ymm1, ymm2, m256 = VEX.NDS.256.66.0F38.W0 8E /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
                               /*isYMM*/True, Ity_I32, /*!isLoad*/False );
         goto decode_success;
      }
      /* VPMASKMOVQ xmm1, xmm2, m128 = VEX.NDS.128.66.0F38.W1 8E /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
                               /*!isYMM*/False, Ity_I64, /*!isLoad*/False );
         goto decode_success;
      }
      /* VPMASKMOVQ ymm1, ymm2, m256 = VEX.NDS.256.66.0F38.W1 8E /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
                               /*isYMM*/True, Ity_I64, /*!isLoad*/False );
         goto decode_success;
      }
      break;
   29376 
   case 0x90:
      /* Integer gathers with dword indices (vm32).  Note the sentinel
         pattern: dis_VGATHER may fail to decode (e.g. an invalid
         operand combination) and then returns delta unchanged, so we
         only declare success if delta actually advanced. */
      /* VPGATHERDD xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W0 90 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdd",
                              /*!isYMM*/False, /*!isVM64x*/False, Ity_I32 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VPGATHERDD ymm2, vm32y, ymm1 = VEX.DDS.256.66.0F38.W0 90 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdd",
                              /*isYMM*/True, /*!isVM64x*/False, Ity_I32 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VPGATHERDQ xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W1 90 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdq",
                              /*!isYMM*/False, /*!isVM64x*/False, Ity_I64 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VPGATHERDQ ymm2, vm32x, ymm1 = VEX.DDS.256.66.0F38.W1 90 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdq",
                              /*isYMM*/True, /*!isVM64x*/False, Ity_I64 );
         if (delta != delta0)
            goto decode_success;
      }
      break;

   case 0x91:
      /* Integer gathers with qword indices (vm64); isVM64x=True.
         Same delta0 sentinel scheme as case 0x90. */
      /* VPGATHERQD xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W0 91 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqd",
                              /*!isYMM*/False, /*isVM64x*/True, Ity_I32 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VPGATHERQD xmm2, vm64y, xmm1 = VEX.DDS.256.66.0F38.W0 91 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqd",
                              /*isYMM*/True, /*isVM64x*/True, Ity_I32 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VPGATHERQQ xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W1 91 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqq",
                              /*!isYMM*/False, /*isVM64x*/True, Ity_I64 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VPGATHERQQ ymm2, vm64y, ymm1 = VEX.DDS.256.66.0F38.W1 91 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqq",
                              /*isYMM*/True, /*isVM64x*/True, Ity_I64 );
         if (delta != delta0)
            goto decode_success;
      }
      break;
   29454 
   case 0x92:
      /* Floating-point gathers with dword indices (vm32); the element
         type passed down is still an integer IR type (I32/I64) since
         gather only moves bits.  Same delta0 failure sentinel as the
         integer gathers in case 0x90. */
      /* VGATHERDPS xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W0 92 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdps",
                              /*!isYMM*/False, /*!isVM64x*/False, Ity_I32 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VGATHERDPS ymm2, vm32y, ymm1 = VEX.DDS.256.66.0F38.W0 92 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdps",
                              /*isYMM*/True, /*!isVM64x*/False, Ity_I32 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VGATHERDPD xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W1 92 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdpd",
                              /*!isYMM*/False, /*!isVM64x*/False, Ity_I64 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VGATHERDPD ymm2, vm32x, ymm1 = VEX.DDS.256.66.0F38.W1 92 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdpd",
                              /*isYMM*/True, /*!isVM64x*/False, Ity_I64 );
         if (delta != delta0)
            goto decode_success;
      }
      break;
   29493 
   case 0x93:
      /* AVX2 FP gathers with qword (64-bit) indices (hence
         isVM64x == True in all four arms).  As with opcode 0x92,
         an unchanged delta from dis_VGATHER means the encoding was
         rejected and we try the next candidate. */
      /* VGATHERQPS xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W0 93 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqps",
                              /*!isYMM*/False, /*isVM64x*/True, Ity_I32 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VGATHERQPS xmm2, vm64y, xmm1 = VEX.DDS.256.66.0F38.W0 93 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqps",
                              /*isYMM*/True, /*isVM64x*/True, Ity_I32 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VGATHERQPD xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W1 93 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqpd",
                              /*!isYMM*/False, /*isVM64x*/True, Ity_I64 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VGATHERQPD ymm2, vm64y, ymm1 = VEX.DDS.256.66.0F38.W1 93 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqpd",
                              /*isYMM*/True, /*isVM64x*/True, Ity_I64 );
         if (delta != delta0)
            goto decode_success;
      }
      break;
   29532 
   case 0x96 ... 0x9F:
   case 0xA6 ... 0xAF:
   case 0xB6 ... 0xBF:
      /* The entire FMA3 family (132/213/231 forms of FMADD, FMSUB,
         FNMADD, FNMSUB, FMADDSUB, FMSUBADD; packed and scalar, SS/SD/
         PS/PD, 128- and 256-bit).  All of these share the single
         precondition "66, no F2, no F3"; the specific operation,
         width and W bit are decoded inside dis_FMA from opc/pfx.
         The encodings handled are listed exhaustively below. */
      /* VFMADDSUB132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 96 /r */
      /* VFMADDSUB132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 96 /r */
      /* VFMADDSUB132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 96 /r */
      /* VFMADDSUB132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 96 /r */
      /* VFMSUBADD132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 97 /r */
      /* VFMSUBADD132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 97 /r */
      /* VFMSUBADD132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 97 /r */
      /* VFMSUBADD132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 97 /r */
      /* VFMADD132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 98 /r */
      /* VFMADD132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 98 /r */
      /* VFMADD132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 98 /r */
      /* VFMADD132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 98 /r */
      /* VFMADD132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 99 /r */
      /* VFMADD132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 99 /r */
      /* VFMSUB132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 9A /r */
      /* VFMSUB132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 9A /r */
      /* VFMSUB132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 9A /r */
      /* VFMSUB132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 9A /r */
      /* VFMSUB132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 9B /r */
      /* VFMSUB132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 9B /r */
      /* VFNMADD132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 9C /r */
      /* VFNMADD132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 9C /r */
      /* VFNMADD132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 9C /r */
      /* VFNMADD132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 9C /r */
      /* VFNMADD132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 9D /r */
      /* VFNMADD132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 9D /r */
      /* VFNMSUB132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 9E /r */
      /* VFNMSUB132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 9E /r */
      /* VFNMSUB132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 9E /r */
      /* VFNMSUB132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 9E /r */
      /* VFNMSUB132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 9F /r */
      /* VFNMSUB132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 9F /r */
      /* VFMADDSUB213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 A6 /r */
      /* VFMADDSUB213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 A6 /r */
      /* VFMADDSUB213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 A6 /r */
      /* VFMADDSUB213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 A6 /r */
      /* VFMSUBADD213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 A7 /r */
      /* VFMSUBADD213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 A7 /r */
      /* VFMSUBADD213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 A7 /r */
      /* VFMSUBADD213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 A7 /r */
      /* VFMADD213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 A8 /r */
      /* VFMADD213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 A8 /r */
      /* VFMADD213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 A8 /r */
      /* VFMADD213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 A8 /r */
      /* VFMADD213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 A9 /r */
      /* VFMADD213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 A9 /r */
      /* VFMSUB213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 AA /r */
      /* VFMSUB213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 AA /r */
      /* VFMSUB213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 AA /r */
      /* VFMSUB213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 AA /r */
      /* VFMSUB213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 AB /r */
      /* VFMSUB213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 AB /r */
      /* VFNMADD213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 AC /r */
      /* VFNMADD213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 AC /r */
      /* VFNMADD213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 AC /r */
      /* VFNMADD213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 AC /r */
      /* VFNMADD213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 AD /r */
      /* VFNMADD213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 AD /r */
      /* VFNMSUB213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 AE /r */
      /* VFNMSUB213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 AE /r */
      /* VFNMSUB213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 AE /r */
      /* VFNMSUB213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 AE /r */
      /* VFNMSUB213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 AF /r */
      /* VFNMSUB213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 AF /r */
      /* VFMADDSUB231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 B6 /r */
      /* VFMADDSUB231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 B6 /r */
      /* VFMADDSUB231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 B6 /r */
      /* VFMADDSUB231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 B6 /r */
      /* VFMSUBADD231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 B7 /r */
      /* VFMSUBADD231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 B7 /r */
      /* VFMSUBADD231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 B7 /r */
      /* VFMSUBADD231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 B7 /r */
      /* VFMADD231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 B8 /r */
      /* VFMADD231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 B8 /r */
      /* VFMADD231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 B8 /r */
      /* VFMADD231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 B8 /r */
      /* VFMADD231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 B9 /r */
      /* VFMADD231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 B9 /r */
      /* VFMSUB231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 BA /r */
      /* VFMSUB231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 BA /r */
      /* VFMSUB231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 BA /r */
      /* VFMSUB231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 BA /r */
      /* VFMSUB231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 BB /r */
      /* VFMSUB231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 BB /r */
      /* VFNMADD231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 BC /r */
      /* VFNMADD231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 BC /r */
      /* VFNMADD231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 BC /r */
      /* VFNMADD231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 BC /r */
      /* VFNMADD231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 BD /r */
      /* VFNMADD231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 BD /r */
      /* VFNMSUB231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 BE /r */
      /* VFNMSUB231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 BE /r */
      /* VFNMSUB231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 BE /r */
      /* VFNMSUB231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 BE /r */
      /* VFNMSUB231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 BF /r */
      /* VFNMSUB231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 BF /r */
      if (have66noF2noF3(pfx)) {
         /* All FMA forms read the vvvv register, so flag that
            unconditionally. */
         delta = dis_FMA( vbi, pfx, delta, opc );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   29638 
   case 0xDB:
   case 0xDC:
   case 0xDD:
   case 0xDE:
   case 0xDF:
      /* VEX-encoded (AVX) forms of the AES-NI instructions; all are
         128-bit only (VEX.L must be 0).  dis_AESx selects the actual
         operation from opc. */
      /* VAESIMC xmm2/m128, xmm1 = VEX.128.66.0F38.WIG DB /r */
      /* VAESENC xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DC /r */
      /* VAESENCLAST xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DD /r */
      /* VAESDEC xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DE /r */
      /* VAESDECLAST xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DF /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         /* This is the VEX decode path, so pass isAvx = True.  (The
            original annotation read "!isAvx", which contradicted the
            value actually passed.) */
         delta = dis_AESx( vbi, pfx, delta, True/*isAvx*/, opc );
         /* VAESIMC (DB) is unary (no vvvv operand); the others are
            three-operand and read vvvv. */
         if (opc != 0xDB) *uses_vvvv = True;
         goto decode_success;
      }
      break;
   29655 
   case 0xF2:
      /* BMI1 ANDN: G = ~V & E, at 32 or 64 bits depending on VEX.W.
         Sets the rflags thunk to the dedicated ANDN op, with the
         result in DEP1 and DEP2 zeroed. */
      /* ANDN r/m32, r32b, r32a = VEX.NDS.LZ.0F38.W0 F2 /r */
      /* ANDN r/m64, r64b, r64a = VEX.NDS.LZ.0F38.W1 F2 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
         Int     size = getRexW(pfx) ? 8 : 4;
         IRType  ty   = szToITy(size);
         IRTemp  dst  = newTemp(ty);
         IRTemp  src1 = newTemp(ty);   /* vvvv operand (gets inverted) */
         IRTemp  src2 = newTemp(ty);   /* r/m operand */
         UChar   rm   = getUChar(delta);

         assign( src1, getIRegV(size,pfx) );
         if (epartIsReg(rm)) {
            assign( src2, getIRegE(size,pfx,rm) );
            DIP("andn %s,%s,%s\n", nameIRegE(size,pfx,rm),
                nameIRegV(size,pfx), nameIRegG(size,pfx,rm));
            delta++;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( src2, loadLE(ty, mkexpr(addr)) );
            DIP("andn %s,%s,%s\n", dis_buf, nameIRegV(size,pfx),
                nameIRegG(size,pfx,rm));
            delta += alen;
         }

         /* dst = ~src1 & src2 */
         assign( dst, binop( mkSizedOp(ty,Iop_And8),
                             unop( mkSizedOp(ty,Iop_Not8), mkexpr(src1) ),
                             mkexpr(src2) ) );
         putIRegG( size, pfx, rm, mkexpr(dst) );
         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
                                               ? AMD64G_CC_OP_ANDN64
                                               : AMD64G_CC_OP_ANDN32)) );
         stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   29694 
   case 0xF3:
      /* BMI1 group 17: BLSI/BLSMSK/BLSR.  These are NDD-encoded:
         the destination is the vvvv register (hence putIRegV below)
         and the sub-opcode is carried in the reg field of ModRM,
         which is why the three arms are distinguished by
         gregLO3ofRM().  Each arm sets the rflags thunk to its own
         dedicated op with dst in DEP1 and src in DEP2. */
      /* BLSI r/m32, r32 = VEX.NDD.LZ.0F38.W0 F3 /3 */
      /* BLSI r/m64, r64 = VEX.NDD.LZ.0F38.W1 F3 /3 */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/
          && !haveREX(pfx) && gregLO3ofRM(getUChar(delta)) == 3) {
         Int     size = getRexW(pfx) ? 8 : 4;
         IRType  ty   = szToITy(size);
         IRTemp  src  = newTemp(ty);
         IRTemp  dst  = newTemp(ty);
         UChar   rm   = getUChar(delta);

         if (epartIsReg(rm)) {
            assign( src, getIRegE(size,pfx,rm) );
            DIP("blsi %s,%s\n", nameIRegE(size,pfx,rm),
                nameIRegV(size,pfx));
            delta++;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( src, loadLE(ty, mkexpr(addr)) );
            DIP("blsi %s,%s\n", dis_buf, nameIRegV(size,pfx));
            delta += alen;
         }

         /* BLSI: isolate lowest set bit: dst = (0 - src) & src */
         assign( dst, binop(mkSizedOp(ty,Iop_And8),
                            binop(mkSizedOp(ty,Iop_Sub8), mkU(ty, 0),
                                  mkexpr(src)), mkexpr(src)) );
         putIRegV( size, pfx, mkexpr(dst) );
         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
                                               ? AMD64G_CC_OP_BLSI64
                                               : AMD64G_CC_OP_BLSI32)) );
         stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
         stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(src))) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* BLSMSK r/m32, r32 = VEX.NDD.LZ.0F38.W0 F3 /2 */
      /* BLSMSK r/m64, r64 = VEX.NDD.LZ.0F38.W1 F3 /2 */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/
          && !haveREX(pfx) && gregLO3ofRM(getUChar(delta)) == 2) {
         Int     size = getRexW(pfx) ? 8 : 4;
         IRType  ty   = szToITy(size);
         IRTemp  src  = newTemp(ty);
         IRTemp  dst  = newTemp(ty);
         UChar   rm   = getUChar(delta);

         if (epartIsReg(rm)) {
            assign( src, getIRegE(size,pfx,rm) );
            DIP("blsmsk %s,%s\n", nameIRegE(size,pfx,rm),
                nameIRegV(size,pfx));
            delta++;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( src, loadLE(ty, mkexpr(addr)) );
            DIP("blsmsk %s,%s\n", dis_buf, nameIRegV(size,pfx));
            delta += alen;
         }

         /* BLSMSK: mask up to and including lowest set bit:
            dst = (src - 1) ^ src */
         assign( dst, binop(mkSizedOp(ty,Iop_Xor8),
                            binop(mkSizedOp(ty,Iop_Sub8), mkexpr(src),
                                  mkU(ty, 1)), mkexpr(src)) );
         putIRegV( size, pfx, mkexpr(dst) );
         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
                                               ? AMD64G_CC_OP_BLSMSK64
                                               : AMD64G_CC_OP_BLSMSK32)) );
         stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
         stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(src))) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* BLSR r/m32, r32 = VEX.NDD.LZ.0F38.W0 F3 /1 */
      /* BLSR r/m64, r64 = VEX.NDD.LZ.0F38.W1 F3 /1 */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/
          && !haveREX(pfx) && gregLO3ofRM(getUChar(delta)) == 1) {
         Int     size = getRexW(pfx) ? 8 : 4;
         IRType  ty   = szToITy(size);
         IRTemp  src  = newTemp(ty);
         IRTemp  dst  = newTemp(ty);
         UChar   rm   = getUChar(delta);

         if (epartIsReg(rm)) {
            assign( src, getIRegE(size,pfx,rm) );
            DIP("blsr %s,%s\n", nameIRegE(size,pfx,rm),
                nameIRegV(size,pfx));
            delta++;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( src, loadLE(ty, mkexpr(addr)) );
            DIP("blsr %s,%s\n", dis_buf, nameIRegV(size,pfx));
            delta += alen;
         }

         /* BLSR: clear lowest set bit: dst = (src - 1) & src */
         assign( dst, binop(mkSizedOp(ty,Iop_And8),
                            binop(mkSizedOp(ty,Iop_Sub8), mkexpr(src),
                                  mkU(ty, 1)), mkexpr(src)) );
         putIRegV( size, pfx, mkexpr(dst) );
         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
                                               ? AMD64G_CC_OP_BLSR64
                                               : AMD64G_CC_OP_BLSR32)) );
         stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
         stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(src))) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   29799 
   case 0xF5:
      /* Opcode F5 covers three BMI2 instructions distinguished only
         by the mandatory prefix: no prefix = BZHI, F2 = PDEP,
         F3 = PEXT. */
      /* BZHI r32b, r/m32, r32a = VEX.NDS.LZ.0F38.W0 F5 /r */
      /* BZHI r64b, r/m64, r64a = VEX.NDS.LZ.0F38.W1 F5 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
         Int     size  = getRexW(pfx) ? 8 : 4;
         IRType  ty    = szToITy(size);
         IRTemp  dst   = newTemp(ty);
         IRTemp  src1  = newTemp(ty);        /* value to be zero-extended */
         IRTemp  src2  = newTemp(ty);        /* vvvv: bit index operand */
         IRTemp  start = newTemp(Ity_I8);    /* low 8 bits of src2 */
         IRTemp  cond  = newTemp(Ity_I1);    /* start < operand width */
         UChar   rm    = getUChar(delta);

         assign( src2, getIRegV(size,pfx) );
         if (epartIsReg(rm)) {
            assign( src1, getIRegE(size,pfx,rm) );
            DIP("bzhi %s,%s,%s\n", nameIRegV(size,pfx),
                nameIRegE(size,pfx,rm), nameIRegG(size,pfx,rm));
            delta++;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( src1, loadLE(ty, mkexpr(addr)) );
            DIP("bzhi %s,%s,%s\n", nameIRegV(size,pfx), dis_buf,
                nameIRegG(size,pfx,rm));
            delta += alen;
         }

         assign( start, narrowTo( Ity_I8, mkexpr(src2) ) );
         assign( cond, binop(Iop_CmpLT32U,
                             unop(Iop_8Uto32, mkexpr(start)),
                             mkU32(8*size)) );
         /* if (start < opsize) {
               if (start == 0)
                  dst = 0;
               else
                  dst = (src1 << (opsize-start)) u>> (opsize-start);
            } else {
               dst = src1;
            } */
         /* The start == 0 special case avoids shifting by the full
            operand width, which would be undefined in IR terms. */
         assign( dst,
                 IRExpr_ITE(
                    mkexpr(cond),
                    IRExpr_ITE(
                       binop(Iop_CmpEQ8, mkexpr(start), mkU8(0)),
                       mkU(ty, 0),
                       binop(
                          mkSizedOp(ty,Iop_Shr8),
                          binop(
                             mkSizedOp(ty,Iop_Shl8),
                             mkexpr(src1),
                             binop(Iop_Sub8, mkU8(8*size), mkexpr(start))
                          ),
                          binop(Iop_Sub8, mkU8(8*size), mkexpr(start))
                       )
                    ),
                    mkexpr(src1)
                 )
               );
         putIRegG( size, pfx, rm, mkexpr(dst) );
         /* NOTE(review): BZHI deliberately reuses the BLSR CC_OP
            values, with DEP1 = result and DEP2 = the (start < opsize)
            condition rather than a source operand.  Presumably the
            BLSR flag helper derives CF from DEP2 in a way that yields
            BZHI's CF semantics -- confirm against the corresponding
            helper in guest_amd64_helpers.c. */
         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
                                               ? AMD64G_CC_OP_BLSR64
                                               : AMD64G_CC_OP_BLSR32)) );
         stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
         stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(cond))) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* PDEP r/m32, r32b, r32a = VEX.NDS.LZ.F2.0F38.W0 F5 /r */
      /* PDEP r/m64, r64b, r64a = VEX.NDS.LZ.F2.0F38.W1 F5 /r */
      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
         Int     size = getRexW(pfx) ? 8 : 4;
         IRType  ty   = szToITy(size);
         IRTemp  src  = newTemp(ty);    /* vvvv: bits to deposit */
         IRTemp  mask = newTemp(ty);    /* r/m: deposit mask */
         UChar   rm   = getUChar(delta);

         assign( src, getIRegV(size,pfx) );
         if (epartIsReg(rm)) {
            assign( mask, getIRegE(size,pfx,rm) );
            DIP("pdep %s,%s,%s\n", nameIRegE(size,pfx,rm),
                nameIRegV(size,pfx), nameIRegG(size,pfx,rm));
            delta++;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( mask, loadLE(ty, mkexpr(addr)) );
            DIP("pdep %s,%s,%s\n", dis_buf, nameIRegV(size,pfx),
                nameIRegG(size,pfx,rm));
            delta += alen;
         }

         /* Too irregular for inline IR: computed by a clean helper
            at 64 bits and narrowed back to the operand size. */
         IRExpr** args = mkIRExprVec_2( widenUto64(mkexpr(src)),
                                        widenUto64(mkexpr(mask)) );
         putIRegG( size, pfx, rm,
                   narrowTo(ty, mkIRExprCCall(Ity_I64, 0/*regparms*/,
                                              "amd64g_calculate_pdep",
                                              &amd64g_calculate_pdep, args)) );
         *uses_vvvv = True;
         /* Flags aren't modified.  */
         goto decode_success;
      }
      /* PEXT r/m32, r32b, r32a = VEX.NDS.LZ.F3.0F38.W0 F5 /r */
      /* PEXT r/m64, r64b, r64a = VEX.NDS.LZ.F3.0F38.W1 F5 /r */
      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
         Int     size = getRexW(pfx) ? 8 : 4;
         IRType  ty   = szToITy(size);
         IRTemp  src  = newTemp(ty);    /* vvvv: bits to extract from */
         IRTemp  mask = newTemp(ty);    /* r/m: extraction mask */
         UChar   rm   = getUChar(delta);

         assign( src, getIRegV(size,pfx) );
         if (epartIsReg(rm)) {
            assign( mask, getIRegE(size,pfx,rm) );
            DIP("pext %s,%s,%s\n", nameIRegE(size,pfx,rm),
                nameIRegV(size,pfx), nameIRegG(size,pfx,rm));
            delta++;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( mask, loadLE(ty, mkexpr(addr)) );
            DIP("pext %s,%s,%s\n", dis_buf, nameIRegV(size,pfx),
                nameIRegG(size,pfx,rm));
            delta += alen;
         }

         /* First mask off bits not set in mask, they are ignored
            and it should be fine if they contain undefined values.  */
         IRExpr* masked = binop(mkSizedOp(ty,Iop_And8),
                                mkexpr(src), mkexpr(mask));
         IRExpr** args = mkIRExprVec_2( widenUto64(masked),
                                        widenUto64(mkexpr(mask)) );
         putIRegG( size, pfx, rm,
                   narrowTo(ty, mkIRExprCCall(Ity_I64, 0/*regparms*/,
                                              "amd64g_calculate_pext",
                                              &amd64g_calculate_pext, args)) );
         *uses_vvvv = True;
         /* Flags aren't modified.  */
         goto decode_success;
      }
      break;
   29938 
   case 0xF6:
      /* BMI2 MULX: unsigned widening multiply of RDX/EDX by r/m,
         with the low half of the product written to the vvvv register
         and the high half to the ModRM reg (G) register.  Flags are
         untouched. */
      /* MULX r/m32, r32b, r32a = VEX.NDD.LZ.F2.0F38.W0 F6 /r */
      /* MULX r/m64, r64b, r64a = VEX.NDD.LZ.F2.0F38.W1 F6 /r */
      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
         Int     size = getRexW(pfx) ? 8 : 4;
         IRType  ty   = szToITy(size);
         IRTemp  src1 = newTemp(ty);   /* implicit RDX/EDX operand */
         IRTemp  src2 = newTemp(ty);   /* r/m operand */
         IRTemp  res  = newTemp(size == 8 ? Ity_I128 : Ity_I64);
         UChar   rm   = getUChar(delta);

         assign( src1, getIRegRDX(size) );
         if (epartIsReg(rm)) {
            assign( src2, getIRegE(size,pfx,rm) );
            DIP("mulx %s,%s,%s\n", nameIRegE(size,pfx,rm),
                nameIRegV(size,pfx), nameIRegG(size,pfx,rm));
            delta++;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( src2, loadLE(ty, mkexpr(addr)) );
            DIP("mulx %s,%s,%s\n", dis_buf, nameIRegV(size,pfx),
                nameIRegG(size,pfx,rm));
            delta += alen;
         }

         assign( res, binop(size == 8 ? Iop_MullU64 : Iop_MullU32,
                            mkexpr(src1), mkexpr(src2)) );
         /* Low half first, high half second; when the two destination
            registers are the same, the later (high-half) write wins.
            NOTE(review): presumably that matches the architected
            same-destination behaviour -- confirm against the SDM. */
         putIRegV( size, pfx,
                   unop(size == 8 ? Iop_128to64 : Iop_64to32, mkexpr(res)) );
         putIRegG( size, pfx, rm,
                   unop(size == 8 ? Iop_128HIto64 : Iop_64HIto32,
                        mkexpr(res)) );
         *uses_vvvv = True;
         /* Flags aren't modified.  */
         goto decode_success;
      }
      break;
   29976 
   29977    case 0xF7:
   29978       /* SARX r32b, r/m32, r32a = VEX.NDS.LZ.F3.0F38.W0 F7 /r */
   29979       /* SARX r64b, r/m64, r64a = VEX.NDS.LZ.F3.0F38.W1 F7 /r */
   29980       if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
   29981          delta = dis_SHIFTX( uses_vvvv, vbi, pfx, delta, "sarx", Iop_Sar8 );
   29982          goto decode_success;
   29983       }
   29984       /* SHLX r32b, r/m32, r32a = VEX.NDS.LZ.66.0F38.W0 F7 /r */
   29985       /* SHLX r64b, r/m64, r64a = VEX.NDS.LZ.66.0F38.W1 F7 /r */
   29986       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
   29987          delta = dis_SHIFTX( uses_vvvv, vbi, pfx, delta, "shlx", Iop_Shl8 );
   29988          goto decode_success;
   29989       }
   29990       /* SHRX r32b, r/m32, r32a = VEX.NDS.LZ.F2.0F38.W0 F7 /r */
   29991       /* SHRX r64b, r/m64, r64a = VEX.NDS.LZ.F2.0F38.W1 F7 /r */
   29992       if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
   29993          delta = dis_SHIFTX( uses_vvvv, vbi, pfx, delta, "shrx", Iop_Shr8 );
   29994          goto decode_success;
   29995       }
   29996       /* BEXTR r32b, r/m32, r32a = VEX.NDS.LZ.0F38.W0 F7 /r */
   29997       /* BEXTR r64b, r/m64, r64a = VEX.NDS.LZ.0F38.W1 F7 /r */
   29998       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
   29999          Int     size  = getRexW(pfx) ? 8 : 4;
   30000          IRType  ty    = szToITy(size);
   30001          IRTemp  dst   = newTemp(ty);
   30002          IRTemp  src1  = newTemp(ty);
   30003          IRTemp  src2  = newTemp(ty);
   30004          IRTemp  stle  = newTemp(Ity_I16);
   30005          IRTemp  start = newTemp(Ity_I8);
   30006          IRTemp  len   = newTemp(Ity_I8);
   30007          UChar   rm    = getUChar(delta);
   30008 
   30009          assign( src2, getIRegV(size,pfx) );
   30010          if (epartIsReg(rm)) {
   30011             assign( src1, getIRegE(size,pfx,rm) );
   30012             DIP("bextr %s,%s,%s\n", nameIRegV(size,pfx),
   30013                 nameIRegE(size,pfx,rm), nameIRegG(size,pfx,rm));
   30014             delta++;
   30015          } else {
   30016             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   30017             assign( src1, loadLE(ty, mkexpr(addr)) );
   30018             DIP("bextr %s,%s,%s\n", nameIRegV(size,pfx), dis_buf,
   30019                 nameIRegG(size,pfx,rm));
   30020             delta += alen;
   30021          }
   30022 
   30023          assign( stle, narrowTo( Ity_I16, mkexpr(src2) ) );
   30024          assign( start, unop( Iop_16to8, mkexpr(stle) ) );
   30025          assign( len, unop( Iop_16HIto8, mkexpr(stle) ) );
   30026          /* if (start+len < opsize) {
   30027                if (len != 0)
   30028                   dst = (src1 << (opsize-start-len)) u>> (opsize-len);
   30029                else
   30030                   dst = 0;
   30031             } else {
   30032                if (start < opsize)
   30033                   dst = src1 u>> start;
   30034                else
   30035                   dst = 0;
   30036             } */
   30037          assign( dst,
   30038                  IRExpr_ITE(
   30039                     binop(Iop_CmpLT32U,
   30040                           binop(Iop_Add32,
   30041                                 unop(Iop_8Uto32, mkexpr(start)),
   30042                                 unop(Iop_8Uto32, mkexpr(len))),
   30043                           mkU32(8*size)),
   30044                     IRExpr_ITE(
   30045                        binop(Iop_CmpEQ8, mkexpr(len), mkU8(0)),
   30046                        mkU(ty, 0),
   30047                        binop(mkSizedOp(ty,Iop_Shr8),
   30048                              binop(mkSizedOp(ty,Iop_Shl8), mkexpr(src1),
   30049                                    binop(Iop_Sub8,
   30050                                          binop(Iop_Sub8, mkU8(8*size),
   30051                                                mkexpr(start)),
   30052                                          mkexpr(len))),
   30053                              binop(Iop_Sub8, mkU8(8*size),
   30054                                    mkexpr(len)))
   30055                     ),
   30056                     IRExpr_ITE(
   30057                        binop(Iop_CmpLT32U,
   30058                              unop(Iop_8Uto32, mkexpr(start)),
   30059                              mkU32(8*size)),
   30060                        binop(mkSizedOp(ty,Iop_Shr8), mkexpr(src1),
   30061                              mkexpr(start)),
   30062                        mkU(ty, 0)
   30063                     )
   30064                  )
   30065                );
   30066          putIRegG( size, pfx, rm, mkexpr(dst) );
   30067          stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
   30068                                                ? AMD64G_CC_OP_ANDN64
   30069                                                : AMD64G_CC_OP_ANDN32)) );
   30070          stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
   30071          stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
   30072          *uses_vvvv = True;
   30073          goto decode_success;
   30074       }
   30075       break;
   30076 
   30077    default:
   30078       break;
   30079 
   30080    }
   30081 
   30082   //decode_failure:
   30083    return deltaIN;
   30084 
   30085   decode_success:
   30086    return delta;
   30087 }
   30088 
   30089 
   30090 /*------------------------------------------------------------*/
   30091 /*---                                                      ---*/
   30092 /*--- Top-level post-escape decoders: dis_ESC_0F3A__VEX    ---*/
   30093 /*---                                                      ---*/
   30094 /*------------------------------------------------------------*/
   30095 
   30096 static IRTemp math_VPERMILPS_128 ( IRTemp sV, UInt imm8 )
   30097 {
   30098    vassert(imm8 < 256);
   30099    IRTemp s3, s2, s1, s0;
   30100    s3 = s2 = s1 = s0 = IRTemp_INVALID;
   30101    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   30102 #  define SEL(_nn) (((_nn)==0) ? s0 : ((_nn)==1) ? s1 \
   30103                                     : ((_nn)==2) ? s2 : s3)
   30104    IRTemp res = newTemp(Ity_V128);
   30105    assign(res, mkV128from32s( SEL((imm8 >> 6) & 3),
   30106                               SEL((imm8 >> 4) & 3),
   30107                               SEL((imm8 >> 2) & 3),
   30108                               SEL((imm8 >> 0) & 3) ));
   30109 #  undef SEL
   30110    return res;
   30111 }
   30112 
   30113 __attribute__((noinline))
   30114 static
   30115 Long dis_ESC_0F3A__VEX (
   30116         /*MB_OUT*/DisResult* dres,
   30117         /*OUT*/   Bool*      uses_vvvv,
   30118         Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
   30119         Bool         resteerCisOk,
   30120         void*        callback_opaque,
   30121         const VexArchInfo* archinfo,
   30122         const VexAbiInfo*  vbi,
   30123         Prefix pfx, Int sz, Long deltaIN
   30124      )
   30125 {
   30126    IRTemp addr  = IRTemp_INVALID;
   30127    Int    alen  = 0;
   30128    HChar  dis_buf[50];
   30129    Long   delta = deltaIN;
   30130    UChar  opc   = getUChar(delta);
   30131    delta++;
   30132    *uses_vvvv = False;
   30133 
   30134    switch (opc) {
   30135 
   30136    case 0x00:
   30137    case 0x01:
   30138       /* VPERMQ imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.W1 00 /r ib */
   30139       /* VPERMPD imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.W1 01 /r ib */
   30140       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
   30141           && 1==getRexW(pfx)/*W1*/) {
   30142          UChar  modrm = getUChar(delta);
   30143          UInt   imm8  = 0;
   30144          UInt   rG    = gregOfRexRM(pfx, modrm);
   30145          IRTemp sV    = newTemp(Ity_V256);
   30146          const HChar *name  = opc == 0 ? "vpermq" : "vpermpd";
   30147          if (epartIsReg(modrm)) {
   30148             UInt rE = eregOfRexRM(pfx, modrm);
   30149             delta += 1;
   30150             imm8 = getUChar(delta);
   30151             DIP("%s $%u,%s,%s\n",
   30152                 name, imm8, nameYMMReg(rE), nameYMMReg(rG));
   30153             assign(sV, getYMMReg(rE));
   30154          } else {
   30155             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30156             delta += alen;
   30157             imm8 = getUChar(delta);
   30158             DIP("%s $%u,%s,%s\n",
   30159                 name, imm8, dis_buf, nameYMMReg(rG));
   30160             assign(sV, loadLE(Ity_V256, mkexpr(addr)));
   30161          }
   30162          delta++;
   30163          IRTemp s[4];
   30164          s[3] = s[2] = s[1] = s[0] = IRTemp_INVALID;
   30165          breakupV256to64s(sV, &s[3], &s[2], &s[1], &s[0]);
   30166          IRTemp dV = newTemp(Ity_V256);
   30167          assign(dV, IRExpr_Qop(Iop_64x4toV256,
   30168                                mkexpr(s[(imm8 >> 6) & 3]),
   30169                                mkexpr(s[(imm8 >> 4) & 3]),
   30170                                mkexpr(s[(imm8 >> 2) & 3]),
   30171                                mkexpr(s[(imm8 >> 0) & 3])));
   30172          putYMMReg(rG, mkexpr(dV));
   30173          goto decode_success;
   30174       }
   30175       break;
   30176 
   30177    case 0x02:
   30178       /* VPBLENDD imm8, xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W0 02 /r ib */
   30179       if (have66noF2noF3(pfx)
   30180           && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
   30181          UChar  modrm = getUChar(delta);
   30182          UInt   imm8  = 0;
   30183          UInt   rG    = gregOfRexRM(pfx, modrm);
   30184          UInt   rV    = getVexNvvvv(pfx);
   30185          IRTemp sV    = newTemp(Ity_V128);
   30186          IRTemp dV    = newTemp(Ity_V128);
   30187          UInt   i;
   30188          IRTemp s[4], d[4];
   30189          assign(sV, getXMMReg(rV));
   30190          if (epartIsReg(modrm)) {
   30191             UInt rE = eregOfRexRM(pfx, modrm);
   30192             delta += 1;
   30193             imm8 = getUChar(delta);
   30194             DIP("vpblendd $%u,%s,%s,%s\n",
   30195                 imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   30196             assign(dV, getXMMReg(rE));
   30197          } else {
   30198             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30199             delta += alen;
   30200             imm8 = getUChar(delta);
   30201             DIP("vpblendd $%u,%s,%s,%s\n",
   30202                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   30203             assign(dV, loadLE(Ity_V128, mkexpr(addr)));
   30204          }
   30205          delta++;
   30206          for (i = 0; i < 4; i++) {
   30207             s[i] = IRTemp_INVALID;
   30208             d[i] = IRTemp_INVALID;
   30209          }
   30210          breakupV128to32s( sV, &s[3], &s[2], &s[1], &s[0] );
   30211          breakupV128to32s( dV, &d[3], &d[2], &d[1], &d[0] );
   30212          for (i = 0; i < 4; i++)
   30213             putYMMRegLane32(rG, i, mkexpr((imm8 & (1<<i)) ? d[i] : s[i]));
   30214          putYMMRegLane128(rG, 1, mkV128(0));
   30215          *uses_vvvv = True;
   30216          goto decode_success;
   30217       }
   30218       /* VPBLENDD imm8, ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F3A.W0 02 /r ib */
   30219       if (have66noF2noF3(pfx)
   30220           && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
   30221          UChar  modrm = getUChar(delta);
   30222          UInt   imm8  = 0;
   30223          UInt   rG    = gregOfRexRM(pfx, modrm);
   30224          UInt   rV    = getVexNvvvv(pfx);
   30225          IRTemp sV    = newTemp(Ity_V256);
   30226          IRTemp dV    = newTemp(Ity_V256);
   30227          UInt   i;
   30228          IRTemp s[8], d[8];
   30229          assign(sV, getYMMReg(rV));
   30230          if (epartIsReg(modrm)) {
   30231             UInt rE = eregOfRexRM(pfx, modrm);
   30232             delta += 1;
   30233             imm8 = getUChar(delta);
   30234             DIP("vpblendd $%u,%s,%s,%s\n",
   30235                 imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
   30236             assign(dV, getYMMReg(rE));
   30237          } else {
   30238             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30239             delta += alen;
   30240             imm8 = getUChar(delta);
   30241             DIP("vpblendd $%u,%s,%s,%s\n",
   30242                 imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
   30243             assign(dV, loadLE(Ity_V256, mkexpr(addr)));
   30244          }
   30245          delta++;
   30246          for (i = 0; i < 8; i++) {
   30247             s[i] = IRTemp_INVALID;
   30248             d[i] = IRTemp_INVALID;
   30249          }
   30250          breakupV256to32s( sV, &s[7], &s[6], &s[5], &s[4],
   30251                                &s[3], &s[2], &s[1], &s[0] );
   30252          breakupV256to32s( dV, &d[7], &d[6], &d[5], &d[4],
   30253                                &d[3], &d[2], &d[1], &d[0] );
   30254          for (i = 0; i < 8; i++)
   30255             putYMMRegLane32(rG, i, mkexpr((imm8 & (1<<i)) ? d[i] : s[i]));
   30256          *uses_vvvv = True;
   30257          goto decode_success;
   30258       }
   30259       break;
   30260 
   30261    case 0x04:
   30262       /* VPERMILPS imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.WIG 04 /r ib */
   30263       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   30264          UChar  modrm = getUChar(delta);
   30265          UInt   imm8  = 0;
   30266          UInt   rG    = gregOfRexRM(pfx, modrm);
   30267          IRTemp sV    = newTemp(Ity_V256);
   30268          if (epartIsReg(modrm)) {
   30269             UInt rE = eregOfRexRM(pfx, modrm);
   30270             delta += 1;
   30271             imm8 = getUChar(delta);
   30272             DIP("vpermilps $%u,%s,%s\n",
   30273                 imm8, nameYMMReg(rE), nameYMMReg(rG));
   30274             assign(sV, getYMMReg(rE));
   30275          } else {
   30276             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30277             delta += alen;
   30278             imm8 = getUChar(delta);
   30279             DIP("vpermilps $%u,%s,%s\n",
   30280                 imm8, dis_buf, nameYMMReg(rG));
   30281             assign(sV, loadLE(Ity_V256, mkexpr(addr)));
   30282          }
   30283          delta++;
   30284          IRTemp  sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   30285          breakupV256toV128s( sV, &sVhi, &sVlo );
   30286          IRTemp  dVhi = math_VPERMILPS_128( sVhi, imm8 );
   30287          IRTemp  dVlo = math_VPERMILPS_128( sVlo, imm8 );
   30288          IRExpr* res  = binop(Iop_V128HLtoV256, mkexpr(dVhi), mkexpr(dVlo));
   30289          putYMMReg(rG, res);
   30290          goto decode_success;
   30291       }
   30292       /* VPERMILPS imm8, xmm2/m128, xmm1 = VEX.128.66.0F3A.WIG 04 /r ib */
   30293       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   30294          UChar  modrm = getUChar(delta);
   30295          UInt   imm8  = 0;
   30296          UInt   rG    = gregOfRexRM(pfx, modrm);
   30297          IRTemp sV    = newTemp(Ity_V128);
   30298          if (epartIsReg(modrm)) {
   30299             UInt rE = eregOfRexRM(pfx, modrm);
   30300             delta += 1;
   30301             imm8 = getUChar(delta);
   30302             DIP("vpermilps $%u,%s,%s\n",
   30303                 imm8, nameXMMReg(rE), nameXMMReg(rG));
   30304             assign(sV, getXMMReg(rE));
   30305          } else {
   30306             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30307             delta += alen;
   30308             imm8 = getUChar(delta);
   30309             DIP("vpermilps $%u,%s,%s\n",
   30310                 imm8, dis_buf, nameXMMReg(rG));
   30311             assign(sV, loadLE(Ity_V128, mkexpr(addr)));
   30312          }
   30313          delta++;
   30314          putYMMRegLoAndZU(rG, mkexpr ( math_VPERMILPS_128 ( sV, imm8 ) ) );
   30315          goto decode_success;
   30316       }
   30317       break;
   30318 
   30319    case 0x05:
   30320       /* VPERMILPD imm8, xmm2/m128, xmm1 = VEX.128.66.0F3A.WIG 05 /r ib */
   30321       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   30322          UChar  modrm = getUChar(delta);
   30323          UInt   imm8  = 0;
   30324          UInt   rG    = gregOfRexRM(pfx, modrm);
   30325          IRTemp sV    = newTemp(Ity_V128);
   30326          if (epartIsReg(modrm)) {
   30327             UInt rE = eregOfRexRM(pfx, modrm);
   30328             delta += 1;
   30329             imm8 = getUChar(delta);
   30330             DIP("vpermilpd $%u,%s,%s\n",
   30331                 imm8, nameXMMReg(rE), nameXMMReg(rG));
   30332             assign(sV, getXMMReg(rE));
   30333          } else {
   30334             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30335             delta += alen;
   30336             imm8 = getUChar(delta);
   30337             DIP("vpermilpd $%u,%s,%s\n",
   30338                 imm8, dis_buf, nameXMMReg(rG));
   30339             assign(sV, loadLE(Ity_V128, mkexpr(addr)));
   30340          }
   30341          delta++;
   30342          IRTemp s1 = newTemp(Ity_I64);
   30343          IRTemp s0 = newTemp(Ity_I64);
   30344          assign(s1, unop(Iop_V128HIto64, mkexpr(sV)));
   30345          assign(s0, unop(Iop_V128to64,   mkexpr(sV)));
   30346          IRTemp dV = newTemp(Ity_V128);
   30347          assign(dV, binop(Iop_64HLtoV128,
   30348                                mkexpr((imm8 & (1<<1)) ? s1 : s0),
   30349                                mkexpr((imm8 & (1<<0)) ? s1 : s0)));
   30350          putYMMRegLoAndZU(rG, mkexpr(dV));
   30351          goto decode_success;
   30352       }
   30353       /* VPERMILPD imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.WIG 05 /r ib */
   30354       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   30355          UChar  modrm = getUChar(delta);
   30356          UInt   imm8  = 0;
   30357          UInt   rG    = gregOfRexRM(pfx, modrm);
   30358          IRTemp sV    = newTemp(Ity_V256);
   30359          if (epartIsReg(modrm)) {
   30360             UInt rE = eregOfRexRM(pfx, modrm);
   30361             delta += 1;
   30362             imm8 = getUChar(delta);
   30363             DIP("vpermilpd $%u,%s,%s\n",
   30364                 imm8, nameYMMReg(rE), nameYMMReg(rG));
   30365             assign(sV, getYMMReg(rE));
   30366          } else {
   30367             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30368             delta += alen;
   30369             imm8 = getUChar(delta);
   30370             DIP("vpermilpd $%u,%s,%s\n",
   30371                 imm8, dis_buf, nameYMMReg(rG));
   30372             assign(sV, loadLE(Ity_V256, mkexpr(addr)));
   30373          }
   30374          delta++;
   30375          IRTemp s3, s2, s1, s0;
   30376          s3 = s2 = s1 = s0 = IRTemp_INVALID;
   30377          breakupV256to64s(sV, &s3, &s2, &s1, &s0);
   30378          IRTemp dV = newTemp(Ity_V256);
   30379          assign(dV, IRExpr_Qop(Iop_64x4toV256,
   30380                                mkexpr((imm8 & (1<<3)) ? s3 : s2),
   30381                                mkexpr((imm8 & (1<<2)) ? s3 : s2),
   30382                                mkexpr((imm8 & (1<<1)) ? s1 : s0),
   30383                                mkexpr((imm8 & (1<<0)) ? s1 : s0)));
   30384          putYMMReg(rG, mkexpr(dV));
   30385          goto decode_success;
   30386       }
   30387       break;
   30388 
   30389    case 0x06:
   30390       /* VPERM2F128 imm8, ymm3/m256, ymm2, ymm1 = VEX.NDS.66.0F3A.W0 06 /r ib */
   30391       if (have66noF2noF3(pfx)
   30392           && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
   30393          UChar  modrm = getUChar(delta);
   30394          UInt   imm8  = 0;
   30395          UInt   rG    = gregOfRexRM(pfx, modrm);
   30396          UInt   rV    = getVexNvvvv(pfx);
   30397          IRTemp s00   = newTemp(Ity_V128);
   30398          IRTemp s01   = newTemp(Ity_V128);
   30399          IRTemp s10   = newTemp(Ity_V128);
   30400          IRTemp s11   = newTemp(Ity_V128);
   30401          assign(s00, getYMMRegLane128(rV, 0));
   30402          assign(s01, getYMMRegLane128(rV, 1));
   30403          if (epartIsReg(modrm)) {
   30404             UInt rE = eregOfRexRM(pfx, modrm);
   30405             delta += 1;
   30406             imm8 = getUChar(delta);
   30407             DIP("vperm2f128 $%u,%s,%s,%s\n",
   30408                 imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
   30409             assign(s10, getYMMRegLane128(rE, 0));
   30410             assign(s11, getYMMRegLane128(rE, 1));
   30411          } else {
   30412             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30413             delta += alen;
   30414             imm8 = getUChar(delta);
   30415             DIP("vperm2f128 $%u,%s,%s,%s\n",
   30416                 imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
   30417             assign(s10, loadLE(Ity_V128, binop(Iop_Add64,
   30418                                                mkexpr(addr), mkU64(0))));
   30419             assign(s11, loadLE(Ity_V128, binop(Iop_Add64,
   30420                                                mkexpr(addr), mkU64(16))));
   30421          }
   30422          delta++;
   30423 #        define SEL(_nn) (((_nn)==0) ? s00 : ((_nn)==1) ? s01 \
   30424                                            : ((_nn)==2) ? s10 : s11)
   30425          putYMMRegLane128(rG, 0, mkexpr(SEL((imm8 >> 0) & 3)));
   30426          putYMMRegLane128(rG, 1, mkexpr(SEL((imm8 >> 4) & 3)));
   30427 #        undef SEL
   30428          if (imm8 & (1<<3)) putYMMRegLane128(rG, 0, mkV128(0));
   30429          if (imm8 & (1<<7)) putYMMRegLane128(rG, 1, mkV128(0));
   30430          *uses_vvvv = True;
   30431          goto decode_success;
   30432       }
   30433       break;
   30434 
   30435    case 0x08:
   30436       /* VROUNDPS imm8, xmm2/m128, xmm1 */
   30437       /* VROUNDPS = VEX.NDS.128.66.0F3A.WIG 08 ib */
   30438       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   30439          UChar  modrm = getUChar(delta);
   30440          UInt   rG    = gregOfRexRM(pfx, modrm);
   30441          IRTemp src   = newTemp(Ity_V128);
   30442          IRTemp s0    = IRTemp_INVALID;
   30443          IRTemp s1    = IRTemp_INVALID;
   30444          IRTemp s2    = IRTemp_INVALID;
   30445          IRTemp s3    = IRTemp_INVALID;
   30446          IRTemp rm    = newTemp(Ity_I32);
   30447          Int    imm   = 0;
   30448 
   30449          modrm = getUChar(delta);
   30450 
   30451          if (epartIsReg(modrm)) {
   30452             UInt rE = eregOfRexRM(pfx, modrm);
   30453             assign( src, getXMMReg( rE ) );
   30454             imm = getUChar(delta+1);
   30455             if (imm & ~15) break;
   30456             delta += 1+1;
   30457             DIP( "vroundps $%d,%s,%s\n", imm, nameXMMReg(rE), nameXMMReg(rG) );
   30458          } else {
   30459             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30460             assign( src, loadLE(Ity_V128, mkexpr(addr) ) );
   30461             imm = getUChar(delta+alen);
   30462             if (imm & ~15) break;
   30463             delta += alen+1;
   30464             DIP( "vroundps $%d,%s,%s\n", imm, dis_buf, nameXMMReg(rG) );
   30465          }
   30466 
   30467          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   30468             that encoding is the same as the encoding for IRRoundingMode,
   30469             we can use that value directly in the IR as a rounding
   30470             mode. */
   30471          assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
   30472 
   30473          breakupV128to32s( src, &s3, &s2, &s1, &s0 );
   30474          putYMMRegLane128( rG, 1, mkV128(0) );
   30475 #        define CVT(s) binop(Iop_RoundF32toInt, mkexpr(rm), \
   30476                              unop(Iop_ReinterpI32asF32, mkexpr(s)))
   30477          putYMMRegLane32F( rG, 3, CVT(s3) );
   30478          putYMMRegLane32F( rG, 2, CVT(s2) );
   30479          putYMMRegLane32F( rG, 1, CVT(s1) );
   30480          putYMMRegLane32F( rG, 0, CVT(s0) );
   30481 #        undef CVT
   30482          goto decode_success;
   30483       }
   30484       /* VROUNDPS imm8, ymm2/m256, ymm1 */
   30485       /* VROUNDPS = VEX.NDS.256.66.0F3A.WIG 08 ib */
   30486       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   30487          UChar  modrm = getUChar(delta);
   30488          UInt   rG    = gregOfRexRM(pfx, modrm);
   30489          IRTemp src   = newTemp(Ity_V256);
   30490          IRTemp s0    = IRTemp_INVALID;
   30491          IRTemp s1    = IRTemp_INVALID;
   30492          IRTemp s2    = IRTemp_INVALID;
   30493          IRTemp s3    = IRTemp_INVALID;
   30494          IRTemp s4    = IRTemp_INVALID;
   30495          IRTemp s5    = IRTemp_INVALID;
   30496          IRTemp s6    = IRTemp_INVALID;
   30497          IRTemp s7    = IRTemp_INVALID;
   30498          IRTemp rm    = newTemp(Ity_I32);
   30499          Int    imm   = 0;
   30500 
   30501          modrm = getUChar(delta);
   30502 
   30503          if (epartIsReg(modrm)) {
   30504             UInt rE = eregOfRexRM(pfx, modrm);
   30505             assign( src, getYMMReg( rE ) );
   30506             imm = getUChar(delta+1);
   30507             if (imm & ~15) break;
   30508             delta += 1+1;
   30509             DIP( "vroundps $%d,%s,%s\n", imm, nameYMMReg(rE), nameYMMReg(rG) );
   30510          } else {
   30511             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30512             assign( src, loadLE(Ity_V256, mkexpr(addr) ) );
   30513             imm = getUChar(delta+alen);
   30514             if (imm & ~15) break;
   30515             delta += alen+1;
   30516             DIP( "vroundps $%d,%s,%s\n", imm, dis_buf, nameYMMReg(rG) );
   30517          }
   30518 
   30519          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   30520             that encoding is the same as the encoding for IRRoundingMode,
   30521             we can use that value directly in the IR as a rounding
   30522             mode. */
   30523          assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
   30524 
   30525          breakupV256to32s( src, &s7, &s6, &s5, &s4, &s3, &s2, &s1, &s0 );
   30526 #        define CVT(s) binop(Iop_RoundF32toInt, mkexpr(rm), \
   30527                              unop(Iop_ReinterpI32asF32, mkexpr(s)))
   30528          putYMMRegLane32F( rG, 7, CVT(s7) );
   30529          putYMMRegLane32F( rG, 6, CVT(s6) );
   30530          putYMMRegLane32F( rG, 5, CVT(s5) );
   30531          putYMMRegLane32F( rG, 4, CVT(s4) );
   30532          putYMMRegLane32F( rG, 3, CVT(s3) );
   30533          putYMMRegLane32F( rG, 2, CVT(s2) );
   30534          putYMMRegLane32F( rG, 1, CVT(s1) );
   30535          putYMMRegLane32F( rG, 0, CVT(s0) );
   30536 #        undef CVT
   30537          goto decode_success;
   30538       }
   30539 
   30540    case 0x09:
   30541       /* VROUNDPD imm8, xmm2/m128, xmm1 */
   30542       /* VROUNDPD = VEX.NDS.128.66.0F3A.WIG 09 ib */
   30543       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   30544          UChar  modrm = getUChar(delta);
   30545          UInt   rG    = gregOfRexRM(pfx, modrm);
   30546          IRTemp src   = newTemp(Ity_V128);
   30547          IRTemp s0    = IRTemp_INVALID;
   30548          IRTemp s1    = IRTemp_INVALID;
   30549          IRTemp rm    = newTemp(Ity_I32);
   30550          Int    imm   = 0;
   30551 
   30552          modrm = getUChar(delta);
   30553 
   30554          if (epartIsReg(modrm)) {
   30555             UInt rE = eregOfRexRM(pfx, modrm);
   30556             assign( src, getXMMReg( rE ) );
   30557             imm = getUChar(delta+1);
   30558             if (imm & ~15) break;
   30559             delta += 1+1;
   30560             DIP( "vroundpd $%d,%s,%s\n", imm, nameXMMReg(rE), nameXMMReg(rG) );
   30561          } else {
   30562             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30563             assign( src, loadLE(Ity_V128, mkexpr(addr) ) );
   30564             imm = getUChar(delta+alen);
   30565             if (imm & ~15) break;
   30566             delta += alen+1;
   30567             DIP( "vroundpd $%d,%s,%s\n", imm, dis_buf, nameXMMReg(rG) );
   30568          }
   30569 
   30570          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   30571             that encoding is the same as the encoding for IRRoundingMode,
   30572             we can use that value directly in the IR as a rounding
   30573             mode. */
   30574          assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
   30575 
   30576          breakupV128to64s( src, &s1, &s0 );
   30577          putYMMRegLane128( rG, 1, mkV128(0) );
   30578 #        define CVT(s) binop(Iop_RoundF64toInt, mkexpr(rm), \
   30579                              unop(Iop_ReinterpI64asF64, mkexpr(s)))
   30580          putYMMRegLane64F( rG, 1, CVT(s1) );
   30581          putYMMRegLane64F( rG, 0, CVT(s0) );
   30582 #        undef CVT
   30583          goto decode_success;
   30584       }
   30585       /* VROUNDPD imm8, ymm2/m256, ymm1 */
   30586       /* VROUNDPD = VEX.NDS.256.66.0F3A.WIG 09 ib */
   30587       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   30588          UChar  modrm = getUChar(delta);
   30589          UInt   rG    = gregOfRexRM(pfx, modrm);
   30590          IRTemp src   = newTemp(Ity_V256);
   30591          IRTemp s0    = IRTemp_INVALID;
   30592          IRTemp s1    = IRTemp_INVALID;
   30593          IRTemp s2    = IRTemp_INVALID;
   30594          IRTemp s3    = IRTemp_INVALID;
   30595          IRTemp rm    = newTemp(Ity_I32);
   30596          Int    imm   = 0;
   30597 
   30598          modrm = getUChar(delta);
   30599 
   30600          if (epartIsReg(modrm)) {
   30601             UInt rE = eregOfRexRM(pfx, modrm);
   30602             assign( src, getYMMReg( rE ) );
   30603             imm = getUChar(delta+1);
   30604             if (imm & ~15) break;
   30605             delta += 1+1;
   30606             DIP( "vroundpd $%d,%s,%s\n", imm, nameYMMReg(rE), nameYMMReg(rG) );
   30607          } else {
   30608             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30609             assign( src, loadLE(Ity_V256, mkexpr(addr) ) );
   30610             imm = getUChar(delta+alen);
   30611             if (imm & ~15) break;
   30612             delta += alen+1;
   30613             DIP( "vroundps $%d,%s,%s\n", imm, dis_buf, nameYMMReg(rG) );
   30614          }
   30615 
   30616          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   30617             that encoding is the same as the encoding for IRRoundingMode,
   30618             we can use that value directly in the IR as a rounding
   30619             mode. */
   30620          assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
   30621 
   30622          breakupV256to64s( src, &s3, &s2, &s1, &s0 );
   30623 #        define CVT(s) binop(Iop_RoundF64toInt, mkexpr(rm), \
   30624                              unop(Iop_ReinterpI64asF64, mkexpr(s)))
   30625          putYMMRegLane64F( rG, 3, CVT(s3) );
   30626          putYMMRegLane64F( rG, 2, CVT(s2) );
   30627          putYMMRegLane64F( rG, 1, CVT(s1) );
   30628          putYMMRegLane64F( rG, 0, CVT(s0) );
   30629 #        undef CVT
   30630          goto decode_success;
   30631       }
   30632 
   30633    case 0x0A:
   30634    case 0x0B:
   30635       /* VROUNDSS imm8, xmm3/m32, xmm2, xmm1 */
   30636       /* VROUNDSS = VEX.NDS.128.66.0F3A.WIG 0A ib */
      /* VROUNDSD imm8, xmm3/m64, xmm2, xmm1 */
      /* VROUNDSD = VEX.NDS.128.66.0F3A.WIG 0B ib */
      /* Scalar round: round the low F32/F64 lane of the E operand per
         the imm8 rounding control, merge the remaining lanes from rV,
         and zero the upper half of the YMM destination. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         Bool   isD   = opc == 0x0B;   /* 0x0B = VROUNDSD, else VROUNDSS */
         IRTemp src   = newTemp(isD ? Ity_F64 : Ity_F32);
         IRTemp res   = newTemp(isD ? Ity_F64 : Ity_F32);
         Int    imm   = 0;

         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            assign( src,
                    isD ? getXMMRegLane64F(rE, 0) : getXMMRegLane32F(rE, 0) );
            imm = getUChar(delta+1);
            /* Only imm bits 3:0 are defined; refuse anything else so the
               insn falls through to decode-failure. */
            if (imm & ~15) break;
            delta += 1+1;   /* modrm byte + imm8 byte */
            DIP( "vrounds%c $%d,%s,%s,%s\n",
                 isD ? 'd' : 's',
                 imm, nameXMMReg( rE ), nameXMMReg( rV ), nameXMMReg( rG ) );
         } else {
            /* Final '1' tells disAMode an imm8 trails the amode. */
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
            imm = getUChar(delta+alen);
            if (imm & ~15) break;
            delta += alen+1;
            DIP( "vrounds%c $%d,%s,%s,%s\n",
                 isD ? 'd' : 's',
                 imm, dis_buf, nameXMMReg( rV ), nameXMMReg( rG ) );
         }

         /* (imm & 3) contains an Intel-encoded rounding mode.  Because
            that encoding is the same as the encoding for IRRoundingMode,
            we can use that value directly in the IR as a rounding
            mode. */
         /* imm bit 2 set = use MXCSR.RC instead of the immediate mode. */
         assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
                           (imm & 4) ? get_sse_roundingmode()
                                     : mkU32(imm & 3),
                           mkexpr(src)) );

         /* Write the rounded low lane; remaining 128-bit lanes come from
            rV; upper YMM half is zeroed (VEX semantics). */
         if (isD)
            putXMMRegLane64F( rG, 0, mkexpr(res) );
         else {
            putXMMRegLane32F( rG, 0, mkexpr(res) );
            putXMMRegLane32F( rG, 1, getXMMRegLane32F( rV, 1 ) );
         }
         putXMMRegLane64F( rG, 1, getXMMRegLane64F( rV, 1 ) );
         putYMMRegLane128( rG, 1, mkV128(0) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   30690 
   case 0x0C:
      /* VBLENDPS imm8, ymm3/m256, ymm2, ymm1 */
      /* VBLENDPS = VEX.NDS.256.66.0F3A.WIG 0C /r ib */
      /* Select 32-bit lanes from E (imm bit set) or V (bit clear);
         lane arithmetic is done by math_BLENDPS_256. */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar  modrm = getUChar(delta);
         UInt   imm8;
         UInt   rG    = gregOfRexRM(pfx, modrm);   /* destination */
         UInt   rV    = getVexNvvvv(pfx);          /* first source, from vvvv */
         IRTemp sV    = newTemp(Ity_V256);
         IRTemp sE    = newTemp(Ity_V256);
         assign ( sV, getYMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            imm8 = getUChar(delta);
            DIP("vblendps $%u,%s,%s,%s\n",
                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
            assign(sE, getYMMReg(rE));
         } else {
            /* Final '1' tells disAMode an imm8 follows the amode. */
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += alen;
            imm8 = getUChar(delta);
            DIP("vblendps $%u,%s,%s,%s\n",
                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
            assign(sE, loadLE(Ity_V256, mkexpr(addr)));
         }
         delta++;   /* step over the imm8 byte */
         putYMMReg( rG,
                    mkexpr( math_BLENDPS_256( sE, sV, imm8) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VBLENDPS imm8, xmm3/m128, xmm2, xmm1 */
      /* VBLENDPS = VEX.NDS.128.66.0F3A.WIG 0C /r ib */
      /* 128-bit variant; upper YMM half of the dest is zeroed. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   imm8;
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp sV    = newTemp(Ity_V128);
         IRTemp sE    = newTemp(Ity_V128);
         assign ( sV, getXMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            imm8 = getUChar(delta);
            DIP("vblendps $%u,%s,%s,%s\n",
                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
            assign(sE, getXMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += alen;
            imm8 = getUChar(delta);
            DIP("vblendps $%u,%s,%s,%s\n",
                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
            assign(sE, loadLE(Ity_V128, mkexpr(addr)));
         }
         delta++;
         putYMMRegLoAndZU( rG,
                           mkexpr( math_BLENDPS_128( sE, sV, imm8) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   30755 
   case 0x0D:
      /* VBLENDPD imm8, ymm3/m256, ymm2, ymm1 */
      /* VBLENDPD = VEX.NDS.256.66.0F3A.WIG 0D /r ib */
      /* As VBLENDPS but selects 64-bit lanes (imm bits 3:0 used). */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar  modrm = getUChar(delta);
         UInt   imm8;
         UInt   rG    = gregOfRexRM(pfx, modrm);   /* destination */
         UInt   rV    = getVexNvvvv(pfx);          /* first source, from vvvv */
         IRTemp sV    = newTemp(Ity_V256);
         IRTemp sE    = newTemp(Ity_V256);
         assign ( sV, getYMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            imm8 = getUChar(delta);
            DIP("vblendpd $%u,%s,%s,%s\n",
                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
            assign(sE, getYMMReg(rE));
         } else {
            /* Final '1' tells disAMode an imm8 follows the amode. */
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += alen;
            imm8 = getUChar(delta);
            DIP("vblendpd $%u,%s,%s,%s\n",
                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
            assign(sE, loadLE(Ity_V256, mkexpr(addr)));
         }
         delta++;   /* step over the imm8 byte */
         putYMMReg( rG,
                    mkexpr( math_BLENDPD_256( sE, sV, imm8) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VBLENDPD imm8, xmm3/m128, xmm2, xmm1 */
      /* VBLENDPD = VEX.NDS.128.66.0F3A.WIG 0D /r ib */
      /* 128-bit variant; upper YMM half of the dest is zeroed. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   imm8;
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp sV    = newTemp(Ity_V128);
         IRTemp sE    = newTemp(Ity_V128);
         assign ( sV, getXMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            imm8 = getUChar(delta);
            DIP("vblendpd $%u,%s,%s,%s\n",
                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
            assign(sE, getXMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += alen;
            imm8 = getUChar(delta);
            DIP("vblendpd $%u,%s,%s,%s\n",
                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
            assign(sE, loadLE(Ity_V128, mkexpr(addr)));
         }
         delta++;
         putYMMRegLoAndZU( rG,
                           mkexpr( math_BLENDPD_128( sE, sV, imm8) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   30820 
   case 0x0E:
      /* VPBLENDW imm8, xmm3/m128, xmm2, xmm1 */
      /* VPBLENDW = VEX.NDS.128.66.0F3A.WIG 0E /r ib */
      /* Word-granularity blend: each imm8 bit selects a 16-bit lane
         from E (set) or V (clear). */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   imm8;
         UInt   rG    = gregOfRexRM(pfx, modrm);   /* destination */
         UInt   rV    = getVexNvvvv(pfx);          /* first source, from vvvv */
         IRTemp sV    = newTemp(Ity_V128);
         IRTemp sE    = newTemp(Ity_V128);
         assign ( sV, getXMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            imm8 = getUChar(delta);
            DIP("vpblendw $%u,%s,%s,%s\n",
                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
            assign(sE, getXMMReg(rE));
         } else {
            /* Final '1' tells disAMode an imm8 follows the amode. */
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += alen;
            imm8 = getUChar(delta);
            DIP("vpblendw $%u,%s,%s,%s\n",
                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
            assign(sE, loadLE(Ity_V128, mkexpr(addr)));
         }
         delta++;   /* step over the imm8 byte */
         putYMMRegLoAndZU( rG,
                           mkexpr( math_PBLENDW_128( sE, sV, imm8) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPBLENDW imm8, ymm3/m256, ymm2, ymm1 */
      /* VPBLENDW = VEX.NDS.256.66.0F3A.WIG 0E /r ib */
      /* AVX2 256-bit form: the same imm8 is applied independently to
         each 128-bit half. */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar  modrm = getUChar(delta);
         UInt   imm8;
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp sV    = newTemp(Ity_V256);
         IRTemp sE    = newTemp(Ity_V256);
         IRTemp sVhi, sVlo, sEhi, sElo;
         sVhi = sVlo = sEhi = sElo = IRTemp_INVALID;
         assign ( sV, getYMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            imm8 = getUChar(delta);
            DIP("vpblendw $%u,%s,%s,%s\n",
                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
            assign(sE, getYMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += alen;
            imm8 = getUChar(delta);
            DIP("vpblendw $%u,%s,%s,%s\n",
                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
            assign(sE, loadLE(Ity_V256, mkexpr(addr)));
         }
         delta++;
         /* Split both sources into 128-bit halves and blend each half
            with the shared imm8. */
         breakupV256toV128s( sV, &sVhi, &sVlo );
         breakupV256toV128s( sE, &sEhi, &sElo );
         putYMMReg( rG, binop( Iop_V128HLtoV256,
                               mkexpr( math_PBLENDW_128( sEhi, sVhi, imm8) ),
                               mkexpr( math_PBLENDW_128( sElo, sVlo, imm8) ) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   30890 
   case 0x0F:
      /* VPALIGNR imm8, xmm3/m128, xmm2, xmm1 */
      /* VPALIGNR = VEX.NDS.128.66.0F3A.WIG 0F /r ib */
      /* Byte-align: concatenate V:E and extract 16 bytes starting at
         byte offset imm8 (math_PALIGNR_XMM does the work). */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);   /* destination */
         UInt   rV    = getVexNvvvv(pfx);          /* first source, from vvvv */
         IRTemp sV    = newTemp(Ity_V128);
         IRTemp dV    = newTemp(Ity_V128);
         UInt   imm8;

         assign( dV, getXMMReg(rV) );

         if ( epartIsReg( modrm ) ) {
            UInt   rE = eregOfRexRM(pfx, modrm);
            assign( sV, getXMMReg(rE) );
            imm8 = getUChar(delta+1);
            delta += 1+1;   /* modrm byte + imm8 byte */
            DIP("vpalignr $%u,%s,%s,%s\n", imm8, nameXMMReg(rE),
                                           nameXMMReg(rV), nameXMMReg(rG));
         } else {
            /* Final '1' tells disAMode an imm8 follows the amode. */
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            imm8 = getUChar(delta+alen);
            delta += alen+1;
            DIP("vpalignr $%u,%s,%s,%s\n", imm8, dis_buf,
                                           nameXMMReg(rV), nameXMMReg(rG));
         }

         IRTemp res = math_PALIGNR_XMM( sV, dV, imm8 );
         putYMMRegLoAndZU( rG, mkexpr(res) );   /* zero upper YMM half */
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPALIGNR imm8, ymm3/m256, ymm2, ymm1 */
      /* VPALIGNR = VEX.NDS.256.66.0F3A.WIG 0F /r ib */
      /* AVX2 256-bit form: the alignment is applied independently to
         each 128-bit half (not across the whole 256 bits). */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp sV    = newTemp(Ity_V256);
         IRTemp dV    = newTemp(Ity_V256);
         IRTemp sHi, sLo, dHi, dLo;
         sHi = sLo = dHi = dLo = IRTemp_INVALID;
         UInt   imm8;

         assign( dV, getYMMReg(rV) );

         if ( epartIsReg( modrm ) ) {
            UInt   rE = eregOfRexRM(pfx, modrm);
            assign( sV, getYMMReg(rE) );
            imm8 = getUChar(delta+1);
            delta += 1+1;
            DIP("vpalignr $%u,%s,%s,%s\n", imm8, nameYMMReg(rE),
                                           nameYMMReg(rV), nameYMMReg(rG));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
            imm8 = getUChar(delta+alen);
            delta += alen+1;
            DIP("vpalignr $%u,%s,%s,%s\n", imm8, dis_buf,
                                           nameYMMReg(rV), nameYMMReg(rG));
         }

         breakupV256toV128s( dV, &dHi, &dLo );
         breakupV256toV128s( sV, &sHi, &sLo );
         putYMMReg( rG, binop( Iop_V128HLtoV256,
                               mkexpr( math_PALIGNR_XMM( sHi, dHi, imm8 ) ),
                               mkexpr( math_PALIGNR_XMM( sLo, dLo, imm8 ) ) )
                    );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   30965 
   case 0x14:
      /* VPEXTRB imm8, xmm2, reg/m8 = VEX.128.66.0F3A.W0 14 /r ib */
      /* Extract a byte lane; decode is delegated to the shared SSE4
         helper.  NOTE(review): the False/!isAvx flag here differs from
         the True/isAvx flags passed by the neighbouring cases — confirm
         against dis_PEXTRB_128_GtoE's parameter meaning. */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         delta = dis_PEXTRB_128_GtoE( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;

   case 0x15:
      /* VPEXTRW imm8, reg/m16, xmm2 */
      /* VPEXTRW = VEX.128.66.0F3A.W0 15 /r ib */
      /* Extract a 16-bit lane via the shared helper, in AVX mode. */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         delta = dis_PEXTRW( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      break;

   case 0x16:
      /* VPEXTRD imm8, r32/m32, xmm2 */
      /* VPEXTRD = VEX.128.66.0F3A.W0 16 /r ib */
      /* REX.W selects between the 32-bit (W0) and 64-bit (W1) forms. */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         delta = dis_PEXTRD( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VPEXTRQ = VEX.128.66.0F3A.W1 16 /r ib */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
         delta = dis_PEXTRQ( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      break;

   case 0x17:
      /* VEXTRACTPS imm8, xmm1, r32/m32 = VEX.128.66.0F3A.WIG 17 /r ib */
      /* Extract a 32-bit FP lane via the shared helper, in AVX mode. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_EXTRACTPS( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      break;
   31008 
   case 0x18:
      /* VINSERTF128 r/m, rV, rD
         ::: rD = insertinto(a lane in rV, 128 bits from r/m) */
      /* VINSERTF128 = VEX.NDS.256.66.0F3A.W0 18 /r ib */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   ib    = 0;                         /* lane selector imm8 */
         UInt   rG    = gregOfRexRM(pfx, modrm);   /* destination */
         UInt   rV    = getVexNvvvv(pfx);          /* pass-through source */
         IRTemp t128  = newTemp(Ity_V128);         /* the 128 bits to insert */
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            assign(t128, getXMMReg(rE));
            ib = getUChar(delta);
            DIP("vinsertf128 $%u,%s,%s,%s\n",
                ib, nameXMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
         } else {
            /* Final '1' tells disAMode an imm8 follows the amode. */
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign(t128, loadLE(Ity_V128, mkexpr(addr)));
            delta += alen;
            ib = getUChar(delta);
            DIP("vinsertf128 $%u,%s,%s,%s\n",
                ib, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
         }
         delta++;   /* step over the imm8 byte */
         /* Copy both lanes of rV, then overwrite the selected lane
            (only bit 0 of ib is significant). */
         putYMMRegLane128(rG, 0,   getYMMRegLane128(rV, 0));
         putYMMRegLane128(rG, 1,   getYMMRegLane128(rV, 1));
         putYMMRegLane128(rG, ib & 1, mkexpr(t128));
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   31043 
   31044    case 0x19:
   31045      /* VEXTRACTF128 $lane_no, rS, r/m
   31046         ::: r/m:V128 = a lane of rS:V256 (RM format) */
   31047      /* VEXTRACTF128 = VEX.256.66.0F3A.W0 19 /r ib */
   31048       if (have66noF2noF3(pfx)
   31049           && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
   31050          UChar  modrm = getUChar(delta);
   31051          UInt   ib    = 0;
   31052          UInt   rS    = gregOfRexRM(pfx, modrm);
   31053          IRTemp t128  = newTemp(Ity_V128);
   31054          if (epartIsReg(modrm)) {
   31055             UInt rD = eregOfRexRM(pfx, modrm);
   31056             delta += 1;
   31057             ib = getUChar(delta);
   31058             assign(t128, getYMMRegLane128(rS, ib & 1));
   31059             putYMMRegLoAndZU(rD, mkexpr(t128));
   31060             DIP("vextractf128 $%u,%s,%s\n",
   31061                 ib, nameXMMReg(rS), nameYMMReg(rD));
   31062          } else {
   31063             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   31064             delta += alen;
   31065             ib = getUChar(delta);
   31066             assign(t128, getYMMRegLane128(rS, ib & 1));
   31067             storeLE(mkexpr(addr), mkexpr(t128));
   31068             DIP("vextractf128 $%u,%s,%s\n",
   31069                 ib, nameYMMReg(rS), dis_buf);
   31070          }
   31071          delta++;
   31072          /* doesn't use vvvv */
   31073          goto decode_success;
   31074       }
   31075       break;
   31076 
   case 0x20:
      /* VPINSRB r32/m8, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W0 20 /r ib */
      /* Insert a byte from a GPR (low 8 bits) or memory into lane
         imm8[3:0] of rV, writing the result to rG. */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm  = getUChar(delta);
         UInt   rG     = gregOfRexRM(pfx, modrm);   /* destination */
         UInt   rV     = getVexNvvvv(pfx);          /* base vector, from vvvv */
         Int    imm8;
         IRTemp src_u8 = newTemp(Ity_I8);

         if ( epartIsReg( modrm ) ) {
            UInt rE = eregOfRexRM(pfx,modrm);
            imm8 = (Int)(getUChar(delta+1) & 15);   /* 16 byte lanes */
            assign( src_u8, unop(Iop_32to8, getIReg32( rE )) );
            delta += 1+1;   /* modrm byte + imm8 byte */
            DIP( "vpinsrb $%d,%s,%s,%s\n",
                 imm8, nameIReg32(rE), nameXMMReg(rV), nameXMMReg(rG) );
         } else {
            /* Final '1' tells disAMode an imm8 follows the amode. */
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            imm8 = (Int)(getUChar(delta+alen) & 15);
            assign( src_u8, loadLE( Ity_I8, mkexpr(addr) ) );
            delta += alen+1;
            DIP( "vpinsrb $%d,%s,%s,%s\n",
                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
         }

         IRTemp src_vec = newTemp(Ity_V128);
         assign(src_vec, getXMMReg( rV ));
         IRTemp res_vec = math_PINSRB_128( src_vec, src_u8, imm8 );
         putYMMRegLoAndZU( rG, mkexpr(res_vec) );   /* zero upper YMM half */
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   31111 
   31112    case 0x21:
   31113       /* VINSERTPS imm8, xmm3/m32, xmm2, xmm1
   31114          = VEX.NDS.128.66.0F3A.WIG 21 /r ib */
   31115       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   31116          UChar  modrm = getUChar(delta);
   31117          UInt   rG    = gregOfRexRM(pfx, modrm);
   31118          UInt   rV    = getVexNvvvv(pfx);
   31119          UInt   imm8;
   31120          IRTemp d2ins = newTemp(Ity_I32); /* comes from the E part */
   31121          const IRTemp inval = IRTemp_INVALID;
   31122 
   31123          if ( epartIsReg( modrm ) ) {
   31124             UInt   rE = eregOfRexRM(pfx, modrm);
   31125             IRTemp vE = newTemp(Ity_V128);
   31126             assign( vE, getXMMReg(rE) );
   31127             IRTemp dsE[4] = { inval, inval, inval, inval };
   31128             breakupV128to32s( vE, &dsE[3], &dsE[2], &dsE[1], &dsE[0] );
   31129             imm8 = getUChar(delta+1);
   31130             d2ins = dsE[(imm8 >> 6) & 3]; /* "imm8_count_s" */
   31131             delta += 1+1;
   31132             DIP( "insertps $%u, %s,%s\n",
   31133                  imm8, nameXMMReg(rE), nameXMMReg(rG) );
   31134          } else {
   31135             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   31136             assign( d2ins, loadLE( Ity_I32, mkexpr(addr) ) );
   31137             imm8 = getUChar(delta+alen);
   31138             delta += alen+1;
   31139             DIP( "insertps $%u, %s,%s\n",
   31140                  imm8, dis_buf, nameXMMReg(rG) );
   31141          }
   31142 
   31143          IRTemp vV = newTemp(Ity_V128);
   31144          assign( vV, getXMMReg(rV) );
   31145 
   31146          putYMMRegLoAndZU( rG, mkexpr(math_INSERTPS( vV, d2ins, imm8 )) );
   31147          *uses_vvvv = True;
   31148          goto decode_success;
   31149       }
   31150       break;
   31151 
   31152    case 0x22:
   31153       /* VPINSRD r32/m32, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W0 22 /r ib */
   31154       if (have66noF2noF3(pfx)
   31155           && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
   31156          UChar  modrm = getUChar(delta);
   31157          UInt   rG    = gregOfRexRM(pfx, modrm);
   31158          UInt   rV    = getVexNvvvv(pfx);
   31159          Int    imm8_10;
   31160          IRTemp src_u32 = newTemp(Ity_I32);
   31161 
   31162          if ( epartIsReg( modrm ) ) {
   31163             UInt rE = eregOfRexRM(pfx,modrm);
   31164             imm8_10 = (Int)(getUChar(delta+1) & 3);
   31165             assign( src_u32, getIReg32( rE ) );
   31166             delta += 1+1;
   31167             DIP( "vpinsrd $%d,%s,%s,%s\n",
   31168                  imm8_10, nameIReg32(rE), nameXMMReg(rV), nameXMMReg(rG) );
   31169          } else {
   31170             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   31171             imm8_10 = (Int)(getUChar(delta+alen) & 3);
   31172             assign( src_u32, loadLE( Ity_I32, mkexpr(addr) ) );
   31173             delta += alen+1;
   31174             DIP( "vpinsrd $%d,%s,%s,%s\n",
   31175                  imm8_10, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
   31176          }
   31177 
   31178          IRTemp src_vec = newTemp(Ity_V128);
   31179          assign(src_vec, getXMMReg( rV ));
   31180          IRTemp res_vec = math_PINSRD_128( src_vec, src_u32, imm8_10 );
   31181          putYMMRegLoAndZU( rG, mkexpr(res_vec) );
   31182          *uses_vvvv = True;
   31183          goto decode_success;
   31184       }
   31185       /* VPINSRQ r64/m64, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W1 22 /r ib */
   31186       if (have66noF2noF3(pfx)
   31187           && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
   31188          UChar  modrm = getUChar(delta);
   31189          UInt   rG    = gregOfRexRM(pfx, modrm);
   31190          UInt   rV    = getVexNvvvv(pfx);
   31191          Int    imm8_0;
   31192          IRTemp src_u64 = newTemp(Ity_I64);
   31193 
   31194          if ( epartIsReg( modrm ) ) {
   31195             UInt rE = eregOfRexRM(pfx,modrm);
   31196             imm8_0 = (Int)(getUChar(delta+1) & 1);
   31197             assign( src_u64, getIReg64( rE ) );
   31198             delta += 1+1;
   31199             DIP( "vpinsrq $%d,%s,%s,%s\n",
   31200                  imm8_0, nameIReg64(rE), nameXMMReg(rV), nameXMMReg(rG) );
   31201          } else {
   31202             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   31203             imm8_0 = (Int)(getUChar(delta+alen) & 1);
   31204             assign( src_u64, loadLE( Ity_I64, mkexpr(addr) ) );
   31205             delta += alen+1;
   31206             DIP( "vpinsrd $%d,%s,%s,%s\n",
   31207                  imm8_0, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
   31208          }
   31209 
   31210          IRTemp src_vec = newTemp(Ity_V128);
   31211          assign(src_vec, getXMMReg( rV ));
   31212          IRTemp res_vec = math_PINSRQ_128( src_vec, src_u64, imm8_0 );
   31213          putYMMRegLoAndZU( rG, mkexpr(res_vec) );
   31214          *uses_vvvv = True;
   31215          goto decode_success;
   31216       }
   31217       break;
   31218 
   case 0x38:
      /* VINSERTI128 r/m, rV, rD
         ::: rD = insertinto(a lane in rV, 128 bits from r/m) */
      /* VINSERTI128 = VEX.NDS.256.66.0F3A.W0 38 /r ib */
      /* AVX2 integer twin of VINSERTF128 (case 0x18); identical IR. */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   ib    = 0;                         /* lane selector imm8 */
         UInt   rG    = gregOfRexRM(pfx, modrm);   /* destination */
         UInt   rV    = getVexNvvvv(pfx);          /* pass-through source */
         IRTemp t128  = newTemp(Ity_V128);         /* the 128 bits to insert */
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            assign(t128, getXMMReg(rE));
            ib = getUChar(delta);
            DIP("vinserti128 $%u,%s,%s,%s\n",
                ib, nameXMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
         } else {
            /* Final '1' tells disAMode an imm8 follows the amode. */
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign(t128, loadLE(Ity_V128, mkexpr(addr)));
            delta += alen;
            ib = getUChar(delta);
            DIP("vinserti128 $%u,%s,%s,%s\n",
                ib, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
         }
         delta++;   /* step over the imm8 byte */
         /* Copy both lanes of rV, then overwrite the selected lane
            (only bit 0 of ib is significant). */
         putYMMRegLane128(rG, 0,   getYMMRegLane128(rV, 0));
         putYMMRegLane128(rG, 1,   getYMMRegLane128(rV, 1));
         putYMMRegLane128(rG, ib & 1, mkexpr(t128));
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   31253 
   31254    case 0x39:
   31255       /* VEXTRACTI128 $lane_no, rS, r/m
   31256          ::: r/m:V128 = a lane of rS:V256 (RM format) */
   31257       /* VEXTRACTI128 = VEX.256.66.0F3A.W0 39 /r ib */
   31258       if (have66noF2noF3(pfx)
   31259           && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
   31260          UChar  modrm = getUChar(delta);
   31261          UInt   ib    = 0;
   31262          UInt   rS    = gregOfRexRM(pfx, modrm);
   31263          IRTemp t128  = newTemp(Ity_V128);
   31264          if (epartIsReg(modrm)) {
   31265             UInt rD = eregOfRexRM(pfx, modrm);
   31266             delta += 1;
   31267             ib = getUChar(delta);
   31268             assign(t128, getYMMRegLane128(rS, ib & 1));
   31269             putYMMRegLoAndZU(rD, mkexpr(t128));
   31270             DIP("vextracti128 $%u,%s,%s\n",
   31271                 ib, nameXMMReg(rS), nameYMMReg(rD));
   31272          } else {
   31273             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   31274             delta += alen;
   31275             ib = getUChar(delta);
   31276             assign(t128, getYMMRegLane128(rS, ib & 1));
   31277             storeLE(mkexpr(addr), mkexpr(t128));
   31278             DIP("vextracti128 $%u,%s,%s\n",
   31279                 ib, nameYMMReg(rS), dis_buf);
   31280          }
   31281          delta++;
   31282          /* doesn't use vvvv */
   31283          goto decode_success;
   31284       }
   31285       break;
   31286 
   case 0x40:
      /* VDPPS imm8, xmm3/m128,xmm2,xmm1 = VEX.NDS.128.66.0F3A.WIG 40 /r ib */
      /* Dot-product of packed singles; imm8 selects the input lanes
         (high nibble) and the broadcast mask (low nibble) — all handled
         by math_DPPS_128. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm   = getUChar(delta);
         UInt   rG      = gregOfRexRM(pfx, modrm);   /* destination */
         UInt   rV      = getVexNvvvv(pfx);          /* first source, vvvv */
         IRTemp dst_vec = newTemp(Ity_V128);
         Int    imm8;
         if (epartIsReg( modrm )) {
            UInt rE = eregOfRexRM(pfx,modrm);
            imm8 = (Int)getUChar(delta+1);
            assign( dst_vec, getXMMReg( rE ) );
            delta += 1+1;   /* modrm byte + imm8 byte */
            DIP( "vdpps $%d,%s,%s,%s\n",
                 imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
         } else {
            /* Final '1' tells disAMode an imm8 follows the amode. */
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            imm8 = (Int)getUChar(delta+alen);
            assign( dst_vec, loadLE( Ity_V128, mkexpr(addr) ) );
            delta += alen+1;
            DIP( "vdpps $%d,%s,%s,%s\n",
                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
         }

         IRTemp src_vec = newTemp(Ity_V128);
         assign(src_vec, getXMMReg( rV ));
         IRTemp res_vec = math_DPPS_128( src_vec, dst_vec, imm8 );
         putYMMRegLoAndZU( rG, mkexpr(res_vec) );   /* zero upper YMM half */
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VDPPS imm8, ymm3/m256,ymm2,ymm1 = VEX.NDS.256.66.0F3A.WIG 40 /r ib */
      /* 256-bit form: the same imm8 is applied to each 128-bit half
         independently. */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar  modrm   = getUChar(delta);
         UInt   rG      = gregOfRexRM(pfx, modrm);
         UInt   rV      = getVexNvvvv(pfx);
         IRTemp dst_vec = newTemp(Ity_V256);
         Int    imm8;
         if (epartIsReg( modrm )) {
            UInt rE = eregOfRexRM(pfx,modrm);
            imm8 = (Int)getUChar(delta+1);
            assign( dst_vec, getYMMReg( rE ) );
            delta += 1+1;
            DIP( "vdpps $%d,%s,%s,%s\n",
                 imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            imm8 = (Int)getUChar(delta+alen);
            assign( dst_vec, loadLE( Ity_V256, mkexpr(addr) ) );
            delta += alen+1;
            DIP( "vdpps $%d,%s,%s,%s\n",
                 imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
         }

         IRTemp src_vec = newTemp(Ity_V256);
         assign(src_vec, getYMMReg( rV ));
         IRTemp s0, s1, d0, d1;
         s0 = s1 = d0 = d1 = IRTemp_INVALID;
         breakupV256toV128s( dst_vec, &d1, &d0 );
         breakupV256toV128s( src_vec, &s1, &s0 );
         putYMMReg( rG, binop( Iop_V128HLtoV256,
                               mkexpr( math_DPPS_128(s1, d1, imm8) ),
                               mkexpr( math_DPPS_128(s0, d0, imm8) ) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   31354 
   case 0x41:
      /* VDPPD imm8, xmm3/m128,xmm2,xmm1 = VEX.NDS.128.66.0F3A.WIG 41 /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm   = getUChar(delta);
         UInt   rG      = gregOfRexRM(pfx, modrm);
         UInt   rV      = getVexNvvvv(pfx);
         IRTemp dst_vec = newTemp(Ity_V128);
         Int    imm8;
         if (epartIsReg( modrm )) {
            /* Register source: imm8 immediately follows the modrm byte. */
            UInt rE = eregOfRexRM(pfx,modrm);
            imm8 = (Int)getUChar(delta+1);
            assign( dst_vec, getXMMReg( rE ) );
            delta += 1+1;
            DIP( "vdppd $%d,%s,%s,%s\n",
                 imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
         } else {
            /* Memory source: imm8 is the 1 byte following the amode. */
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            imm8 = (Int)getUChar(delta+alen);
            assign( dst_vec, loadLE( Ity_V128, mkexpr(addr) ) );
            delta += alen+1;
            DIP( "vdppd $%d,%s,%s,%s\n",
                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
         }

         IRTemp src_vec = newTemp(Ity_V128);
         assign(src_vec, getXMMReg( rV ));
         IRTemp res_vec = math_DPPD_128( src_vec, dst_vec, imm8 );
         /* Write the 128-bit result and zero the upper YMM lane, as
            VEX.128-encoded instructions require. */
         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   31387 
   case 0x42:
      /* VMPSADBW imm8, xmm3/m128,xmm2,xmm1 */
      /* VMPSADBW = VEX.NDS.128.66.0F3A.WIG 42 /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm   = getUChar(delta);
         Int    imm8;
         IRTemp src_vec = newTemp(Ity_V128);
         IRTemp dst_vec = newTemp(Ity_V128);
         UInt   rG      = gregOfRexRM(pfx, modrm);
         UInt   rV      = getVexNvvvv(pfx);

         assign( dst_vec, getXMMReg(rV) );

         if ( epartIsReg( modrm ) ) {
            UInt rE = eregOfRexRM(pfx, modrm);

            imm8 = (Int)getUChar(delta+1);
            assign( src_vec, getXMMReg(rE) );
            delta += 1+1;
            DIP( "vmpsadbw $%d, %s,%s,%s\n", imm8,
                 nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
                             1/* imm8 is 1 byte after the amode */ );
            assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
            imm8 = (Int)getUChar(delta+alen);
            delta += alen+1;
            DIP( "vmpsadbw $%d, %s,%s,%s\n", imm8,
                 dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
         }

         /* Write the 128-bit result, zeroing the upper YMM lane
            (VEX.128 semantics). */
         putYMMRegLoAndZU( rG, mkexpr( math_MPSADBW_128(dst_vec,
                                                        src_vec, imm8) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VMPSADBW imm8, ymm3/m256,ymm2,ymm1 */
      /* VMPSADBW = VEX.NDS.256.66.0F3A.WIG 42 /r ib */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar  modrm   = getUChar(delta);
         Int    imm8;
         IRTemp src_vec = newTemp(Ity_V256);
         IRTemp dst_vec = newTemp(Ity_V256);
         UInt   rG      = gregOfRexRM(pfx, modrm);
         UInt   rV      = getVexNvvvv(pfx);
         IRTemp sHi, sLo, dHi, dLo;
         sHi = sLo = dHi = dLo = IRTemp_INVALID;

         assign( dst_vec, getYMMReg(rV) );

         if ( epartIsReg( modrm ) ) {
            UInt rE = eregOfRexRM(pfx, modrm);

            imm8 = (Int)getUChar(delta+1);
            assign( src_vec, getYMMReg(rE) );
            delta += 1+1;
            DIP( "vmpsadbw $%d, %s,%s,%s\n", imm8,
                 nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
                             1/* imm8 is 1 byte after the amode */ );
            assign( src_vec, loadLE( Ity_V256, mkexpr(addr) ) );
            imm8 = (Int)getUChar(delta+alen);
            delta += alen+1;
            DIP( "vmpsadbw $%d, %s,%s,%s\n", imm8,
                 dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
         }

         /* The two 128-bit lanes are processed independently.  Per the
            Intel SDM, the low lane is controlled by imm8[2:0] and the
            high lane by imm8[5:3] -- hence the ">> 3" on the high-lane
            call, which moves those bits into the positions the 128-bit
            helper reads. */
         breakupV256toV128s( dst_vec, &dHi, &dLo );
         breakupV256toV128s( src_vec, &sHi, &sLo );
         putYMMReg( rG, binop( Iop_V128HLtoV256,
                               mkexpr( math_MPSADBW_128(dHi, sHi, imm8 >> 3) ),
                               mkexpr( math_MPSADBW_128(dLo, sLo, imm8) ) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   31465 
   case 0x44:
      /* VPCLMULQDQ imm8, xmm3/m128,xmm2,xmm1 */
      /* VPCLMULQDQ = VEX.NDS.128.66.0F3A.WIG 44 /r ib */
      /* 66 0F 3A 44 /r ib = PCLMULQDQ xmm1, xmm2/m128, imm8
       * Carry-less multiplication of selected XMM quadwords into XMM
       * registers (a.k.a multiplication of polynomials over GF(2))
       */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         Int imm8;
         IRTemp sV    = newTemp(Ity_V128);
         IRTemp dV    = newTemp(Ity_V128);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);

         assign( dV, getXMMReg(rV) );

         if ( epartIsReg( modrm ) ) {
            /* Register source: imm8 immediately follows the modrm byte. */
            UInt rE = eregOfRexRM(pfx, modrm);
            imm8 = (Int)getUChar(delta+1);
            assign( sV, getXMMReg(rE) );
            delta += 1+1;
            DIP( "vpclmulqdq $%d, %s,%s,%s\n", imm8,
                 nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
                             1/* imm8 is 1 byte after the amode */ );
            assign( sV, loadLE( Ity_V128, mkexpr(addr) ) );
            imm8 = (Int)getUChar(delta+alen);
            delta += alen+1;
            DIP( "vpclmulqdq $%d, %s,%s,%s\n",
                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
         }

         /* imm8 selects which 64-bit half of each operand is multiplied
            (handled inside math_PCLMULQDQ).  Write the 128-bit product,
            zeroing the upper YMM lane (VEX.128 semantics). */
         putYMMRegLoAndZU( rG, mkexpr( math_PCLMULQDQ(dV, sV, imm8) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   31505 
   case 0x46:
      /* VPERM2I128 imm8, ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F3A.W0 46 /r ib */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   imm8  = 0;
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         /* The four candidate 128-bit lanes: s00/s01 are the low/high
            lanes of the first source (vvvv), s10/s11 those of the
            second source (reg or memory). */
         IRTemp s00   = newTemp(Ity_V128);
         IRTemp s01   = newTemp(Ity_V128);
         IRTemp s10   = newTemp(Ity_V128);
         IRTemp s11   = newTemp(Ity_V128);
         assign(s00, getYMMRegLane128(rV, 0));
         assign(s01, getYMMRegLane128(rV, 1));
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            imm8 = getUChar(delta);
            DIP("vperm2i128 $%u,%s,%s,%s\n",
                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
            assign(s10, getYMMRegLane128(rE, 0));
            assign(s11, getYMMRegLane128(rE, 1));
         } else {
            /* Memory source: load the two 128-bit lanes separately. */
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += alen;
            imm8 = getUChar(delta);
            DIP("vperm2i128 $%u,%s,%s,%s\n",
                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
            assign(s10, loadLE(Ity_V128, binop(Iop_Add64,
                                               mkexpr(addr), mkU64(0))));
            assign(s11, loadLE(Ity_V128, binop(Iop_Add64,
                                               mkexpr(addr), mkU64(16))));
         }
         /* Skip the imm8 byte itself. */
         delta++;
         /* imm8[1:0] selects the source lane written to the result's
            low half, imm8[5:4] the one written to the high half. */
#        define SEL(_nn) (((_nn)==0) ? s00 : ((_nn)==1) ? s01 \
                                           : ((_nn)==2) ? s10 : s11)
         putYMMRegLane128(rG, 0, mkexpr(SEL((imm8 >> 0) & 3)));
         putYMMRegLane128(rG, 1, mkexpr(SEL((imm8 >> 4) & 3)));
#        undef SEL
         /* imm8 bits 3 and 7 force the corresponding result lane to
            zero, overriding the selection above. */
         if (imm8 & (1<<3)) putYMMRegLane128(rG, 0, mkV128(0));
         if (imm8 & (1<<7)) putYMMRegLane128(rG, 1, mkV128(0));
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   31551 
   case 0x4A:
      /* VBLENDVPS xmmG, xmmE/memE, xmmV, xmmIS4
         ::: xmmG:V128 = PBLEND(xmmE, xmmV, xmmIS4) (RMVR) */
      /* VBLENDVPS = VEX.NDS.128.66.0F3A.WIG 4A /r /is4 */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         /* Lane size 4 (F32); Iop_SarN32x4 is presumably used by the
            helper to expand each lane's sign bit into a full-lane
            blend mask -- see dis_VBLENDV_128. */
         delta = dis_VBLENDV_128 ( vbi, pfx, delta,
                                   "vblendvps", 4, Iop_SarN32x4 );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VBLENDVPS ymmG, ymmE/memE, ymmV, ymmIS4
         ::: ymmG:V256 = PBLEND(ymmE, ymmV, ymmIS4) (RMVR) */
      /* VBLENDVPS = VEX.NDS.256.66.0F3A.WIG 4A /r /is4 */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VBLENDV_256 ( vbi, pfx, delta,
                                   "vblendvps", 4, Iop_SarN32x4 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   31572 
   case 0x4B:
      /* VBLENDVPD xmmG, xmmE/memE, xmmV, xmmIS4
         ::: xmmG:V128 = PBLEND(xmmE, xmmV, xmmIS4) (RMVR) */
      /* VBLENDVPD = VEX.NDS.128.66.0F3A.WIG 4B /r /is4 */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         /* Same scheme as VBLENDVPS above, but with 8-byte (F64)
            lanes, hence Iop_SarN64x2. */
         delta = dis_VBLENDV_128 ( vbi, pfx, delta,
                                   "vblendvpd", 8, Iop_SarN64x2 );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VBLENDVPD ymmG, ymmE/memE, ymmV, ymmIS4
         ::: ymmG:V256 = PBLEND(ymmE, ymmV, ymmIS4) (RMVR) */
      /* VBLENDVPD = VEX.NDS.256.66.0F3A.WIG 4B /r /is4 */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VBLENDV_256 ( vbi, pfx, delta,
                                   "vblendvpd", 8, Iop_SarN64x2 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   31593 
   case 0x4C:
      /* VPBLENDVB xmmG, xmmE/memE, xmmV, xmmIS4
         ::: xmmG:V128 = PBLEND(xmmE, xmmV, xmmIS4) (RMVR) */
      /* VPBLENDVB = VEX.NDS.128.66.0F3A.WIG 4C /r /is4 */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         /* Same scheme as VBLENDVPS/VBLENDVPD above, but byte lanes,
            hence lane size 1 and Iop_SarN8x16. */
         delta = dis_VBLENDV_128 ( vbi, pfx, delta,
                                   "vpblendvb", 1, Iop_SarN8x16 );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPBLENDVB ymmG, ymmE/memE, ymmV, ymmIS4
         ::: ymmG:V256 = PBLEND(ymmE, ymmV, ymmIS4) (RMVR) */
      /* VPBLENDVB = VEX.NDS.256.66.0F3A.WIG 4C /r /is4 */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VBLENDV_256 ( vbi, pfx, delta,
                                   "vpblendvb", 1, Iop_SarN8x16 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   31614 
   case 0x60:
   case 0x61:
   case 0x62:
   case 0x63:
      /* VEX.128.66.0F3A.WIG 63 /r ib = VPCMPISTRI imm8, xmm2/m128, xmm1
         VEX.128.66.0F3A.WIG 62 /r ib = VPCMPISTRM imm8, xmm2/m128, xmm1
         VEX.128.66.0F3A.WIG 61 /r ib = VPCMPESTRI imm8, xmm2/m128, xmm1
         VEX.128.66.0F3A.WIG 60 /r ib = VPCMPESTRM imm8, xmm2/m128, xmm1
         (selected special cases that actually occur in glibc,
          not by any means a complete implementation.)
      */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Long delta0 = delta;
         delta = dis_PCMPxSTRx( vbi, pfx, delta, True/*isAvx*/, opc );
         /* dis_PCMPxSTRx signals failure by leaving delta unchanged;
            only an advanced delta means the imm8 case was handled. */
         if (delta > delta0) goto decode_success;
         /* else fall though; dis_PCMPxSTRx failed to decode it */
      }
      break;
   31633 
   case 0xDF:
      /* VAESKEYGENASSIST imm8, xmm2/m128, xmm1 = VEX.128.66.0F3A.WIG DF /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         /* Two-operand form: VEX.vvvv is not used, so *uses_vvvv is
            deliberately left False and the caller checks vvvv==1111. */
         delta = dis_AESKEYGENASSIST( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      break;
   31641 
   case 0xF0:
      /* RORX imm8, r/m32, r32a = VEX.LZ.F2.0F3A.W0 F0 /r /i */
      /* RORX imm8, r/m64, r64a = VEX.LZ.F2.0F3A.W1 F0 /r /i */
      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
         /* VEX.W selects the operand width: W1 = 64 bit, W0 = 32 bit. */
         Int     size = getRexW(pfx) ? 8 : 4;
         IRType  ty   = szToITy(size);
         IRTemp  src  = newTemp(ty);
         UChar   rm   = getUChar(delta);
         UChar   imm8;

         if (epartIsReg(rm)) {
            /* Register source: imm8 immediately follows the modrm byte. */
            imm8 = getUChar(delta+1);
            assign( src, getIRegE(size,pfx,rm) );
            DIP("rorx %d,%s,%s\n", imm8, nameIRegE(size,pfx,rm),
                                   nameIRegG(size,pfx,rm));
            delta += 2;
         } else {
            /* Memory source: imm8 is the 1 byte following the amode. */
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            imm8 = getUChar(delta+alen);
            assign( src, loadLE(ty, mkexpr(addr)) );
            DIP("rorx %d,%s,%s\n", imm8, dis_buf, nameIRegG(size,pfx,rm));
            delta += alen + 1;
         }
         /* Reduce the rotate count modulo the operand width in bits
            (mask with 31 or 63). */
         imm8 &= 8*size-1;

         /* dst = (src >>u imm8) | (src << (8*size-imm8)) */
         /* imm8 == 0 is special-cased: the generic form would shift
            left by the full operand width, which is not a valid IR
            shift amount. */
         putIRegG( size, pfx, rm,
                   imm8 == 0 ? mkexpr(src)
                   : binop( mkSizedOp(ty,Iop_Or8),
                            binop( mkSizedOp(ty,Iop_Shr8), mkexpr(src),
                                   mkU8(imm8) ),
                            binop( mkSizedOp(ty,Iop_Shl8), mkexpr(src),
                                   mkU8(8*size-imm8) ) ) );
         /* Flags aren't modified.  */
         goto decode_success;
      }
      break;
   31679 
   default:
      break;

   }

   /* Decode failure: return the caller's original delta unchanged, so
      the caller can see that no bytes were consumed. */
  //decode_failure:
   return deltaIN;

  decode_success:
   return delta;
}
   31691 
   31692 
   31693 /*------------------------------------------------------------*/
   31694 /*---                                                      ---*/
   31695 /*--- Disassemble a single instruction                     ---*/
   31696 /*---                                                      ---*/
   31697 /*------------------------------------------------------------*/
   31698 
   31699 /* Disassemble a single instruction into IR.  The instruction is
   31700    located in host memory at &guest_code[delta]. */
   31701 
   31702 static
   31703 DisResult disInstr_AMD64_WRK (
   31704              /*OUT*/Bool* expect_CAS,
   31705              Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
   31706              Bool         resteerCisOk,
   31707              void*        callback_opaque,
   31708              Long         delta64,
   31709              const VexArchInfo* archinfo,
   31710              const VexAbiInfo*  vbi,
   31711              Bool         sigill_diag
   31712           )
   31713 {
   31714    IRTemp    t1, t2;
   31715    UChar     pre;
   31716    Int       n, n_prefixes;
   31717    DisResult dres;
   31718 
   31719    /* The running delta */
   31720    Long delta = delta64;
   31721 
   31722    /* Holds eip at the start of the insn, so that we can print
   31723       consistent error messages for unimplemented insns. */
   31724    Long delta_start = delta;
   31725 
   31726    /* sz denotes the nominal data-op size of the insn; we change it to
   31727       2 if an 0x66 prefix is seen and 8 if REX.W is 1.  In case of
   31728       conflict REX.W takes precedence. */
   31729    Int sz = 4;
   31730 
   31731    /* pfx holds the summary of prefixes. */
   31732    Prefix pfx = PFX_EMPTY;
   31733 
   31734    /* Holds the computed opcode-escape indication. */
   31735    Escape esc = ESC_NONE;
   31736 
   31737    /* Set result defaults. */
   31738    dres.whatNext    = Dis_Continue;
   31739    dres.len         = 0;
   31740    dres.continueAt  = 0;
   31741    dres.jk_StopHere = Ijk_INVALID;
   31742    *expect_CAS = False;
   31743 
   31744    vassert(guest_RIP_next_assumed == 0);
   31745    vassert(guest_RIP_next_mustcheck == False);
   31746 
   31747    t1 = t2 = IRTemp_INVALID;
   31748 
   31749    DIP("\t0x%llx:  ", guest_RIP_bbstart+delta);
   31750 
   31751    /* Spot "Special" instructions (see comment at top of file). */
   31752    {
   31753       const UChar* code = guest_code + delta;
   31754       /* Spot the 16-byte preamble:
   31755          48C1C703   rolq $3,  %rdi
   31756          48C1C70D   rolq $13, %rdi
   31757          48C1C73D   rolq $61, %rdi
   31758          48C1C733   rolq $51, %rdi
   31759       */
   31760       if (code[ 0] == 0x48 && code[ 1] == 0xC1 && code[ 2] == 0xC7
   31761                                                && code[ 3] == 0x03 &&
   31762           code[ 4] == 0x48 && code[ 5] == 0xC1 && code[ 6] == 0xC7
   31763                                                && code[ 7] == 0x0D &&
   31764           code[ 8] == 0x48 && code[ 9] == 0xC1 && code[10] == 0xC7
   31765                                                && code[11] == 0x3D &&
   31766           code[12] == 0x48 && code[13] == 0xC1 && code[14] == 0xC7
   31767                                                && code[15] == 0x33) {
   31768          /* Got a "Special" instruction preamble.  Which one is it? */
   31769          if (code[16] == 0x48 && code[17] == 0x87
   31770                               && code[18] == 0xDB /* xchgq %rbx,%rbx */) {
   31771             /* %RDX = client_request ( %RAX ) */
   31772             DIP("%%rdx = client_request ( %%rax )\n");
   31773             delta += 19;
   31774             jmp_lit(&dres, Ijk_ClientReq, guest_RIP_bbstart+delta);
   31775             vassert(dres.whatNext == Dis_StopHere);
   31776             goto decode_success;
   31777          }
   31778          else
   31779          if (code[16] == 0x48 && code[17] == 0x87
   31780                               && code[18] == 0xC9 /* xchgq %rcx,%rcx */) {
   31781             /* %RAX = guest_NRADDR */
   31782             DIP("%%rax = guest_NRADDR\n");
   31783             delta += 19;
   31784             putIRegRAX(8, IRExpr_Get( OFFB_NRADDR, Ity_I64 ));
   31785             goto decode_success;
   31786          }
   31787          else
   31788          if (code[16] == 0x48 && code[17] == 0x87
   31789                               && code[18] == 0xD2 /* xchgq %rdx,%rdx */) {
   31790             /* call-noredir *%RAX */
   31791             DIP("call-noredir *%%rax\n");
   31792             delta += 19;
   31793             t1 = newTemp(Ity_I64);
   31794             assign(t1, getIRegRAX(8));
   31795             t2 = newTemp(Ity_I64);
   31796             assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
   31797             putIReg64(R_RSP, mkexpr(t2));
   31798             storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta));
   31799             jmp_treg(&dres, Ijk_NoRedir, t1);
   31800             vassert(dres.whatNext == Dis_StopHere);
   31801             goto decode_success;
   31802          }
   31803          else
   31804          if (code[16] == 0x48 && code[17] == 0x87
   31805                               && code[18] == 0xff /* xchgq %rdi,%rdi */) {
   31806            /* IR injection */
   31807             DIP("IR injection\n");
   31808             vex_inject_ir(irsb, Iend_LE);
   31809 
   31810             // Invalidate the current insn. The reason is that the IRop we're
   31811             // injecting here can change. In which case the translation has to
   31812             // be redone. For ease of handling, we simply invalidate all the
   31813             // time.
   31814             stmt(IRStmt_Put(OFFB_CMSTART, mkU64(guest_RIP_curr_instr)));
   31815             stmt(IRStmt_Put(OFFB_CMLEN,   mkU64(19)));
   31816 
   31817             delta += 19;
   31818 
   31819             stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_bbstart + delta) ) );
   31820             dres.whatNext    = Dis_StopHere;
   31821             dres.jk_StopHere = Ijk_InvalICache;
   31822             goto decode_success;
   31823          }
   31824          /* We don't know what it is. */
   31825          goto decode_failure;
   31826          /*NOTREACHED*/
   31827       }
   31828    }
   31829 
   31830    /* Eat prefixes, summarising the result in pfx and sz, and rejecting
   31831       as many invalid combinations as possible. */
   31832    n_prefixes = 0;
   31833    while (True) {
   31834       if (n_prefixes > 7) goto decode_failure;
   31835       pre = getUChar(delta);
   31836       switch (pre) {
   31837          case 0x66: pfx |= PFX_66; break;
   31838          case 0x67: pfx |= PFX_ASO; break;
   31839          case 0xF2: pfx |= PFX_F2; break;
   31840          case 0xF3: pfx |= PFX_F3; break;
   31841          case 0xF0: pfx |= PFX_LOCK; *expect_CAS = True; break;
   31842          case 0x2E: pfx |= PFX_CS; break;
   31843          case 0x3E: pfx |= PFX_DS; break;
   31844          case 0x26: pfx |= PFX_ES; break;
   31845          case 0x64: pfx |= PFX_FS; break;
   31846          case 0x65: pfx |= PFX_GS; break;
   31847          case 0x36: pfx |= PFX_SS; break;
   31848          case 0x40 ... 0x4F:
   31849             pfx |= PFX_REX;
   31850             if (pre & (1<<3)) pfx |= PFX_REXW;
   31851             if (pre & (1<<2)) pfx |= PFX_REXR;
   31852             if (pre & (1<<1)) pfx |= PFX_REXX;
   31853             if (pre & (1<<0)) pfx |= PFX_REXB;
   31854             break;
   31855          default:
   31856             goto not_a_legacy_prefix;
   31857       }
   31858       n_prefixes++;
   31859       delta++;
   31860    }
   31861 
   31862    not_a_legacy_prefix:
   31863    /* We've used up all the non-VEX prefixes.  Parse and validate a
   31864       VEX prefix if that's appropriate. */
   31865    if (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX) {
   31866       /* Used temporarily for holding VEX prefixes. */
   31867       UChar vex0 = getUChar(delta);
   31868       if (vex0 == 0xC4) {
   31869          /* 3-byte VEX */
   31870          UChar vex1 = getUChar(delta+1);
   31871          UChar vex2 = getUChar(delta+2);
   31872          delta += 3;
   31873          pfx |= PFX_VEX;
   31874          /* Snarf contents of byte 1 */
   31875          /* R */ pfx |= (vex1 & (1<<7)) ? 0 : PFX_REXR;
   31876          /* X */ pfx |= (vex1 & (1<<6)) ? 0 : PFX_REXX;
   31877          /* B */ pfx |= (vex1 & (1<<5)) ? 0 : PFX_REXB;
   31878          /* m-mmmm */
   31879          switch (vex1 & 0x1F) {
   31880             case 1: esc = ESC_0F;   break;
   31881             case 2: esc = ESC_0F38; break;
   31882             case 3: esc = ESC_0F3A; break;
   31883             /* Any other m-mmmm field will #UD */
   31884             default: goto decode_failure;
   31885          }
   31886          /* Snarf contents of byte 2 */
   31887          /* W */    pfx |= (vex2 & (1<<7)) ? PFX_REXW : 0;
   31888          /* ~v3 */  pfx |= (vex2 & (1<<6)) ? 0 : PFX_VEXnV3;
   31889          /* ~v2 */  pfx |= (vex2 & (1<<5)) ? 0 : PFX_VEXnV2;
   31890          /* ~v1 */  pfx |= (vex2 & (1<<4)) ? 0 : PFX_VEXnV1;
   31891          /* ~v0 */  pfx |= (vex2 & (1<<3)) ? 0 : PFX_VEXnV0;
   31892          /* L */    pfx |= (vex2 & (1<<2)) ? PFX_VEXL : 0;
   31893          /* pp */
   31894          switch (vex2 & 3) {
   31895             case 0: break;
   31896             case 1: pfx |= PFX_66; break;
   31897             case 2: pfx |= PFX_F3; break;
   31898             case 3: pfx |= PFX_F2; break;
   31899             default: vassert(0);
   31900          }
   31901       }
   31902       else if (vex0 == 0xC5) {
   31903          /* 2-byte VEX */
   31904          UChar vex1 = getUChar(delta+1);
   31905          delta += 2;
   31906          pfx |= PFX_VEX;
   31907          /* Snarf contents of byte 1 */
   31908          /* R */    pfx |= (vex1 & (1<<7)) ? 0 : PFX_REXR;
   31909          /* ~v3 */  pfx |= (vex1 & (1<<6)) ? 0 : PFX_VEXnV3;
   31910          /* ~v2 */  pfx |= (vex1 & (1<<5)) ? 0 : PFX_VEXnV2;
   31911          /* ~v1 */  pfx |= (vex1 & (1<<4)) ? 0 : PFX_VEXnV1;
   31912          /* ~v0 */  pfx |= (vex1 & (1<<3)) ? 0 : PFX_VEXnV0;
   31913          /* L */    pfx |= (vex1 & (1<<2)) ? PFX_VEXL : 0;
   31914          /* pp */
   31915          switch (vex1 & 3) {
   31916             case 0: break;
   31917             case 1: pfx |= PFX_66; break;
   31918             case 2: pfx |= PFX_F3; break;
   31919             case 3: pfx |= PFX_F2; break;
   31920             default: vassert(0);
   31921          }
   31922          /* implied: */
   31923          esc = ESC_0F;
   31924       }
   31925       /* Can't have both VEX and REX */
   31926       if ((pfx & PFX_VEX) && (pfx & PFX_REX))
   31927          goto decode_failure; /* can't have both */
   31928    }
   31929 
   31930    /* Dump invalid combinations */
   31931    n = 0;
   31932    if (pfx & PFX_F2) n++;
   31933    if (pfx & PFX_F3) n++;
   31934    if (n > 1)
   31935       goto decode_failure; /* can't have both */
   31936 
   31937    n = 0;
   31938    if (pfx & PFX_CS) n++;
   31939    if (pfx & PFX_DS) n++;
   31940    if (pfx & PFX_ES) n++;
   31941    if (pfx & PFX_FS) n++;
   31942    if (pfx & PFX_GS) n++;
   31943    if (pfx & PFX_SS) n++;
   31944    if (n > 1)
   31945       goto decode_failure; /* multiple seg overrides == illegal */
   31946 
   31947    /* We have a %fs prefix.  Reject it if there's no evidence in 'vbi'
   31948       that we should accept it. */
   31949    if ((pfx & PFX_FS) && !vbi->guest_amd64_assume_fs_is_const)
   31950       goto decode_failure;
   31951 
   31952    /* Ditto for %gs prefixes. */
   31953    if ((pfx & PFX_GS) && !vbi->guest_amd64_assume_gs_is_const)
   31954       goto decode_failure;
   31955 
   31956    /* Set up sz. */
   31957    sz = 4;
   31958    if (pfx & PFX_66) sz = 2;
   31959    if ((pfx & PFX_REX) && (pfx & PFX_REXW)) sz = 8;
   31960 
   31961    /* Now we should be looking at the primary opcode byte or the
   31962       leading escapes.  Check that any LOCK prefix is actually
   31963       allowed. */
   31964    if (haveLOCK(pfx)) {
   31965       if (can_be_used_with_LOCK_prefix( &guest_code[delta] )) {
   31966          DIP("lock ");
   31967       } else {
   31968          *expect_CAS = False;
   31969          goto decode_failure;
   31970       }
   31971    }
   31972 
   31973    /* Eat up opcode escape bytes, until we're really looking at the
   31974       primary opcode byte.  But only if there's no VEX present. */
   31975    if (!(pfx & PFX_VEX)) {
   31976       vassert(esc == ESC_NONE);
   31977       pre = getUChar(delta);
   31978       if (pre == 0x0F) {
   31979          delta++;
   31980          pre = getUChar(delta);
   31981          switch (pre) {
   31982             case 0x38: esc = ESC_0F38; delta++; break;
   31983             case 0x3A: esc = ESC_0F3A; delta++; break;
   31984             default:   esc = ESC_0F; break;
   31985          }
   31986       }
   31987    }
   31988 
   31989    /* So now we're really really looking at the primary opcode
   31990       byte. */
   31991    Long delta_at_primary_opcode = delta;
   31992 
   31993    if (!(pfx & PFX_VEX)) {
   31994       /* Handle non-VEX prefixed instructions.  "Legacy" (non-VEX) SSE
   31995          instructions preserve the upper 128 bits of YMM registers;
   31996          iow we can simply ignore the presence of the upper halves of
   31997          these registers. */
   31998       switch (esc) {
   31999          case ESC_NONE:
   32000             delta = dis_ESC_NONE( &dres, expect_CAS,
   32001                                   resteerOkFn, resteerCisOk, callback_opaque,
   32002                                   archinfo, vbi, pfx, sz, delta );
   32003             break;
   32004          case ESC_0F:
   32005             delta = dis_ESC_0F  ( &dres, expect_CAS,
   32006                                   resteerOkFn, resteerCisOk, callback_opaque,
   32007                                   archinfo, vbi, pfx, sz, delta );
   32008             break;
   32009          case ESC_0F38:
   32010             delta = dis_ESC_0F38( &dres,
   32011                                   resteerOkFn, resteerCisOk, callback_opaque,
   32012                                   archinfo, vbi, pfx, sz, delta );
   32013             break;
   32014          case ESC_0F3A:
   32015             delta = dis_ESC_0F3A( &dres,
   32016                                   resteerOkFn, resteerCisOk, callback_opaque,
   32017                                   archinfo, vbi, pfx, sz, delta );
   32018             break;
   32019          default:
   32020             vassert(0);
   32021       }
   32022    } else {
   32023       /* VEX prefixed instruction */
   32024       /* Sloppy Intel wording: "An instruction encoded with a VEX.128
   32025          prefix that loads a YMM register operand ..." zeroes out bits
   32026          128 and above of the register. */
   32027       Bool uses_vvvv = False;
   32028       switch (esc) {
   32029          case ESC_0F:
   32030             delta = dis_ESC_0F__VEX ( &dres, &uses_vvvv,
   32031                                       resteerOkFn, resteerCisOk,
   32032                                       callback_opaque,
   32033                                       archinfo, vbi, pfx, sz, delta );
   32034             break;
   32035          case ESC_0F38:
   32036             delta = dis_ESC_0F38__VEX ( &dres, &uses_vvvv,
   32037                                         resteerOkFn, resteerCisOk,
   32038                                         callback_opaque,
   32039                                         archinfo, vbi, pfx, sz, delta );
   32040             break;
   32041          case ESC_0F3A:
   32042             delta = dis_ESC_0F3A__VEX ( &dres, &uses_vvvv,
   32043                                         resteerOkFn, resteerCisOk,
   32044                                         callback_opaque,
   32045                                         archinfo, vbi, pfx, sz, delta );
   32046             break;
   32047          case ESC_NONE:
   32048             /* The presence of a VEX prefix, by Intel definition,
   32049                always implies at least an 0F escape. */
   32050             goto decode_failure;
   32051          default:
   32052             vassert(0);
   32053       }
   32054       /* If the insn doesn't use VEX.vvvv then it must be all ones.
   32055          Check this. */
   32056       if (!uses_vvvv) {
   32057          if (getVexNvvvv(pfx) != 0)
   32058             goto decode_failure;
   32059       }
   32060    }
   32061 
   32062    vassert(delta - delta_at_primary_opcode >= 0);
   32063    vassert(delta - delta_at_primary_opcode < 16/*let's say*/);
   32064 
   32065    /* Use delta == delta_at_primary_opcode to denote decode failure.
   32066       This implies that any successful decode must use at least one
   32067       byte up. */
   32068    if (delta == delta_at_primary_opcode)
   32069       goto decode_failure;
   32070    else
   32071       goto decode_success; /* \o/ */
   32072 
   32073 
   32074   decode_failure:
   32075    /* All decode failures end up here. */
   32076    if (sigill_diag) {
   32077       vex_printf("vex amd64->IR: unhandled instruction bytes: "
   32078                  "0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
   32079                  getUChar(delta_start+0),
   32080                  getUChar(delta_start+1),
   32081                  getUChar(delta_start+2),
   32082                  getUChar(delta_start+3),
   32083                  getUChar(delta_start+4),
   32084                  getUChar(delta_start+5),
   32085                  getUChar(delta_start+6),
   32086                  getUChar(delta_start+7) );
   32087       vex_printf("vex amd64->IR:   REX=%d REX.W=%d REX.R=%d REX.X=%d REX.B=%d\n",
   32088                  haveREX(pfx) ? 1 : 0, getRexW(pfx), getRexR(pfx),
   32089                  getRexX(pfx), getRexB(pfx));
   32090       vex_printf("vex amd64->IR:   VEX=%d VEX.L=%d VEX.nVVVV=0x%x ESC=%s\n",
   32091                  haveVEX(pfx) ? 1 : 0, getVexL(pfx),
   32092                  getVexNvvvv(pfx),
   32093                  esc==ESC_NONE ? "NONE" :
   32094                    esc==ESC_0F ? "0F" :
   32095                    esc==ESC_0F38 ? "0F38" :
   32096                    esc==ESC_0F3A ? "0F3A" : "???");
   32097       vex_printf("vex amd64->IR:   PFX.66=%d PFX.F2=%d PFX.F3=%d\n",
   32098                  have66(pfx) ? 1 : 0, haveF2(pfx) ? 1 : 0,
   32099                  haveF3(pfx) ? 1 : 0);
   32100    }
   32101 
   32102    /* Tell the dispatcher that this insn cannot be decoded, and so has
   32103       not been executed, and (is currently) the next to be executed.
   32104       RIP should be up-to-date since it was made so at the start of each
   32105       insn, but nevertheless be paranoid and update it again right
   32106       now. */
   32107    stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_curr_instr) ) );
   32108    jmp_lit(&dres, Ijk_NoDecode, guest_RIP_curr_instr);
   32109    vassert(dres.whatNext == Dis_StopHere);
   32110    dres.len = 0;
   32111    /* We also need to say that a CAS is not expected now, regardless
   32112       of what it might have been set to at the start of the function,
   32113       since the IR that we've emitted just above (to synthesise a
   32114       SIGILL) does not involve any CAS, and presumably no other IR has
   32115       been emitted for this (non-decoded) insn. */
   32116    *expect_CAS = False;
   32117    return dres;
   32118 
   32119 
   32120   decode_success:
   32121    /* All decode successes end up here. */
   32122    switch (dres.whatNext) {
   32123       case Dis_Continue:
   32124          stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_bbstart + delta) ) );
   32125          break;
   32126       case Dis_ResteerU:
   32127       case Dis_ResteerC:
   32128          stmt( IRStmt_Put( OFFB_RIP, mkU64(dres.continueAt) ) );
   32129          break;
   32130       case Dis_StopHere:
   32131          break;
   32132       default:
   32133          vassert(0);
   32134    }
   32135 
   32136    DIP("\n");
   32137    dres.len = toUInt(delta - delta_start);
   32138    return dres;
   32139 }
   32140 
   32141 #undef DIP
   32142 #undef DIS
   32143 
   32144 
   32145 /*------------------------------------------------------------*/
   32146 /*--- Top-level fn                                         ---*/
   32147 /*------------------------------------------------------------*/
   32148 
   32149 /* Disassemble a single instruction into IR.  The instruction
   32150    is located in host memory at &guest_code[delta]. */
   32151 
         /* Disassemble the single guest instruction at &guest_code_IN[delta]
            into IR appended to irsb_IN.  Thin wrapper around
            disInstr_AMD64_WRK: it publishes per-instruction state in the
            file-level globals, invokes the worker, then applies two sanity
            checks to the result (next-%rip prediction, and LOCK-prefix /
            IRCAS consistency).  Returns the worker's DisResult. */
   32152 DisResult disInstr_AMD64 ( IRSB*        irsb_IN,
   32153                            Bool         (*resteerOkFn) ( void*, Addr ),
   32154                            Bool         resteerCisOk,
   32155                            void*        callback_opaque,
   32156                            const UChar* guest_code_IN,
   32157                            Long         delta,
   32158                            Addr         guest_IP,
   32159                            VexArch      guest_arch,
   32160                            const VexArchInfo* archinfo,
   32161                            const VexAbiInfo*  abiinfo,
   32162                            VexEndness   host_endness_IN,
   32163                            Bool         sigill_diag_IN )
   32164 {
   32165    Int       i, x1, x2;
   32166    Bool      expect_CAS, has_CAS;
   32167    DisResult dres;
   32168 
   32169    /* Set globals (see top of this file) */
   32170    vassert(guest_arch == VexArchAMD64);
   32171    guest_code           = guest_code_IN;
   32172    irsb                 = irsb_IN;
   32173    host_endness         = host_endness_IN;
   32174    guest_RIP_curr_instr = guest_IP;
            /* Start address of the enclosing superblock: guest_IP is this
               insn's address and delta its byte offset from the sb start. */
   32175    guest_RIP_bbstart    = guest_IP - delta;
   32176 
   32177    /* We'll consult these after doing disInstr_AMD64_WRK. */
   32178    guest_RIP_next_assumed   = 0;
   32179    guest_RIP_next_mustcheck = False;
   32180 
            /* Record the statement count before the worker runs, so that the
               IR emitted for just this insn is exactly stmts[x1 .. x2). */
   32181    x1 = irsb_IN->stmts_used;
   32182    expect_CAS = False;
   32183    dres = disInstr_AMD64_WRK ( &expect_CAS, resteerOkFn,
   32184                                resteerCisOk,
   32185                                callback_opaque,
   32186                                delta, archinfo, abiinfo, sigill_diag_IN );
   32187    x2 = irsb_IN->stmts_used;
   32188    vassert(x2 >= x1);
   32189 
   32190    /* If disInstr_AMD64_WRK tried to figure out the next rip, check it
   32191       got it right.  Failure of this assertion is serious and denotes
   32192       a bug in disInstr. */
   32193    if (guest_RIP_next_mustcheck
   32194        && guest_RIP_next_assumed != guest_RIP_curr_instr + dres.len) {
   32195       vex_printf("\n");
   32196       vex_printf("assumed next %%rip = 0x%llx\n",
   32197                  guest_RIP_next_assumed );
   32198       vex_printf(" actual next %%rip = 0x%llx\n",
   32199                  guest_RIP_curr_instr + dres.len );
   32200       vpanic("disInstr_AMD64: disInstr miscalculated next %rip");
   32201    }
   32202 
   32203    /* See comment at the top of disInstr_AMD64_WRK for meaning of
   32204       expect_CAS.  Here, we (sanity-)check for the presence/absence of
   32205       IRCAS as directed by the returned expect_CAS value. */
   32206    has_CAS = False;
   32207    for (i = x1; i < x2; i++) {
   32208       if (irsb_IN->stmts[i]->tag == Ist_CAS)
   32209          has_CAS = True;
   32210    }
   32211 
   32212    if (expect_CAS != has_CAS) {
   32213       /* inconsistency detected.  re-disassemble the instruction so as
   32214          to generate a useful error message; then assert. */
   32215       vex_traceflags |= VEX_TRACE_FE;
   32216       dres = disInstr_AMD64_WRK ( &expect_CAS, resteerOkFn,
   32217                                   resteerCisOk,
   32218                                   callback_opaque,
   32219                                   delta, archinfo, abiinfo, sigill_diag_IN );
               /* NOTE: this prints the IR from the *first* pass, stmts[x1..x2).
                  The second worker call above appends a further copy after x2;
                  its purpose here is only to emit front-end trace output
                  (enabled via VEX_TRACE_FE) before the panic below. */
   32220       for (i = x1; i < x2; i++) {
   32221          vex_printf("\t\t");
   32222          ppIRStmt(irsb_IN->stmts[i]);
   32223          vex_printf("\n");
   32224       }
   32225       /* Failure of this assertion is serious and denotes a bug in
   32226          disInstr. */
   32227       vpanic("disInstr_AMD64: inconsistency in LOCK prefix handling");
   32228    }
   32229 
   32230    return dres;
   32231 }
   32232 
   32233 
   32234 /*------------------------------------------------------------*/
   32235 /*--- Unused stuff                                         ---*/
   32236 /*------------------------------------------------------------*/
   32237 
   32238 // A potentially more Memcheck-friendly version of gen_LZCNT, if
   32239 // this should ever be needed.
   32240 //
   32241 //static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
   32242 //{
   32243 //   /* Scheme is simple: propagate the most significant 1-bit into all
   32244 //      lower positions in the word.  This gives a word of the form
   32245 //      0---01---1.  Now invert it, giving a word of the form
   32246 //      1---10---0, then do a population-count idiom (to count the 1s,
   32247 //      which is the number of leading zeroes, or the word size if the
   32248 //      original word was 0.
   32249 //   */
   32250 //   Int i;
   32251 //   IRTemp t[7];
   32252 //   for (i = 0; i < 7; i++) {
   32253 //      t[i] = newTemp(ty);
   32254 //   }
   32255 //   if (ty == Ity_I64) {
   32256 //      assign(t[0], binop(Iop_Or64, mkexpr(src),
   32257 //                                   binop(Iop_Shr64, mkexpr(src),  mkU8(1))));
   32258 //      assign(t[1], binop(Iop_Or64, mkexpr(t[0]),
   32259 //                                   binop(Iop_Shr64, mkexpr(t[0]), mkU8(2))));
   32260 //      assign(t[2], binop(Iop_Or64, mkexpr(t[1]),
   32261 //                                   binop(Iop_Shr64, mkexpr(t[1]), mkU8(4))));
   32262 //      assign(t[3], binop(Iop_Or64, mkexpr(t[2]),
   32263 //                                   binop(Iop_Shr64, mkexpr(t[2]), mkU8(8))));
   32264 //      assign(t[4], binop(Iop_Or64, mkexpr(t[3]),
   32265 //                                   binop(Iop_Shr64, mkexpr(t[3]), mkU8(16))));
   32266 //      assign(t[5], binop(Iop_Or64, mkexpr(t[4]),
   32267 //                                   binop(Iop_Shr64, mkexpr(t[4]), mkU8(32))));
   32268 //      assign(t[6], unop(Iop_Not64, mkexpr(t[5])));
   32269 //      return gen_POPCOUNT(ty, t[6]);
   32270 //   }
   32271 //   if (ty == Ity_I32) {
   32272 //      assign(t[0], binop(Iop_Or32, mkexpr(src),
   32273 //                                   binop(Iop_Shr32, mkexpr(src),  mkU8(1))));
   32274 //      assign(t[1], binop(Iop_Or32, mkexpr(t[0]),
   32275 //                                   binop(Iop_Shr32, mkexpr(t[0]), mkU8(2))));
   32276 //      assign(t[2], binop(Iop_Or32, mkexpr(t[1]),
   32277 //                                   binop(Iop_Shr32, mkexpr(t[1]), mkU8(4))));
   32278 //      assign(t[3], binop(Iop_Or32, mkexpr(t[2]),
   32279 //                                   binop(Iop_Shr32, mkexpr(t[2]), mkU8(8))));
   32280 //      assign(t[4], binop(Iop_Or32, mkexpr(t[3]),
   32281 //                                   binop(Iop_Shr32, mkexpr(t[3]), mkU8(16))));
   32282 //      assign(t[5], unop(Iop_Not32, mkexpr(t[4])));
   32283 //      return gen_POPCOUNT(ty, t[5]);
   32284 //   }
   32285 //   if (ty == Ity_I16) {
   32286 //      assign(t[0], binop(Iop_Or16, mkexpr(src),
   32287 //                                   binop(Iop_Shr16, mkexpr(src),  mkU8(1))));
   32288 //      assign(t[1], binop(Iop_Or16, mkexpr(t[0]),
   32289 //                                   binop(Iop_Shr16, mkexpr(t[0]), mkU8(2))));
   32290 //      assign(t[2], binop(Iop_Or16, mkexpr(t[1]),
   32291 //                                   binop(Iop_Shr16, mkexpr(t[1]), mkU8(4))));
   32292 //      assign(t[3], binop(Iop_Or16, mkexpr(t[2]),
   32293 //                                   binop(Iop_Shr16, mkexpr(t[2]), mkU8(8))));
   32294 //      assign(t[4], unop(Iop_Not16, mkexpr(t[3])));
   32295 //      return gen_POPCOUNT(ty, t[4]);
   32296 //   }
   32297 //   vassert(0);
   32298 //}
   32299 
   32300 
   32301 /*--------------------------------------------------------------------*/
   32302 /*--- end                                       guest_amd64_toIR.c ---*/
   32303 /*--------------------------------------------------------------------*/
   32304