Home | History | Annotate | Download | only in priv
      1 
      2 /*--------------------------------------------------------------------*/
      3 /*--- begin                                     guest_amd64_toIR.c ---*/
      4 /*--------------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Valgrind, a dynamic binary instrumentation
      8    framework.
      9 
     10    Copyright (C) 2004-2013 OpenWorks LLP
     11       info (at) open-works.net
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     26    02110-1301, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 
     30    Neither the names of the U.S. Department of Energy nor the
     31    University of California nor the names of its contributors may be
     32    used to endorse or promote products derived from this software
     33    without prior written permission.
     34 */
     35 
     36 /* Translates AMD64 code to IR. */
     37 
     38 /* TODO:
     39 
     40    All Puts to CC_OP/CC_DEP1/CC_DEP2/CC_NDEP should really be checked
     41    to ensure a 64-bit value is being written.
     42 
     43    x87 FP Limitations:
     44 
     45    * all arithmetic done at 64 bits
     46 
     47    * no FP exceptions, except for handling stack over/underflow
     48 
     49    * FP rounding mode observed only for float->int conversions and
     50      int->float conversions which could lose accuracy, and for
     51      float-to-float rounding.  For all other operations,
     52      round-to-nearest is used, regardless.
     53 
     54    * some of the FCOM cases could do with testing -- not convinced
     55      that the args are the right way round.
     56 
     57    * FSAVE does not re-initialise the FPU; it should do
     58 
     59    * FINIT not only initialises the FPU environment, it also zeroes
     60      all the FP registers.  It should leave the registers unchanged.
     61 
     62     SAHF should cause eflags[1] == 1, and in fact it produces 0.  As
     63     per Intel docs this bit has no meaning anyway.  Since PUSHF is the
     64     only way to observe eflags[1], a proper fix would be to make that
     65     bit be set by PUSHF.
     66 
     67     This module uses global variables and so is not MT-safe (if that
     68     should ever become relevant).
     69 */
     70 
     71 /* Notes re address size overrides (0x67).
     72 
     73    According to the AMD documentation (24594 Rev 3.09, Sept 2003,
     74    "AMD64 Architecture Programmer's Manual Volume 3: General-Purpose
     75    and System Instructions"), Section 1.2.3 ("Address-Size Override
     76    Prefix"):
     77 
     78    0x67 applies to all explicit memory references, causing the top
     79    32 bits of the effective address to become zero.
     80 
     81    0x67 has no effect on stack references (push/pop); these always
     82    use a 64-bit address.
     83 
     84    0x67 changes the interpretation of instructions which implicitly
     85    reference RCX/RSI/RDI, so that in fact ECX/ESI/EDI are used
     86    instead.  These are:
     87 
     88       cmp{s,sb,sw,sd,sq}
     89       in{s,sb,sw,sd}
     90       jcxz, jecxz, jrcxz
     91       lod{s,sb,sw,sd,sq}
     92       loop{,e,bz,be,z}
     93       mov{s,sb,sw,sd,sq}
     94       out{s,sb,sw,sd}
     95       rep{,e,ne,nz}
     96       sca{s,sb,sw,sd,sq}
     97       sto{s,sb,sw,sd,sq}
     98       xlat{,b} */
     99 
    100 /* "Special" instructions.
    101 
    102    This instruction decoder can decode three special instructions
    103    which mean nothing natively (are no-ops as far as regs/mem are
    104    concerned) but have meaning for supporting Valgrind.  A special
    105    instruction is flagged by the 16-byte preamble 48C1C703 48C1C70D
    106    48C1C73D 48C1C733 (in the standard interpretation, that means: rolq
    107    $3, %rdi; rolq $13, %rdi; rolq $61, %rdi; rolq $51, %rdi).
    108    Following that, one of the following 3 are allowed (standard
    109    interpretation in parentheses):
    110 
    111       4887DB (xchgq %rbx,%rbx)   %RDX = client_request ( %RAX )
    112       4887C9 (xchgq %rcx,%rcx)   %RAX = guest_NRADDR
    113       4887D2 (xchgq %rdx,%rdx)   call-noredir *%RAX
    114       4887F6 (xchgq %rdi,%rdi)   IR injection
    115 
    116    Any other bytes following the 16-byte preamble are illegal and
    117    constitute a failure in instruction decoding.  This all assumes
    118    that the preamble will never occur except in specific code
    119    fragments designed for Valgrind to catch.
    120 
    121    No prefixes may precede a "Special" instruction.
    122 */
    123 
    124 /* casLE (implementation of lock-prefixed insns) and rep-prefixed
    125    insns: the side-exit back to the start of the insn is done with
    126    Ijk_Boring.  This is quite wrong, it should be done with
    127    Ijk_NoRedir, since otherwise the side exit, which is intended to
    128    restart the instruction for whatever reason, could go somewhere
    129    entirely else.  Doing it right (with Ijk_NoRedir jumps) would make
    130    no-redir jumps performance critical, at least for rep-prefixed
    131    instructions, since all iterations thereof would involve such a
    132    jump.  It's not such a big deal with casLE since the side exit is
    133    only taken if the CAS fails, that is, the location is contended,
    134    which is relatively unlikely.
    135 
    136    Note also, the test for CAS success vs failure is done using
    137    Iop_CasCmp{EQ,NE}{8,16,32,64} rather than the ordinary
    138    Iop_Cmp{EQ,NE} equivalents.  This is so as to tell Memcheck that it
    139    shouldn't definedness-check these comparisons.  See
    140    COMMENT_ON_CasCmpEQ in memcheck/mc_translate.c for
    141    background/rationale.
    142 */
    143 
    144 /* LOCK prefixed instructions.  These are translated using IR-level
    145    CAS statements (IRCAS) and are believed to preserve atomicity, even
    146    from the point of view of some other process racing against a
    147    simulated one (presumably they communicate via a shared memory
    148    segment).
    149 
    150    Handlers which are aware of LOCK prefixes are:
    151       dis_op2_G_E      (add, or, adc, sbb, and, sub, xor)
    152       dis_cmpxchg_G_E  (cmpxchg)
    153       dis_Grp1         (add, or, adc, sbb, and, sub, xor)
    154       dis_Grp3         (not, neg)
    155       dis_Grp4         (inc, dec)
    156       dis_Grp5         (inc, dec)
    157       dis_Grp8_Imm     (bts, btc, btr)
    158       dis_bt_G_E       (bts, btc, btr)
    159       dis_xadd_G_E     (xadd)
    160 */
    161 
    162 
    163 #include "libvex_basictypes.h"
    164 #include "libvex_ir.h"
    165 #include "libvex.h"
    166 #include "libvex_guest_amd64.h"
    167 
    168 #include "main_util.h"
    169 #include "main_globals.h"
    170 #include "guest_generic_bb_to_IR.h"
    171 #include "guest_generic_x87.h"
    172 #include "guest_amd64_defs.h"
    173 
    174 
    175 /*------------------------------------------------------------*/
    176 /*--- Globals                                              ---*/
    177 /*------------------------------------------------------------*/
    178 
/* These are set at the start of the translation of an insn, right
   down in disInstr_AMD64, so that we don't have to pass them around
   endlessly.  They are all constant during the translation of any
   given insn. */

/* These are set at the start of the translation of a BB, so
   that we don't have to pass them around endlessly. */

/* We need to know this to do sub-register accesses correctly. */
static Bool host_is_bigendian;

/* Pointer to the guest code area (points to start of BB, not to the
   insn being processed). */
static UChar* guest_code;

/* The guest address corresponding to guest_code[0]. */
static Addr64 guest_RIP_bbstart;

/* The guest address for the instruction currently being
   translated. */
static Addr64 guest_RIP_curr_instr;

/* The IRSB* into which we're generating code. */
static IRSB* irsb;

/* For ensuring that %rip-relative addressing is done right.  A read
   of %rip generates the address of the next instruction.  It may be
   that we don't conveniently know that inside disAMode().  For sanity
   checking, if the next insn %rip is needed, we make a guess at what
   it is, record that guess here, and set the accompanying Bool to
   indicate that -- after this insn's decode is finished -- that guess
   needs to be checked.  */

/* At the start of each insn decode, is set to (0, False).
   After the decode, if _mustcheck is now True, _assumed is
   checked. */

/* Guess at the %rip of the next insn, and whether that guess must be
   validated once this insn's decode completes. */
static Addr64 guest_RIP_next_assumed;
static Bool   guest_RIP_next_mustcheck;
    218 
    219 
    220 /*------------------------------------------------------------*/
    221 /*--- Helpers for constructing IR.                         ---*/
    222 /*------------------------------------------------------------*/
    223 
    224 /* Generate a new temporary of the given type. */
    225 static IRTemp newTemp ( IRType ty )
    226 {
    227    vassert(isPlausibleIRType(ty));
    228    return newIRTemp( irsb->tyenv, ty );
    229 }
    230 
/* Add a statement to the list held by "irsb". */
static void stmt ( IRStmt* st )
{
   addStmtToIRSB( irsb, st );
}

/* Generate a statement "dst := e". */
static void assign ( IRTemp dst, IRExpr* e )
{
   stmt( IRStmt_WrTmp(dst, e) );
}

/* Build a unary-operator expression. */
static IRExpr* unop ( IROp op, IRExpr* a )
{
   return IRExpr_Unop(op, a);
}

/* Build a binary-operator expression. */
static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
{
   return IRExpr_Binop(op, a1, a2);
}

/* Build a ternary-operator expression. */
static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
{
   return IRExpr_Triop(op, a1, a2, a3);
}

/* Build an expression which reads temporary 'tmp'. */
static IRExpr* mkexpr ( IRTemp tmp )
{
   return IRExpr_RdTmp(tmp);
}
    262 
/* Build an 8-bit constant expression; 'i' must fit in 8 bits. */
static IRExpr* mkU8 ( ULong i )
{
   vassert(i < 256);
   return IRExpr_Const(IRConst_U8( (UChar)i ));
}

/* Build a 16-bit constant expression; 'i' must fit in 16 bits. */
static IRExpr* mkU16 ( ULong i )
{
   vassert(i < 0x10000ULL);
   return IRExpr_Const(IRConst_U16( (UShort)i ));
}

/* Build a 32-bit constant expression; 'i' must fit in 32 bits. */
static IRExpr* mkU32 ( ULong i )
{
   vassert(i < 0x100000000ULL);
   return IRExpr_Const(IRConst_U32( (UInt)i ));
}

/* Build a 64-bit constant expression. */
static IRExpr* mkU64 ( ULong i )
{
   return IRExpr_Const(IRConst_U64(i));
}
    285 
    286 static IRExpr* mkU ( IRType ty, ULong i )
    287 {
    288    switch (ty) {
    289       case Ity_I8:  return mkU8(i);
    290       case Ity_I16: return mkU16(i);
    291       case Ity_I32: return mkU32(i);
    292       case Ity_I64: return mkU64(i);
    293       default: vpanic("mkU(amd64)");
    294    }
    295 }
    296 
/* Generate a little-endian store of 'data' at address 'addr'. */
static void storeLE ( IRExpr* addr, IRExpr* data )
{
   stmt( IRStmt_Store(Iend_LE, addr, data) );
}

/* Generate a little-endian load of type 'ty' from address 'addr'. */
static IRExpr* loadLE ( IRType ty, IRExpr* addr )
{
   return IRExpr_Load(Iend_LE, ty, addr);
}
    306 
/* Given the 8-bit variant 'op8' of an operation, produce the variant
   for the size implied by 'ty'.  Relies on the 8/16/32/64-bit
   variants of each supported IROp being declared consecutively, in
   that order, so that adding 0..3 to the 8-bit opcode selects the
   size. */
static IROp mkSizedOp ( IRType ty, IROp op8 )
{
   vassert(op8 == Iop_Add8 || op8 == Iop_Sub8
           || op8 == Iop_Mul8
           || op8 == Iop_Or8 || op8 == Iop_And8 || op8 == Iop_Xor8
           || op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8
           || op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8
           || op8 == Iop_CasCmpNE8
           || op8 == Iop_Not8 );
   switch (ty) {
      case Ity_I8:  return 0 +op8;
      case Ity_I16: return 1 +op8;
      case Ity_I32: return 2 +op8;
      case Ity_I64: return 3 +op8;
      default: vpanic("mkSizedOp(amd64)");
   }
}
    324 
    325 static
    326 IRExpr* doScalarWidening ( Int szSmall, Int szBig, Bool signd, IRExpr* src )
    327 {
    328    if (szSmall == 1 && szBig == 4) {
    329       return unop(signd ? Iop_8Sto32 : Iop_8Uto32, src);
    330    }
    331    if (szSmall == 1 && szBig == 2) {
    332       return unop(signd ? Iop_8Sto16 : Iop_8Uto16, src);
    333    }
    334    if (szSmall == 2 && szBig == 4) {
    335       return unop(signd ? Iop_16Sto32 : Iop_16Uto32, src);
    336    }
    337    if (szSmall == 1 && szBig == 8 && !signd) {
    338       return unop(Iop_8Uto64, src);
    339    }
    340    if (szSmall == 1 && szBig == 8 && signd) {
    341       return unop(Iop_8Sto64, src);
    342    }
    343    if (szSmall == 2 && szBig == 8 && !signd) {
    344       return unop(Iop_16Uto64, src);
    345    }
    346    if (szSmall == 2 && szBig == 8 && signd) {
    347       return unop(Iop_16Sto64, src);
    348    }
    349    vpanic("doScalarWidening(amd64)");
    350 }
    351 
    352 
    353 
    354 /*------------------------------------------------------------*/
    355 /*--- Debugging output                                     ---*/
    356 /*------------------------------------------------------------*/
    357 
/* Bomb out if we can't handle something.  Never returns. */
__attribute__ ((noreturn))
static void unimplemented ( const HChar* str )
{
   vex_printf("amd64toIR: unimplemented feature\n");
   vpanic(str);
}
    365 
/* Print a front-end trace line, but only when front-end tracing is
   enabled. */
#define DIP(format, args...)           \
   if (vex_traceflags & VEX_TRACE_FE)  \
      vex_printf(format, ## args)

/* Format into 'buf', but only when front-end tracing is enabled. */
#define DIS(buf, format, args...)      \
   if (vex_traceflags & VEX_TRACE_FE)  \
      vex_sprintf(buf, format, ## args)
    373 
    374 
    375 /*------------------------------------------------------------*/
    376 /*--- Offsets of various parts of the amd64 guest state.   ---*/
    377 /*------------------------------------------------------------*/
    378 
/* Integer registers. */
#define OFFB_RAX       offsetof(VexGuestAMD64State,guest_RAX)
#define OFFB_RBX       offsetof(VexGuestAMD64State,guest_RBX)
#define OFFB_RCX       offsetof(VexGuestAMD64State,guest_RCX)
#define OFFB_RDX       offsetof(VexGuestAMD64State,guest_RDX)
#define OFFB_RSP       offsetof(VexGuestAMD64State,guest_RSP)
#define OFFB_RBP       offsetof(VexGuestAMD64State,guest_RBP)
#define OFFB_RSI       offsetof(VexGuestAMD64State,guest_RSI)
#define OFFB_RDI       offsetof(VexGuestAMD64State,guest_RDI)
#define OFFB_R8        offsetof(VexGuestAMD64State,guest_R8)
#define OFFB_R9        offsetof(VexGuestAMD64State,guest_R9)
#define OFFB_R10       offsetof(VexGuestAMD64State,guest_R10)
#define OFFB_R11       offsetof(VexGuestAMD64State,guest_R11)
#define OFFB_R12       offsetof(VexGuestAMD64State,guest_R12)
#define OFFB_R13       offsetof(VexGuestAMD64State,guest_R13)
#define OFFB_R14       offsetof(VexGuestAMD64State,guest_R14)
#define OFFB_R15       offsetof(VexGuestAMD64State,guest_R15)

#define OFFB_RIP       offsetof(VexGuestAMD64State,guest_RIP)

/* Segment-base shadow values. */
#define OFFB_FS_ZERO   offsetof(VexGuestAMD64State,guest_FS_ZERO)
#define OFFB_GS_0x60   offsetof(VexGuestAMD64State,guest_GS_0x60)

/* Lazy condition-code "thunk" fields. */
#define OFFB_CC_OP     offsetof(VexGuestAMD64State,guest_CC_OP)
#define OFFB_CC_DEP1   offsetof(VexGuestAMD64State,guest_CC_DEP1)
#define OFFB_CC_DEP2   offsetof(VexGuestAMD64State,guest_CC_DEP2)
#define OFFB_CC_NDEP   offsetof(VexGuestAMD64State,guest_CC_NDEP)

/* x87 FPU state. */
#define OFFB_FPREGS    offsetof(VexGuestAMD64State,guest_FPREG[0])
#define OFFB_FPTAGS    offsetof(VexGuestAMD64State,guest_FPTAG[0])
#define OFFB_DFLAG     offsetof(VexGuestAMD64State,guest_DFLAG)
#define OFFB_ACFLAG    offsetof(VexGuestAMD64State,guest_ACFLAG)
#define OFFB_IDFLAG    offsetof(VexGuestAMD64State,guest_IDFLAG)
#define OFFB_FTOP      offsetof(VexGuestAMD64State,guest_FTOP)
#define OFFB_FC3210    offsetof(VexGuestAMD64State,guest_FC3210)
#define OFFB_FPROUND   offsetof(VexGuestAMD64State,guest_FPROUND)

/* SSE/AVX state. */
#define OFFB_SSEROUND  offsetof(VexGuestAMD64State,guest_SSEROUND)
#define OFFB_YMM0      offsetof(VexGuestAMD64State,guest_YMM0)
#define OFFB_YMM1      offsetof(VexGuestAMD64State,guest_YMM1)
#define OFFB_YMM2      offsetof(VexGuestAMD64State,guest_YMM2)
#define OFFB_YMM3      offsetof(VexGuestAMD64State,guest_YMM3)
#define OFFB_YMM4      offsetof(VexGuestAMD64State,guest_YMM4)
#define OFFB_YMM5      offsetof(VexGuestAMD64State,guest_YMM5)
#define OFFB_YMM6      offsetof(VexGuestAMD64State,guest_YMM6)
#define OFFB_YMM7      offsetof(VexGuestAMD64State,guest_YMM7)
#define OFFB_YMM8      offsetof(VexGuestAMD64State,guest_YMM8)
#define OFFB_YMM9      offsetof(VexGuestAMD64State,guest_YMM9)
#define OFFB_YMM10     offsetof(VexGuestAMD64State,guest_YMM10)
#define OFFB_YMM11     offsetof(VexGuestAMD64State,guest_YMM11)
#define OFFB_YMM12     offsetof(VexGuestAMD64State,guest_YMM12)
#define OFFB_YMM13     offsetof(VexGuestAMD64State,guest_YMM13)
#define OFFB_YMM14     offsetof(VexGuestAMD64State,guest_YMM14)
#define OFFB_YMM15     offsetof(VexGuestAMD64State,guest_YMM15)
#define OFFB_YMM16     offsetof(VexGuestAMD64State,guest_YMM16)

/* Miscellaneous bookkeeping fields. */
#define OFFB_EMNOTE    offsetof(VexGuestAMD64State,guest_EMNOTE)
#define OFFB_CMSTART   offsetof(VexGuestAMD64State,guest_CMSTART)
#define OFFB_CMLEN     offsetof(VexGuestAMD64State,guest_CMLEN)

#define OFFB_NRADDR    offsetof(VexGuestAMD64State,guest_NRADDR)
    439 
    440 
    441 /*------------------------------------------------------------*/
    442 /*--- Helper bits and pieces for deconstructing the        ---*/
    443 /*--- amd64 insn stream.                                   ---*/
    444 /*------------------------------------------------------------*/
    445 
/* This is the AMD64 register encoding -- integer regs.  These are the
   4-bit register numbers as they appear in modRM/SIB fields (with the
   top bit coming from REX). */
#define R_RAX 0
#define R_RCX 1
#define R_RDX 2
#define R_RBX 3
#define R_RSP 4
#define R_RBP 5
#define R_RSI 6
#define R_RDI 7
#define R_R8  8
#define R_R9  9
#define R_R10 10
#define R_R11 11
#define R_R12 12
#define R_R13 13
#define R_R14 14
#define R_R15 15

/* This is the Intel register encoding -- segment regs. */
#define R_ES 0
#define R_CS 1
#define R_SS 2
#define R_DS 3
#define R_FS 4
#define R_GS 5
    471 
    472 
    473 /* Various simple conversions */
    474 
    475 static ULong extend_s_8to64 ( UChar x )
    476 {
    477    return (ULong)((((Long)x) << 56) >> 56);
    478 }
    479 
    480 static ULong extend_s_16to64 ( UShort x )
    481 {
    482    return (ULong)((((Long)x) << 48) >> 48);
    483 }
    484 
    485 static ULong extend_s_32to64 ( UInt x )
    486 {
    487    return (ULong)((((Long)x) << 32) >> 32);
    488 }
    489 
/* Figure out whether the mod and rm parts of a modRM byte refer to a
   register or memory.  If so, the byte will have the form 11XXXYYY,
   where YYY is the register number. */
inline
static Bool epartIsReg ( UChar mod_reg_rm )
{
   return toBool(0xC0 == (mod_reg_rm & 0xC0));
}

/* Extract the 'g' field (bits 5..3) from a modRM byte.  This only
   produces 3 bits, which is not a complete register number.  You
   should avoid this function if at all possible. */
inline
static Int gregLO3ofRM ( UChar mod_reg_rm )
{
   return (Int)( (mod_reg_rm >> 3) & 7 );
}

/* Ditto the 'e' field (bits 2..0) of a modRM byte. */
inline
static Int eregLO3ofRM ( UChar mod_reg_rm )
{
   return (Int)(mod_reg_rm & 0x7);
}
    514 
/* Get a 8/16/32-bit unsigned value out of the insn stream. */

/* Fetch the single byte at guest_code[delta]. */
static inline UChar getUChar ( Long delta )
{
   UChar v = guest_code[delta+0];
   return v;
}

/* Fetch a 16-bit little-endian unsigned value starting at
   guest_code[delta]. */
static UInt getUDisp16 ( Long delta )
{
   UInt v = guest_code[delta+1]; v <<= 8;
   v |= guest_code[delta+0];
   return v & 0xFFFF;
}
    529 
    530 //.. static UInt getUDisp ( Int size, Long delta )
    531 //.. {
    532 //..    switch (size) {
    533 //..       case 4: return getUDisp32(delta);
    534 //..       case 2: return getUDisp16(delta);
    535 //..       case 1: return getUChar(delta);
    536 //..       default: vpanic("getUDisp(x86)");
    537 //..    }
    538 //..    return 0; /*notreached*/
    539 //.. }
    540 
    541 
/* Get a byte value out of the insn stream and sign-extend to 64
   bits. */
static Long getSDisp8 ( Long delta )
{
   return extend_s_8to64( guest_code[delta] );
}
    548 
    549 /* Get a 16-bit value out of the insn stream and sign-extend to 64
    550    bits. */
    551 static Long getSDisp16 ( Long delta )
    552 {
    553    UInt v = guest_code[delta+1]; v <<= 8;
    554    v |= guest_code[delta+0];
    555    return extend_s_16to64( (UShort)v );
    556 }
    557 
    558 /* Get a 32-bit value out of the insn stream and sign-extend to 64
    559    bits. */
    560 static Long getSDisp32 ( Long delta )
    561 {
    562    UInt v = guest_code[delta+3]; v <<= 8;
    563    v |= guest_code[delta+2]; v <<= 8;
    564    v |= guest_code[delta+1]; v <<= 8;
    565    v |= guest_code[delta+0];
    566    return extend_s_32to64( v );
    567 }
    568 
    569 /* Get a 64-bit value out of the insn stream. */
    570 static Long getDisp64 ( Long delta )
    571 {
    572    ULong v = 0;
    573    v |= guest_code[delta+7]; v <<= 8;
    574    v |= guest_code[delta+6]; v <<= 8;
    575    v |= guest_code[delta+5]; v <<= 8;
    576    v |= guest_code[delta+4]; v <<= 8;
    577    v |= guest_code[delta+3]; v <<= 8;
    578    v |= guest_code[delta+2]; v <<= 8;
    579    v |= guest_code[delta+1]; v <<= 8;
    580    v |= guest_code[delta+0];
    581    return v;
    582 }
    583 
    584 /* Note: because AMD64 doesn't allow 64-bit literals, it is an error
    585    if this is called with size==8.  Should not happen. */
    586 static Long getSDisp ( Int size, Long delta )
    587 {
    588    switch (size) {
    589       case 4: return getSDisp32(delta);
    590       case 2: return getSDisp16(delta);
    591       case 1: return getSDisp8(delta);
    592       default: vpanic("getSDisp(amd64)");
    593   }
    594 }
    595 
    596 static ULong mkSizeMask ( Int sz )
    597 {
    598    switch (sz) {
    599       case 1: return 0x00000000000000FFULL;
    600       case 2: return 0x000000000000FFFFULL;
    601       case 4: return 0x00000000FFFFFFFFULL;
    602       case 8: return 0xFFFFFFFFFFFFFFFFULL;
    603       default: vpanic("mkSzMask(amd64)");
    604    }
    605 }
    606 
    607 static Int imin ( Int a, Int b )
    608 {
    609    return (a < b) ? a : b;
    610 }
    611 
    612 static IRType szToITy ( Int n )
    613 {
    614    switch (n) {
    615       case 1: return Ity_I8;
    616       case 2: return Ity_I16;
    617       case 4: return Ity_I32;
    618       case 8: return Ity_I64;
    619       default: vex_printf("\nszToITy(%d)\n", n);
    620                vpanic("szToITy(amd64)");
    621    }
    622 }
    623 
    624 
    625 /*------------------------------------------------------------*/
    626 /*--- For dealing with prefixes.                           ---*/
    627 /*------------------------------------------------------------*/
    628 
    629 /* The idea is to pass around an int holding a bitmask summarising
    630    info from the prefixes seen on the current instruction, including
    631    info from the REX byte.  This info is used in various places, but
    632    most especially when making sense of register fields in
    633    instructions.
    634 
    635    The top 8 bits of the prefix are 0x55, just as a hacky way to
    636    ensure it really is a valid prefix.
    637 
    638    Things you can safely assume about a well-formed prefix:
    639    * at most one segment-override bit (CS,DS,ES,FS,GS,SS) is set.
    640    * if REX is not present then REXW,REXR,REXX,REXB will read
    641      as zero.
    642    * F2 and F3 will not both be 1.
    643 */
    644 
    645 typedef UInt  Prefix;
    646 
    647 #define PFX_ASO    (1<<0)    /* address-size override present (0x67) */
    648 #define PFX_66     (1<<1)    /* operand-size override-to-16 present (0x66) */
    649 #define PFX_REX    (1<<2)    /* REX byte present (0x40 to 0x4F) */
    650 #define PFX_REXW   (1<<3)    /* REX W bit, if REX present, else 0 */
    651 #define PFX_REXR   (1<<4)    /* REX R bit, if REX present, else 0 */
    652 #define PFX_REXX   (1<<5)    /* REX X bit, if REX present, else 0 */
    653 #define PFX_REXB   (1<<6)    /* REX B bit, if REX present, else 0 */
    654 #define PFX_LOCK   (1<<7)    /* bus LOCK prefix present (0xF0) */
    655 #define PFX_F2     (1<<8)    /* REP/REPE/REPZ prefix present (0xF2) */
    656 #define PFX_F3     (1<<9)    /* REPNE/REPNZ prefix present (0xF3) */
    657 #define PFX_CS     (1<<10)   /* CS segment prefix present (0x2E) */
    658 #define PFX_DS     (1<<11)   /* DS segment prefix present (0x3E) */
    659 #define PFX_ES     (1<<12)   /* ES segment prefix present (0x26) */
    660 #define PFX_FS     (1<<13)   /* FS segment prefix present (0x64) */
    661 #define PFX_GS     (1<<14)   /* GS segment prefix present (0x65) */
    662 #define PFX_SS     (1<<15)   /* SS segment prefix present (0x36) */
    663 #define PFX_VEX    (1<<16)   /* VEX prefix present (0xC4 or 0xC5) */
    664 #define PFX_VEXL   (1<<17)   /* VEX L bit, if VEX present, else 0 */
    665 /* The extra register field VEX.vvvv is encoded (after not-ing it) as
    666    PFX_VEXnV3 .. PFX_VEXnV0, so these must occupy adjacent bit
    667    positions. */
    668 #define PFX_VEXnV0 (1<<18)   /* ~VEX vvvv[0], if VEX present, else 0 */
    669 #define PFX_VEXnV1 (1<<19)   /* ~VEX vvvv[1], if VEX present, else 0 */
    670 #define PFX_VEXnV2 (1<<20)   /* ~VEX vvvv[2], if VEX present, else 0 */
    671 #define PFX_VEXnV3 (1<<21)   /* ~VEX vvvv[3], if VEX present, else 0 */
    672 
    673 
    674 #define PFX_EMPTY 0x55000000
    675 
/* Check the top byte of the prefix word carries the PFX_EMPTY
   signature, i.e. that this really is a Prefix value. */
static Bool IS_VALID_PFX ( Prefix pfx ) {
   return toBool((pfx & 0xFF000000) == PFX_EMPTY);
}

/* Was a REX byte present? */
static Bool haveREX ( Prefix pfx ) {
   return toBool(pfx & PFX_REX);
}

/* Extract the individual REX bits as 0 or 1. */
static Int getRexW ( Prefix pfx ) {
   return (pfx & PFX_REXW) ? 1 : 0;
}
static Int getRexR ( Prefix pfx ) {
   return (pfx & PFX_REXR) ? 1 : 0;
}
static Int getRexX ( Prefix pfx ) {
   return (pfx & PFX_REXX) ? 1 : 0;
}
static Int getRexB ( Prefix pfx ) {
   return (pfx & PFX_REXB) ? 1 : 0;
}
    696 
/* Check a prefix doesn't have F2 or F3 set in it, since usually that
   completely changes what instruction it really is. */
static Bool haveF2orF3 ( Prefix pfx ) {
   return toBool((pfx & (PFX_F2|PFX_F3)) > 0);
}
/* True iff both F2 and F3 are set. */
static Bool haveF2andF3 ( Prefix pfx ) {
   return toBool((pfx & (PFX_F2|PFX_F3)) == (PFX_F2|PFX_F3));
}
/* True iff F2 is set. */
static Bool haveF2 ( Prefix pfx ) {
   return toBool((pfx & PFX_F2) > 0);
}
/* True iff F3 is set. */
static Bool haveF3 ( Prefix pfx ) {
   return toBool((pfx & PFX_F3) > 0);
}

/* True iff the operand-size override (0x66) is set. */
static Bool have66 ( Prefix pfx ) {
   return toBool((pfx & PFX_66) > 0);
}
/* True iff the address-size override (0x67) is set. */
static Bool haveASO ( Prefix pfx ) {
   return toBool((pfx & PFX_ASO) > 0);
}
/* True iff the LOCK prefix (0xF0) is set. */
static Bool haveLOCK ( Prefix pfx ) {
   return toBool((pfx & PFX_LOCK) > 0);
}

/* Return True iff pfx has 66 set and F2 and F3 clear */
static Bool have66noF2noF3 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_66);
}

/* Return True iff pfx has F2 set and 66 and F3 clear */
static Bool haveF2no66noF3 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_F2);
}

/* Return True iff pfx has F3 set and 66 and F2 clear */
static Bool haveF3no66noF2 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_F3);
}

/* Return True iff pfx has F3 set and F2 clear */
static Bool haveF3noF2 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F3);
}

/* Return True iff pfx has F2 set and F3 clear */
static Bool haveF2noF3 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F2);
}

/* Return True iff pfx has 66, F2 and F3 clear */
static Bool haveNo66noF2noF3 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == 0);
}

/* Return True iff pfx has any of 66, F2 and F3 set */
static Bool have66orF2orF3 ( Prefix pfx )
{
  return toBool( ! haveNo66noF2noF3(pfx) );
}

/* Return True iff pfx has 66 or F3 set */
static Bool have66orF3 ( Prefix pfx )
{
   return toBool((pfx & (PFX_66|PFX_F3)) > 0);
}
    775 
/* Clear all the segment-override bits in a prefix. */
static Prefix clearSegBits ( Prefix p )
{
   return
      p & ~(PFX_CS | PFX_DS | PFX_ES | PFX_FS | PFX_GS | PFX_SS);
}

/* Get the (inverted, hence back to "normal") VEX.vvvv field.  The
   four PFX_VEXnV* bits are adjacent, so dividing by PFX_VEXnV0
   shifts them down to the bottom of the word. */
static UInt getVexNvvvv ( Prefix pfx ) {
   UInt r = (UInt)pfx;
   r /= (UInt)PFX_VEXnV0; /* pray this turns into a shift */
   return r & 0xF;
}

/* Was a VEX prefix (0xC4/0xC5) present? */
static Bool haveVEX ( Prefix pfx ) {
   return toBool(pfx & PFX_VEX);
}

/* Extract the VEX L bit as 0 or 1. */
static Int getVexL ( Prefix pfx ) {
   return (pfx & PFX_VEXL) ? 1 : 0;
}
    797 
    798 
    799 /*------------------------------------------------------------*/
    800 /*--- For dealing with escapes                             ---*/
    801 /*------------------------------------------------------------*/
    802 
    803 
    804 /* Escapes come after the prefixes, but before the primary opcode
    805    byte.  They escape the primary opcode byte into a bigger space.
    806    The 0xF0000000 isn't significant, except so as to make it not
    807    overlap valid Prefix values, for sanity checking.
    808 */
    809 
/* Which opcode-map escape sequence was seen, if any.  ESC_NONE's
   value of 0xF0000000 merely keeps the range disjoint from valid
   Prefix values, for sanity checking. */
typedef
   enum {
      ESC_NONE=0xF0000000, // none
      ESC_0F,              // 0F
      ESC_0F38,            // 0F 38
      ESC_0F3A             // 0F 3A
   }
   Escape;
    818 
    819 
    820 /*------------------------------------------------------------*/
    821 /*--- For dealing with integer registers                   ---*/
    822 /*------------------------------------------------------------*/
    823 
    824 /* This is somewhat complex.  The rules are:
    825 
    826    For 64, 32 and 16 bit register references, the e or g fields in the
    827    modrm bytes supply the low 3 bits of the register number.  The
    828    fourth (most-significant) bit of the register number is supplied by
    829    the REX byte, if it is present; else that bit is taken to be zero.
    830 
    831    The REX.R bit supplies the high bit corresponding to the g register
    832    field, and the REX.B bit supplies the high bit corresponding to the
    833    e register field (when the mod part of modrm indicates that modrm's
    834    e component refers to a register and not to memory).
    835 
    836    The REX.X bit supplies a high register bit for certain registers
    837    in SIB address modes, and is generally rarely used.
    838 
    839    For 8 bit register references, the presence of the REX byte itself
    840    has significance.  If there is no REX present, then the 3-bit
    841    number extracted from the modrm e or g field is treated as an index
    842    into the sequence %al %cl %dl %bl %ah %ch %dh %bh -- that is, the
    843    old x86 encoding scheme.
    844 
    845    But if there is a REX present, the register reference is
    846    interpreted in the same way as for 64/32/16-bit references: a high
    847    bit is extracted from REX, giving a 4-bit number, and the denoted
    848    register is the lowest 8 bits of the 16 integer registers denoted
    849    by the number.  In particular, values 3 through 7 of this sequence
    850    do not refer to %ah %ch %dh %bh but instead to the lowest 8 bits of
    851    %rsp %rbp %rsi %rdi.
    852 
    853    The REX.W bit has no bearing at all on register numbers.  Instead
    854    its presence indicates that the operand size is to be overridden
    855    from its default value (32 bits) to 64 bits instead.  This is in
    856    the same fashion that an 0x66 prefix indicates the operand size is
    857    to be overridden from 32 bits down to 16 bits.  When both REX.W and
    858    0x66 are present there is a conflict, and REX.W takes precedence.
    859 
    860    Rather than try to handle this complexity using a single huge
    861    function, several smaller ones are provided.  The aim is to make it
    862    as difficult as possible to screw up register decoding in a subtle
    863    and hard-to-track-down way.
    864 
    865    Because these routines fish around in the host's memory (that is,
    866    in the guest state area) for sub-parts of guest registers, their
    867    correctness depends on the host's endianness.  So far these
    868    routines only work for little-endian hosts.  Those for which
    869    endianness is important have assertions to ensure sanity.
    870 */
    871 
    872 
/* About the simplest question you can ask: where do the 64-bit
   integer registers live (in the guest state) ? */

/* Map an architectural register number (0 .. 15, standard amd64
   encoding order) to the byte offset of that 64-bit register in the
   guest state.  Panics on an out-of-range number. */
static Int integerGuestReg64Offset ( UInt reg )
{
   switch (reg) {
      case R_RAX: return OFFB_RAX;
      case R_RCX: return OFFB_RCX;
      case R_RDX: return OFFB_RDX;
      case R_RBX: return OFFB_RBX;
      case R_RSP: return OFFB_RSP;
      case R_RBP: return OFFB_RBP;
      case R_RSI: return OFFB_RSI;
      case R_RDI: return OFFB_RDI;
      case R_R8:  return OFFB_R8;
      case R_R9:  return OFFB_R9;
      case R_R10: return OFFB_R10;
      case R_R11: return OFFB_R11;
      case R_R12: return OFFB_R12;
      case R_R13: return OFFB_R13;
      case R_R14: return OFFB_R14;
      case R_R15: return OFFB_R15;
      default: vpanic("integerGuestReg64Offset(amd64)");
   }
}
    898 
    899 
/* Produce the name of an integer register, for printing purposes.
   reg is a number in the range 0 .. 15 that has been generated from a
   3-bit reg-field number and a REX extension bit.  irregular denotes
   the case where sz==1 and no REX byte is present. */

static
const HChar* nameIReg ( Int sz, UInt reg, Bool irregular )
{
   static const HChar* ireg64_names[16]
     = { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
         "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" };
   static const HChar* ireg32_names[16]
     = { "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
         "%r8d", "%r9d", "%r10d","%r11d","%r12d","%r13d","%r14d","%r15d" };
   static const HChar* ireg16_names[16]
     = { "%ax",  "%cx",  "%dx",  "%bx",  "%sp",  "%bp",  "%si",  "%di",
         "%r8w", "%r9w", "%r10w","%r11w","%r12w","%r13w","%r14w","%r15w" };
   static const HChar* ireg8_names[16]
     = { "%al",  "%cl",  "%dl",  "%bl",  "%spl", "%bpl", "%sil", "%dil",
         "%r8b", "%r9b", "%r10b","%r11b","%r12b","%r13b","%r14b","%r15b" };
   /* Old-style (no-REX) 8-bit bank: indices 4..7 denote the high
      bytes %ah %ch %dh %bh, not the low bytes of %rsp..%rdi. */
   static const HChar* ireg8_irregular[8]
     = { "%al", "%cl", "%dl", "%bl", "%ah", "%ch", "%dh", "%bh" };

   vassert(reg < 16);
   if (sz == 1) {
      /* the irregular table only has 8 entries */
      if (irregular)
         vassert(reg < 8);
   } else {
      vassert(irregular == False);
   }

   switch (sz) {
      case 8: return ireg64_names[reg];
      case 4: return ireg32_names[reg];
      case 2: return ireg16_names[reg];
      case 1: if (irregular) {
                 return ireg8_irregular[reg];
              } else {
                 return ireg8_names[reg];
              }
      default: vpanic("nameIReg(amd64)");
   }
}
    943 
    944 /* Using the same argument conventions as nameIReg, produce the
    945    guest state offset of an integer register. */
    946 
    947 static
    948 Int offsetIReg ( Int sz, UInt reg, Bool irregular )
    949 {
    950    vassert(reg < 16);
    951    if (sz == 1) {
    952       if (irregular)
    953          vassert(reg < 8);
    954    } else {
    955       vassert(irregular == False);
    956    }
    957 
    958    /* Deal with irregular case -- sz==1 and no REX present */
    959    if (sz == 1 && irregular) {
    960       switch (reg) {
    961          case R_RSP: return 1+ OFFB_RAX;
    962          case R_RBP: return 1+ OFFB_RCX;
    963          case R_RSI: return 1+ OFFB_RDX;
    964          case R_RDI: return 1+ OFFB_RBX;
    965          default:    break; /* use the normal case */
    966       }
    967    }
    968 
    969    /* Normal case */
    970    return integerGuestReg64Offset(reg);
    971 }
    972 
    973 
/* Read the %CL register :: Ity_I8, for shift/rotate operations. */

static IRExpr* getIRegCL ( void )
{
   /* %cl is the lowest byte of %rcx, so reading at OFFB_RCX is only
      correct on a little-endian host. */
   vassert(!host_is_bigendian);
   return IRExpr_Get( OFFB_RCX, Ity_I8 );
}
    981 
    982 
/* Write to the %AH register. */

static void putIRegAH ( IRExpr* e )
{
   /* %ah is the second-lowest byte of %rax, hence OFFB_RAX+1 on a
      little-endian host. */
   vassert(!host_is_bigendian);
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I8);
   stmt( IRStmt_Put( OFFB_RAX+1, e ) );
}
    991 
    992 
    993 /* Read/write various widths of %RAX, as it has various
    994    special-purpose uses. */
    995 
    996 static const HChar* nameIRegRAX ( Int sz )
    997 {
    998    switch (sz) {
    999       case 1: return "%al";
   1000       case 2: return "%ax";
   1001       case 4: return "%eax";
   1002       case 8: return "%rax";
   1003       default: vpanic("nameIRegRAX(amd64)");
   1004    }
   1005 }
   1006 
   1007 static IRExpr* getIRegRAX ( Int sz )
   1008 {
   1009    vassert(!host_is_bigendian);
   1010    switch (sz) {
   1011       case 1: return IRExpr_Get( OFFB_RAX, Ity_I8 );
   1012       case 2: return IRExpr_Get( OFFB_RAX, Ity_I16 );
   1013       case 4: return unop(Iop_64to32, IRExpr_Get( OFFB_RAX, Ity_I64 ));
   1014       case 8: return IRExpr_Get( OFFB_RAX, Ity_I64 );
   1015       default: vpanic("getIRegRAX(amd64)");
   1016    }
   1017 }
   1018 
   1019 static void putIRegRAX ( Int sz, IRExpr* e )
   1020 {
   1021    IRType ty = typeOfIRExpr(irsb->tyenv, e);
   1022    vassert(!host_is_bigendian);
   1023    switch (sz) {
   1024       case 8: vassert(ty == Ity_I64);
   1025               stmt( IRStmt_Put( OFFB_RAX, e ));
   1026               break;
   1027       case 4: vassert(ty == Ity_I32);
   1028               stmt( IRStmt_Put( OFFB_RAX, unop(Iop_32Uto64,e) ));
   1029               break;
   1030       case 2: vassert(ty == Ity_I16);
   1031               stmt( IRStmt_Put( OFFB_RAX, e ));
   1032               break;
   1033       case 1: vassert(ty == Ity_I8);
   1034               stmt( IRStmt_Put( OFFB_RAX, e ));
   1035               break;
   1036       default: vpanic("putIRegRAX(amd64)");
   1037    }
   1038 }
   1039 
   1040 
   1041 /* Read/write various widths of %RDX, as it has various
   1042    special-purpose uses. */
   1043 
   1044 static const HChar* nameIRegRDX ( Int sz )
   1045 {
   1046    switch (sz) {
   1047       case 1: return "%dl";
   1048       case 2: return "%dx";
   1049       case 4: return "%edx";
   1050       case 8: return "%rdx";
   1051       default: vpanic("nameIRegRDX(amd64)");
   1052    }
   1053 }
   1054 
   1055 static IRExpr* getIRegRDX ( Int sz )
   1056 {
   1057    vassert(!host_is_bigendian);
   1058    switch (sz) {
   1059       case 1: return IRExpr_Get( OFFB_RDX, Ity_I8 );
   1060       case 2: return IRExpr_Get( OFFB_RDX, Ity_I16 );
   1061       case 4: return unop(Iop_64to32, IRExpr_Get( OFFB_RDX, Ity_I64 ));
   1062       case 8: return IRExpr_Get( OFFB_RDX, Ity_I64 );
   1063       default: vpanic("getIRegRDX(amd64)");
   1064    }
   1065 }
   1066 
   1067 static void putIRegRDX ( Int sz, IRExpr* e )
   1068 {
   1069    vassert(!host_is_bigendian);
   1070    vassert(typeOfIRExpr(irsb->tyenv, e) == szToITy(sz));
   1071    switch (sz) {
   1072       case 8: stmt( IRStmt_Put( OFFB_RDX, e ));
   1073               break;
   1074       case 4: stmt( IRStmt_Put( OFFB_RDX, unop(Iop_32Uto64,e) ));
   1075               break;
   1076       case 2: stmt( IRStmt_Put( OFFB_RDX, e ));
   1077               break;
   1078       case 1: stmt( IRStmt_Put( OFFB_RDX, e ));
   1079               break;
   1080       default: vpanic("putIRegRDX(amd64)");
   1081    }
   1082 }
   1083 
   1084 
/* Simplistic functions to deal with the integer registers as a
   straightforward bank of 16 64-bit regs. */

/* Read 64-bit integer register 'regno' (0 .. 15) in full. */
static IRExpr* getIReg64 ( UInt regno )
{
   return IRExpr_Get( integerGuestReg64Offset(regno),
                      Ity_I64 );
}
   1093 
/* Write 64-bit integer register 'regno' (0 .. 15) in full. */
static void putIReg64 ( UInt regno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   stmt( IRStmt_Put( integerGuestReg64Offset(regno), e ) );
}
   1099 
/* Name of 64-bit integer register 'regno' (0 .. 15). */
static const HChar* nameIReg64 ( UInt regno )
{
   return nameIReg( 8, regno, False );
}
   1104 
   1105 
/* Simplistic functions to deal with the lower halves of integer
   registers as a straightforward bank of 16 32-bit regs. */

/* Read the low 32 bits of integer register 'regno', by reading the
   full 64-bit register and narrowing. */
static IRExpr* getIReg32 ( UInt regno )
{
   vassert(!host_is_bigendian);
   return unop(Iop_64to32,
               IRExpr_Get( integerGuestReg64Offset(regno),
                           Ity_I64 ));
}
   1116 
/* Write the low 32 bits of integer register 'regno', zero-extending
   into the upper half per the amd64 rule. */
static void putIReg32 ( UInt regno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   stmt( IRStmt_Put( integerGuestReg64Offset(regno),
                     unop(Iop_32Uto64,e) ) );
}
   1123 
/* Name of the 32-bit view of integer register 'regno'. */
static const HChar* nameIReg32 ( UInt regno )
{
   return nameIReg( 4, regno, False );
}
   1128 
   1129 
/* Simplistic functions to deal with the lower quarters of integer
   registers as a straightforward bank of 16 16-bit regs. */

/* Read the low 16 bits of integer register 'regno'. */
static IRExpr* getIReg16 ( UInt regno )
{
   vassert(!host_is_bigendian);
   return IRExpr_Get( integerGuestReg64Offset(regno),
                      Ity_I16 );
}
   1139 
/* Write integer register 'regno' with a 16-bit value, widened to 64
   bits.  NOTE(review): unlike real 16-bit writes, this zeroes the
   upper 48 bits of the register -- presumably callers rely on that;
   confirm before reusing for a plain 16-bit register write. */
static void putIReg16 ( UInt regno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
   stmt( IRStmt_Put( integerGuestReg64Offset(regno),
                     unop(Iop_16Uto64,e) ) );
}
   1146 
/* Name of the 16-bit view of integer register 'regno'. */
static const HChar* nameIReg16 ( UInt regno )
{
   return nameIReg( 2, regno, False );
}
   1151 
   1152 
/* Sometimes what we know is a 3-bit register number, a REX byte, and
   which field of the REX byte is to be used to extend to a 4-bit
   number.  These functions cater for that situation.
*/
/* Read the 64-bit register denoted by 'lo3bits' extended with the
   REX.X bit as the fourth (high) bit. */
static IRExpr* getIReg64rexX ( Prefix pfx, UInt lo3bits )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   return getIReg64( lo3bits | (getRexX(pfx) << 3) );
}
   1163 
/* Name of the 64-bit register denoted by 'lo3bits' extended with the
   REX.X bit. */
static const HChar* nameIReg64rexX ( Prefix pfx, UInt lo3bits )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   return nameIReg( 8, lo3bits | (getRexX(pfx) << 3), False );
}
   1170 
/* Name of the sz-byte register denoted by 'lo3bits' extended with the
   REX.B bit.  When sz==1 and no REX byte is present the old-style
   %ah/%ch/%dh/%bh naming applies (the 'irregular' case). */
static const HChar* nameIRegRexB ( Int sz, Prefix pfx, UInt lo3bits )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   return nameIReg( sz, lo3bits | (getRexB(pfx) << 3),
                        toBool(sz==1 && !haveREX(pfx)) );
}
   1179 
   1180 static IRExpr* getIRegRexB ( Int sz, Prefix pfx, UInt lo3bits )
   1181 {
   1182    vassert(lo3bits < 8);
   1183    vassert(IS_VALID_PFX(pfx));
   1184    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   1185    if (sz == 4) {
   1186       sz = 8;
   1187       return unop(Iop_64to32,
   1188                   IRExpr_Get(
   1189                      offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
   1190                                      False/*!irregular*/ ),
   1191                      szToITy(sz)
   1192                  )
   1193              );
   1194    } else {
   1195       return IRExpr_Get(
   1196                 offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
   1197                                 toBool(sz==1 && !haveREX(pfx)) ),
   1198                 szToITy(sz)
   1199              );
   1200    }
   1201 }
   1202 
   1203 static void putIRegRexB ( Int sz, Prefix pfx, UInt lo3bits, IRExpr* e )
   1204 {
   1205    vassert(lo3bits < 8);
   1206    vassert(IS_VALID_PFX(pfx));
   1207    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   1208    vassert(typeOfIRExpr(irsb->tyenv, e) == szToITy(sz));
   1209    stmt( IRStmt_Put(
   1210             offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
   1211                             toBool(sz==1 && !haveREX(pfx)) ),
   1212             sz==4 ? unop(Iop_32Uto64,e) : e
   1213    ));
   1214 }
   1215 
   1216 
   1217 /* Functions for getting register numbers from modrm bytes and REX
   1218    when we don't have to consider the complexities of integer subreg
   1219    accesses.
   1220 */
   1221 /* Extract the g reg field from a modRM byte, and augment it using the
   1222    REX.R bit from the supplied REX byte.  The R bit usually is
   1223    associated with the g register field.
   1224 */
   1225 static UInt gregOfRexRM ( Prefix pfx, UChar mod_reg_rm )
   1226 {
   1227    Int reg = (Int)( (mod_reg_rm >> 3) & 7 );
   1228    reg += (pfx & PFX_REXR) ? 8 : 0;
   1229    return reg;
   1230 }
   1231 
   1232 /* Extract the e reg field from a modRM byte, and augment it using the
   1233    REX.B bit from the supplied REX byte.  The B bit usually is
   1234    associated with the e register field (when modrm indicates e is a
   1235    register, that is).
   1236 */
   1237 static UInt eregOfRexRM ( Prefix pfx, UChar mod_reg_rm )
   1238 {
   1239    Int rm;
   1240    vassert(epartIsReg(mod_reg_rm));
   1241    rm = (Int)(mod_reg_rm & 0x7);
   1242    rm += (pfx & PFX_REXB) ? 8 : 0;
   1243    return rm;
   1244 }
   1245 
   1246 
   1247 /* General functions for dealing with integer register access. */
   1248 
   1249 /* Produce the guest state offset for a reference to the 'g' register
   1250    field in a modrm byte, taking into account REX (or its absence),
   1251    and the size of the access.
   1252 */
   1253 static UInt offsetIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
   1254 {
   1255    UInt reg;
   1256    vassert(!host_is_bigendian);
   1257    vassert(IS_VALID_PFX(pfx));
   1258    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   1259    reg = gregOfRexRM( pfx, mod_reg_rm );
   1260    return offsetIReg( sz, reg, toBool(sz == 1 && !haveREX(pfx)) );
   1261 }
   1262 
   1263 static
   1264 IRExpr* getIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
   1265 {
   1266    if (sz == 4) {
   1267       sz = 8;
   1268       return unop(Iop_64to32,
   1269                   IRExpr_Get( offsetIRegG( sz, pfx, mod_reg_rm ),
   1270                               szToITy(sz) ));
   1271    } else {
   1272       return IRExpr_Get( offsetIRegG( sz, pfx, mod_reg_rm ),
   1273                          szToITy(sz) );
   1274    }
   1275 }
   1276 
   1277 static
   1278 void putIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm, IRExpr* e )
   1279 {
   1280    vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   1281    if (sz == 4) {
   1282       e = unop(Iop_32Uto64,e);
   1283    }
   1284    stmt( IRStmt_Put( offsetIRegG( sz, pfx, mod_reg_rm ), e ) );
   1285 }
   1286 
/* Name of the 'g' register of a modrm byte, at the given size. */
static
const HChar* nameIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   return nameIReg( sz, gregOfRexRM(pfx,mod_reg_rm),
                        toBool(sz==1 && !haveREX(pfx)) );
}
   1293 
   1294 
   1295 static
   1296 IRExpr* getIRegV ( Int sz, Prefix pfx )
   1297 {
   1298    if (sz == 4) {
   1299       sz = 8;
   1300       return unop(Iop_64to32,
   1301                   IRExpr_Get( offsetIReg( sz, getVexNvvvv(pfx), False ),
   1302                               szToITy(sz) ));
   1303    } else {
   1304       return IRExpr_Get( offsetIReg( sz, getVexNvvvv(pfx), False ),
   1305                          szToITy(sz) );
   1306    }
   1307 }
   1308 
   1309 static
   1310 void putIRegV ( Int sz, Prefix pfx, IRExpr* e )
   1311 {
   1312    vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   1313    if (sz == 4) {
   1314       e = unop(Iop_32Uto64,e);
   1315    }
   1316    stmt( IRStmt_Put( offsetIReg( sz, getVexNvvvv(pfx), False ), e ) );
   1317 }
   1318 
/* Name of the register selected by the VEX.vvvv field, at the given
   size. */
static
const HChar* nameIRegV ( Int sz, Prefix pfx )
{
   return nameIReg( sz, getVexNvvvv(pfx), False );
}
   1324 
   1325 
   1326 
   1327 /* Produce the guest state offset for a reference to the 'e' register
   1328    field in a modrm byte, taking into account REX (or its absence),
   1329    and the size of the access.  eregOfRexRM will assert if mod_reg_rm
   1330    denotes a memory access rather than a register access.
   1331 */
   1332 static UInt offsetIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
   1333 {
   1334    UInt reg;
   1335    vassert(!host_is_bigendian);
   1336    vassert(IS_VALID_PFX(pfx));
   1337    vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   1338    reg = eregOfRexRM( pfx, mod_reg_rm );
   1339    return offsetIReg( sz, reg, toBool(sz == 1 && !haveREX(pfx)) );
   1340 }
   1341 
   1342 static
   1343 IRExpr* getIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
   1344 {
   1345    if (sz == 4) {
   1346       sz = 8;
   1347       return unop(Iop_64to32,
   1348                   IRExpr_Get( offsetIRegE( sz, pfx, mod_reg_rm ),
   1349                               szToITy(sz) ));
   1350    } else {
   1351       return IRExpr_Get( offsetIRegE( sz, pfx, mod_reg_rm ),
   1352                          szToITy(sz) );
   1353    }
   1354 }
   1355 
   1356 static
   1357 void putIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm, IRExpr* e )
   1358 {
   1359    vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   1360    if (sz == 4) {
   1361       e = unop(Iop_32Uto64,e);
   1362    }
   1363    stmt( IRStmt_Put( offsetIRegE( sz, pfx, mod_reg_rm ), e ) );
   1364 }
   1365 
/* Name of the 'e' register of a modrm byte (register form only), at
   the given size. */
static
const HChar* nameIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   return nameIReg( sz, eregOfRexRM(pfx,mod_reg_rm),
                        toBool(sz==1 && !haveREX(pfx)) );
}
   1372 
   1373 
   1374 /*------------------------------------------------------------*/
   1375 /*--- For dealing with XMM registers                       ---*/
   1376 /*------------------------------------------------------------*/
   1377 
/* Map a YMM register number (0 .. 15) to the byte offset of that
   register in the guest state.  Panics on an out-of-range number. */
static Int ymmGuestRegOffset ( UInt ymmreg )
{
   switch (ymmreg) {
      case 0:  return OFFB_YMM0;
      case 1:  return OFFB_YMM1;
      case 2:  return OFFB_YMM2;
      case 3:  return OFFB_YMM3;
      case 4:  return OFFB_YMM4;
      case 5:  return OFFB_YMM5;
      case 6:  return OFFB_YMM6;
      case 7:  return OFFB_YMM7;
      case 8:  return OFFB_YMM8;
      case 9:  return OFFB_YMM9;
      case 10: return OFFB_YMM10;
      case 11: return OFFB_YMM11;
      case 12: return OFFB_YMM12;
      case 13: return OFFB_YMM13;
      case 14: return OFFB_YMM14;
      case 15: return OFFB_YMM15;
      default: vpanic("ymmGuestRegOffset(amd64)");
   }
}
   1400 
/* Offset of XMM register 'xmmreg': the low half of the corresponding
   YMM register, so the same offset on a little-endian host. */
static Int xmmGuestRegOffset ( UInt xmmreg )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   return ymmGuestRegOffset( xmmreg );
}
   1407 
/* Lanes of vector registers are always numbered from zero being the
   least significant lane (rightmost in the register).  */

/* Offset of 16-bit lane 'laneno' (0 .. 7) of XMM register 'xmmreg'. */
static Int xmmGuestRegLane16offset ( UInt xmmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 8);
   return xmmGuestRegOffset( xmmreg ) + 2 * laneno;
}
   1418 
/* Offset of 32-bit lane 'laneno' (0 .. 3) of XMM register 'xmmreg'. */
static Int xmmGuestRegLane32offset ( UInt xmmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 4);
   return xmmGuestRegOffset( xmmreg ) + 4 * laneno;
}
   1426 
/* Offset of 64-bit lane 'laneno' (0 or 1) of XMM register 'xmmreg'. */
static Int xmmGuestRegLane64offset ( UInt xmmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 2);
   return xmmGuestRegOffset( xmmreg ) + 8 * laneno;
}
   1434 
/* Offset of 128-bit lane 'laneno' (0 or 1) of YMM register 'ymmreg'. */
static Int ymmGuestRegLane128offset ( UInt ymmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 2);
   return ymmGuestRegOffset( ymmreg ) + 16 * laneno;
}
   1442 
/* Offset of 64-bit lane 'laneno' (0 .. 3) of YMM register 'ymmreg'. */
static Int ymmGuestRegLane64offset ( UInt ymmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 4);
   return ymmGuestRegOffset( ymmreg ) + 8 * laneno;
}
   1450 
/* Offset of 32-bit lane 'laneno' (0 .. 7) of YMM register 'ymmreg'. */
static Int ymmGuestRegLane32offset ( UInt ymmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 8);
   return ymmGuestRegOffset( ymmreg ) + 4 * laneno;
}
   1458 
/* Read XMM register 'xmmreg' in its entirety, as a V128. */
static IRExpr* getXMMReg ( UInt xmmreg )
{
   return IRExpr_Get( xmmGuestRegOffset(xmmreg), Ity_V128 );
}
   1463 
/* Read 64-bit lane 'laneno' of XMM register 'xmmreg' as an I64. */
static IRExpr* getXMMRegLane64 ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_I64 );
}
   1468 
/* Read 64-bit lane 'laneno' of XMM register 'xmmreg' as an F64. */
static IRExpr* getXMMRegLane64F ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_F64 );
}
   1473 
/* Read 32-bit lane 'laneno' of XMM register 'xmmreg' as an I32. */
static IRExpr* getXMMRegLane32 ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_I32 );
}
   1478 
/* Read 32-bit lane 'laneno' of XMM register 'xmmreg' as an F32. */
static IRExpr* getXMMRegLane32F ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_F32 );
}
   1483 
/* Read 16-bit lane 'laneno' of XMM register 'xmmreg' as an I16. */
static IRExpr* getXMMRegLane16 ( UInt xmmreg, Int laneno )
{
  return IRExpr_Get( xmmGuestRegLane16offset(xmmreg,laneno), Ity_I16 );
}
   1488 
/* Write XMM register 'xmmreg' in its entirety.  The upper half of the
   containing YMM register is untouched. */
static void putXMMReg ( UInt xmmreg, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
   stmt( IRStmt_Put( xmmGuestRegOffset(xmmreg), e ) );
}
   1494 
/* Write 64-bit lane 'laneno' of XMM register 'xmmreg' with an I64. */
static void putXMMRegLane64 ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
}
   1500 
/* Write 64-bit lane 'laneno' of XMM register 'xmmreg' with an F64. */
static void putXMMRegLane64F ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
   stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
}
   1506 
/* Write 32-bit lane 'laneno' of XMM register 'xmmreg' with an F32. */
static void putXMMRegLane32F ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
   stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
}
   1512 
/* Write 32-bit lane 'laneno' of XMM register 'xmmreg' with an I32. */
static void putXMMRegLane32 ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
}
   1518 
/* Read YMM register 'xmmreg' in its entirety, as a V256.  (The
   parameter is a YMM register number despite its name.) */
static IRExpr* getYMMReg ( UInt xmmreg )
{
   return IRExpr_Get( ymmGuestRegOffset(xmmreg), Ity_V256 );
}
   1523 
/* Read 128-bit lane 'laneno' of YMM register 'ymmreg' as a V128. */
static IRExpr* getYMMRegLane128 ( UInt ymmreg, Int laneno )
{
   return IRExpr_Get( ymmGuestRegLane128offset(ymmreg,laneno), Ity_V128 );
}
   1528 
/* Read 64-bit lane 'laneno' of YMM register 'ymmreg' as an I64. */
static IRExpr* getYMMRegLane64 ( UInt ymmreg, Int laneno )
{
   return IRExpr_Get( ymmGuestRegLane64offset(ymmreg,laneno), Ity_I64 );
}
   1533 
/* Read 32-bit lane 'laneno' of YMM register 'ymmreg' as an I32. */
static IRExpr* getYMMRegLane32 ( UInt ymmreg, Int laneno )
{
   return IRExpr_Get( ymmGuestRegLane32offset(ymmreg,laneno), Ity_I32 );
}
   1538 
/* Write YMM register 'ymmreg' in its entirety. */
static void putYMMReg ( UInt ymmreg, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V256);
   stmt( IRStmt_Put( ymmGuestRegOffset(ymmreg), e ) );
}
   1544 
/* Write 128-bit lane 'laneno' of YMM register 'ymmreg' with a V128. */
static void putYMMRegLane128 ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
   stmt( IRStmt_Put( ymmGuestRegLane128offset(ymmreg,laneno), e ) );
}
   1550 
/* Write 64-bit lane 'laneno' of YMM register 'ymmreg' with an F64. */
static void putYMMRegLane64F ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
   stmt( IRStmt_Put( ymmGuestRegLane64offset(ymmreg,laneno), e ) );
}
   1556 
/* Write 64-bit lane 'laneno' of YMM register 'ymmreg' with an I64. */
static void putYMMRegLane64 ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   stmt( IRStmt_Put( ymmGuestRegLane64offset(ymmreg,laneno), e ) );
}
   1562 
/* Write 32-bit lane 'laneno' of YMM register 'ymmreg' with an F32. */
static void putYMMRegLane32F ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
   stmt( IRStmt_Put( ymmGuestRegLane32offset(ymmreg,laneno), e ) );
}
   1568 
/* Write 32-bit lane 'laneno' of YMM register 'ymmreg' with an I32. */
static void putYMMRegLane32 ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   stmt( IRStmt_Put( ymmGuestRegLane32offset(ymmreg,laneno), e ) );
}
   1574 
/* Make a V128 constant from a 16-bit lane mask. */
static IRExpr* mkV128 ( UShort mask )
{
   return IRExpr_Const(IRConst_V128(mask));
}
   1579 
/* Write the low half of a YMM reg and zero out the upper half, as
   VEX-encoded instructions do. */
static void putYMMRegLoAndZU ( UInt ymmreg, IRExpr* e )
{
   putYMMRegLane128( ymmreg, 0, e );
   putYMMRegLane128( ymmreg, 1, mkV128(0) );
}
   1586 
   1587 static IRExpr* mkAnd1 ( IRExpr* x, IRExpr* y )
   1588 {
   1589    vassert(typeOfIRExpr(irsb->tyenv,x) == Ity_I1);
   1590    vassert(typeOfIRExpr(irsb->tyenv,y) == Ity_I1);
   1591    return unop(Iop_64to1,
   1592                binop(Iop_And64,
   1593                      unop(Iop_1Uto64,x),
   1594                      unop(Iop_1Uto64,y)));
   1595 }
   1596 
   1597 /* Generate a compare-and-swap operation, operating on memory at
   1598    'addr'.  The expected value is 'expVal' and the new value is
   1599    'newVal'.  If the operation fails, then transfer control (with a
   1600    no-redir jump (XXX no -- see comment at top of this file)) to
   1601    'restart_point', which is presumably the address of the guest
   1602    instruction again -- retrying, essentially. */
   1603 static void casLE ( IRExpr* addr, IRExpr* expVal, IRExpr* newVal,
   1604                     Addr64 restart_point )
   1605 {
   1606    IRCAS* cas;
   1607    IRType tyE    = typeOfIRExpr(irsb->tyenv, expVal);
   1608    IRType tyN    = typeOfIRExpr(irsb->tyenv, newVal);
   1609    IRTemp oldTmp = newTemp(tyE);
   1610    IRTemp expTmp = newTemp(tyE);
   1611    vassert(tyE == tyN);
   1612    vassert(tyE == Ity_I64 || tyE == Ity_I32
   1613            || tyE == Ity_I16 || tyE == Ity_I8);
   1614    assign(expTmp, expVal);
   1615    cas = mkIRCAS( IRTemp_INVALID, oldTmp, Iend_LE, addr,
   1616                   NULL, mkexpr(expTmp), NULL, newVal );
   1617    stmt( IRStmt_CAS(cas) );
   1618    stmt( IRStmt_Exit(
   1619             binop( mkSizedOp(tyE,Iop_CasCmpNE8),
   1620                    mkexpr(oldTmp), mkexpr(expTmp) ),
   1621             Ijk_Boring, /*Ijk_NoRedir*/
   1622             IRConst_U64( restart_point ),
   1623             OFFB_RIP
   1624          ));
   1625 }
   1626 
   1627 
   1628 /*------------------------------------------------------------*/
   1629 /*--- Helpers for %rflags.                                 ---*/
   1630 /*------------------------------------------------------------*/
   1631 
   1632 /* -------------- Evaluating the flags-thunk. -------------- */
   1633 
   1634 /* Build IR to calculate all the eflags from stored
   1635    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
   1636    Ity_I64. */
   1637 static IRExpr* mk_amd64g_calculate_rflags_all ( void )
   1638 {
   1639    IRExpr** args
   1640       = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
   1641                        IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
   1642                        IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
   1643                        IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   1644    IRExpr* call
   1645       = mkIRExprCCall(
   1646            Ity_I64,
   1647            0/*regparm*/,
   1648            "amd64g_calculate_rflags_all", &amd64g_calculate_rflags_all,
   1649            args
   1650         );
   1651    /* Exclude OP and NDEP from definedness checking.  We're only
   1652       interested in DEP1 and DEP2. */
   1653    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   1654    return call;
   1655 }
   1656 
   1657 /* Build IR to calculate some particular condition from stored
   1658    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
   1659    Ity_Bit. */
   1660 static IRExpr* mk_amd64g_calculate_condition ( AMD64Condcode cond )
   1661 {
   1662    IRExpr** args
   1663       = mkIRExprVec_5( mkU64(cond),
   1664                        IRExpr_Get(OFFB_CC_OP,   Ity_I64),
   1665                        IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
   1666                        IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
   1667                        IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   1668    IRExpr* call
   1669       = mkIRExprCCall(
   1670            Ity_I64,
   1671            0/*regparm*/,
   1672            "amd64g_calculate_condition", &amd64g_calculate_condition,
   1673            args
   1674         );
   1675    /* Exclude the requested condition, OP and NDEP from definedness
   1676       checking.  We're only interested in DEP1 and DEP2. */
   1677    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<1) | (1<<4);
   1678    return unop(Iop_64to1, call);
   1679 }
   1680 
   1681 /* Build IR to calculate just the carry flag from stored
   1682    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression :: Ity_I64. */
   1683 static IRExpr* mk_amd64g_calculate_rflags_c ( void )
   1684 {
   1685    IRExpr** args
   1686       = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
   1687                        IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
   1688                        IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
   1689                        IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   1690    IRExpr* call
   1691       = mkIRExprCCall(
   1692            Ity_I64,
   1693            0/*regparm*/,
   1694            "amd64g_calculate_rflags_c", &amd64g_calculate_rflags_c,
   1695            args
   1696         );
   1697    /* Exclude OP and NDEP from definedness checking.  We're only
   1698       interested in DEP1 and DEP2. */
   1699    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   1700    return call;
   1701 }
   1702 
   1703 
   1704 /* -------------- Building the flags-thunk. -------------- */
   1705 
   1706 /* The machinery in this section builds the flag-thunk following a
   1707    flag-setting operation.  Hence the various setFlags_* functions.
   1708 */
   1709 
   1710 static Bool isAddSub ( IROp op8 )
   1711 {
   1712    return toBool(op8 == Iop_Add8 || op8 == Iop_Sub8);
   1713 }
   1714 
   1715 static Bool isLogic ( IROp op8 )
   1716 {
   1717    return toBool(op8 == Iop_And8 || op8 == Iop_Or8 || op8 == Iop_Xor8);
   1718 }
   1719 
   1720 /* U-widen 1/8/16/32/64 bit int expr to 64. */
   1721 static IRExpr* widenUto64 ( IRExpr* e )
   1722 {
   1723    switch (typeOfIRExpr(irsb->tyenv,e)) {
   1724       case Ity_I64: return e;
   1725       case Ity_I32: return unop(Iop_32Uto64, e);
   1726       case Ity_I16: return unop(Iop_16Uto64, e);
   1727       case Ity_I8:  return unop(Iop_8Uto64, e);
   1728       case Ity_I1:  return unop(Iop_1Uto64, e);
   1729       default: vpanic("widenUto64");
   1730    }
   1731 }
   1732 
/* S-widen 8/16/32/64 bit int expr to 64. */
static IRExpr* widenSto64 ( IRExpr* e )
{
   switch (typeOfIRExpr(irsb->tyenv,e)) {
      case Ity_I64: return e;
      case Ity_I32: return unop(Iop_32Sto64, e);
      case Ity_I16: return unop(Iop_16Sto64, e);
      case Ity_I8:  return unop(Iop_8Sto64, e);
      default: vpanic("widenSto64");
   }
}
   1744 
   1745 /* Narrow 8/16/32/64 bit int expr to 8/16/32/64.  Clearly only some
   1746    of these combinations make sense. */
   1747 static IRExpr* narrowTo ( IRType dst_ty, IRExpr* e )
   1748 {
   1749    IRType src_ty = typeOfIRExpr(irsb->tyenv,e);
   1750    if (src_ty == dst_ty)
   1751       return e;
   1752    if (src_ty == Ity_I32 && dst_ty == Ity_I16)
   1753       return unop(Iop_32to16, e);
   1754    if (src_ty == Ity_I32 && dst_ty == Ity_I8)
   1755       return unop(Iop_32to8, e);
   1756    if (src_ty == Ity_I64 && dst_ty == Ity_I32)
   1757       return unop(Iop_64to32, e);
   1758    if (src_ty == Ity_I64 && dst_ty == Ity_I16)
   1759       return unop(Iop_64to16, e);
   1760    if (src_ty == Ity_I64 && dst_ty == Ity_I8)
   1761       return unop(Iop_64to8, e);
   1762 
   1763    vex_printf("\nsrc, dst tys are: ");
   1764    ppIRType(src_ty);
   1765    vex_printf(", ");
   1766    ppIRType(dst_ty);
   1767    vex_printf("\n");
   1768    vpanic("narrowTo(amd64)");
   1769 }
   1770 
   1771 
   1772 /* Set the flags thunk OP, DEP1 and DEP2 fields.  The supplied op is
   1773    auto-sized up to the real op. */
   1774 
   1775 static
   1776 void setFlags_DEP1_DEP2 ( IROp op8, IRTemp dep1, IRTemp dep2, IRType ty )
   1777 {
   1778    Int ccOp = 0;
   1779    switch (ty) {
   1780       case Ity_I8:  ccOp = 0; break;
   1781       case Ity_I16: ccOp = 1; break;
   1782       case Ity_I32: ccOp = 2; break;
   1783       case Ity_I64: ccOp = 3; break;
   1784       default: vassert(0);
   1785    }
   1786    switch (op8) {
   1787       case Iop_Add8: ccOp += AMD64G_CC_OP_ADDB;   break;
   1788       case Iop_Sub8: ccOp += AMD64G_CC_OP_SUBB;   break;
   1789       default:       ppIROp(op8);
   1790                      vpanic("setFlags_DEP1_DEP2(amd64)");
   1791    }
   1792    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   1793    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dep1))) );
   1794    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(dep2))) );
   1795 }
   1796 
   1797 
   1798 /* Set the OP and DEP1 fields only, and write zero to DEP2. */
   1799 
   1800 static
   1801 void setFlags_DEP1 ( IROp op8, IRTemp dep1, IRType ty )
   1802 {
   1803    Int ccOp = 0;
   1804    switch (ty) {
   1805       case Ity_I8:  ccOp = 0; break;
   1806       case Ity_I16: ccOp = 1; break;
   1807       case Ity_I32: ccOp = 2; break;
   1808       case Ity_I64: ccOp = 3; break;
   1809       default: vassert(0);
   1810    }
   1811    switch (op8) {
   1812       case Iop_Or8:
   1813       case Iop_And8:
   1814       case Iop_Xor8: ccOp += AMD64G_CC_OP_LOGICB; break;
   1815       default:       ppIROp(op8);
   1816                      vpanic("setFlags_DEP1(amd64)");
   1817    }
   1818    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   1819    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dep1))) );
   1820    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
   1821 }
   1822 
   1823 
   1824 /* For shift operations, we put in the result and the undershifted
   1825    result.  Except if the shift amount is zero, the thunk is left
   1826    unchanged. */
   1827 
static void setFlags_DEP1_DEP2_shift ( IROp    op64,
                                       IRTemp  res,
                                       IRTemp  resUS,
                                       IRType  ty,
                                       IRTemp  guard )
{
   Int ccOp = 0;
   switch (ty) {
      case Ity_I8:  ccOp = 0; break;
      case Ity_I16: ccOp = 1; break;
      case Ity_I32: ccOp = 2; break;
      case Ity_I64: ccOp = 3; break;
      default: vassert(0);
   }

   vassert(guard);

   /* Both kinds of right shifts are handled by the same thunk
      operation. */
   switch (op64) {
      case Iop_Shr64:
      case Iop_Sar64: ccOp += AMD64G_CC_OP_SHRB; break;
      case Iop_Shl64: ccOp += AMD64G_CC_OP_SHLB; break;
      default:        ppIROp(op64);
                      vpanic("setFlags_DEP1_DEP2_shift(amd64)");
   }

   /* guard :: Ity_I8.  We need to convert it to I1. */
   IRTemp guardB = newTemp(Ity_I1);
   assign( guardB, binop(Iop_CmpNE8, mkexpr(guard), mkU8(0)) );

   /* DEP1 contains the result, DEP2 contains the undershifted value. */
   /* Each thunk field is written conditionally: if the shift amount
      was zero (guardB false), the ITE keeps the field's previous
      value, so the thunk is left entirely unchanged. */
   stmt( IRStmt_Put( OFFB_CC_OP,
                     IRExpr_ITE( mkexpr(guardB),
                                 mkU64(ccOp),
                                 IRExpr_Get(OFFB_CC_OP,Ity_I64) ) ));
   stmt( IRStmt_Put( OFFB_CC_DEP1,
                     IRExpr_ITE( mkexpr(guardB),
                                 widenUto64(mkexpr(res)),
                                 IRExpr_Get(OFFB_CC_DEP1,Ity_I64) ) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2,
                     IRExpr_ITE( mkexpr(guardB),
                                 widenUto64(mkexpr(resUS)),
                                 IRExpr_Get(OFFB_CC_DEP2,Ity_I64) ) ));
}
   1873 
   1874 
   1875 /* For the inc/dec case, we store in DEP1 the result value and in NDEP
   1876    the former value of the carry flag, which unfortunately we have to
   1877    compute. */
   1878 
static void setFlags_INC_DEC ( Bool inc, IRTemp res, IRType ty )
{
   /* Record an INC or DEC in the flags thunk.  DEP1 holds the widened
      result; NDEP holds the carry flag's previous value, recomputed
      here from the existing thunk. */
   Int ccOp = inc ? AMD64G_CC_OP_INCB : AMD64G_CC_OP_DECB;

   switch (ty) {
      case Ity_I8:  ccOp += 0; break;
      case Ity_I16: ccOp += 1; break;
      case Ity_I32: ccOp += 2; break;
      case Ity_I64: ccOp += 3; break;
      default: vassert(0);
   }

   /* This has to come first, because calculating the C flag
      may require reading all four thunk fields. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mk_amd64g_calculate_rflags_c()) );
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(res))) );
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
}
   1898 
   1899 
   1900 /* Multiplies are pretty much like add and sub: DEP1 and DEP2 hold the
   1901    two arguments. */
   1902 
   1903 static
   1904 void setFlags_MUL ( IRType ty, IRTemp arg1, IRTemp arg2, ULong base_op )
   1905 {
   1906    switch (ty) {
   1907       case Ity_I8:
   1908          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+0) ) );
   1909          break;
   1910       case Ity_I16:
   1911          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+1) ) );
   1912          break;
   1913       case Ity_I32:
   1914          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+2) ) );
   1915          break;
   1916       case Ity_I64:
   1917          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+3) ) );
   1918          break;
   1919       default:
   1920          vpanic("setFlags_MUL(amd64)");
   1921    }
   1922    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(arg1)) ));
   1923    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(arg2)) ));
   1924 }
   1925 
   1926 
   1927 /* -------------- Condition codes. -------------- */
   1928 
   1929 /* Condition codes, using the AMD encoding.  */
   1930 
static const HChar* name_AMD64Condcode ( AMD64Condcode cond )
{
   /* AT&T-style mnemonic suffix for the given condition code; the
      commented alternatives are the equivalent Intel spellings. */
   switch (cond) {
      case AMD64CondO:      return "o";
      case AMD64CondNO:     return "no";
      case AMD64CondB:      return "b";
      case AMD64CondNB:     return "ae"; /*"nb";*/
      case AMD64CondZ:      return "e"; /*"z";*/
      case AMD64CondNZ:     return "ne"; /*"nz";*/
      case AMD64CondBE:     return "be";
      case AMD64CondNBE:    return "a"; /*"nbe";*/
      case AMD64CondS:      return "s";
      case AMD64CondNS:     return "ns";
      case AMD64CondP:      return "p";
      case AMD64CondNP:     return "np";
      case AMD64CondL:      return "l";
      case AMD64CondNL:     return "ge"; /*"nl";*/
      case AMD64CondLE:     return "le";
      case AMD64CondNLE:    return "g"; /*"nle";*/
      case AMD64CondAlways: return "ALWAYS";
      default: vpanic("name_AMD64Condcode");
   }
}
   1954 
   1955 static
   1956 AMD64Condcode positiveIse_AMD64Condcode ( AMD64Condcode  cond,
   1957                                           /*OUT*/Bool*   needInvert )
   1958 {
   1959    vassert(cond >= AMD64CondO && cond <= AMD64CondNLE);
   1960    if (cond & 1) {
   1961       *needInvert = True;
   1962       return cond-1;
   1963    } else {
   1964       *needInvert = False;
   1965       return cond;
   1966    }
   1967 }
   1968 
   1969 
   1970 /* -------------- Helpers for ADD/SUB with carry. -------------- */
   1971 
   1972 /* Given ta1, ta2 and tres, compute tres = ADC(ta1,ta2) and set flags
   1973    appropriately.
   1974 
   1975    Optionally, generate a store for the 'tres' value.  This can either
   1976    be a normal store, or it can be a cas-with-possible-failure style
   1977    store:
   1978 
   1979    if taddr is IRTemp_INVALID, then no store is generated.
   1980 
   1981    if taddr is not IRTemp_INVALID, then a store (using taddr as
   1982    the address) is generated:
   1983 
   1984      if texpVal is IRTemp_INVALID then a normal store is
   1985      generated, and restart_point must be zero (it is irrelevant).
   1986 
   1987      if texpVal is not IRTemp_INVALID then a cas-style store is
   1988      generated.  texpVal is the expected value, restart_point
   1989      is the restart point if the store fails, and texpVal must
   1990      have the same type as tres.
   1991 
   1992 */
static void helper_ADC ( Int sz,
                         IRTemp tres, IRTemp ta1, IRTemp ta2,
                         /* info about optional store: */
                         IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
{
   UInt    thunkOp;
   IRType  ty    = szToITy(sz);
   IRTemp  oldc  = newTemp(Ity_I64);   /* old carry flag, 0 or 1, as I64 */
   IRTemp  oldcn = newTemp(ty);        /* old carry, narrowed to ty */
   IROp    plus  = mkSizedOp(ty, Iop_Add8);
   IROp    xor   = mkSizedOp(ty, Iop_Xor8);

   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);

   /* NOTE(review): restart_point is Addr32 here but casLE takes an
      Addr64 (implicitly zero-widened) -- presumably callers only pass
      guest addresses below 4GB; verify at call sites. */
   switch (sz) {
      case 8:  thunkOp = AMD64G_CC_OP_ADCQ; break;
      case 4:  thunkOp = AMD64G_CC_OP_ADCL; break;
      case 2:  thunkOp = AMD64G_CC_OP_ADCW; break;
      case 1:  thunkOp = AMD64G_CC_OP_ADCB; break;
      default: vassert(0);
   }

   /* oldc = old carry flag, 0 or 1 */
   assign( oldc,  binop(Iop_And64,
                        mk_amd64g_calculate_rflags_c(),
                        mkU64(1)) );

   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );

   /* tres = ta1 + ta2 + old carry */
   assign( tres, binop(plus,
                       binop(plus,mkexpr(ta1),mkexpr(ta2)),
                       mkexpr(oldcn)) );

   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
      start of this function. */
   if (taddr != IRTemp_INVALID) {
      if (texpVal == IRTemp_INVALID) {
         vassert(restart_point == 0);
         storeLE( mkexpr(taddr), mkexpr(tres) );
      } else {
         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
         /* .. and hence 'texpVal' has the same type as 'tres'. */
         casLE( mkexpr(taddr),
                mkexpr(texpVal), mkexpr(tres), restart_point );
      }
   }

   /* Build the flags thunk: DEP2 is ta2 XORed with the old carry, so
      the flags helper can recover both from DEP1/DEP2/NDEP. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(thunkOp) ) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1))  ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
                                                         mkexpr(oldcn)) )) );
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
}
   2046 
   2047 
   2048 /* Given ta1, ta2 and tres, compute tres = SBB(ta1,ta2) and set flags
   2049    appropriately.  As with helper_ADC, possibly generate a store of
   2050    the result -- see comments on helper_ADC for details.
   2051 */
static void helper_SBB ( Int sz,
                         IRTemp tres, IRTemp ta1, IRTemp ta2,
                         /* info about optional store: */
                         IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
{
   UInt    thunkOp;
   IRType  ty    = szToITy(sz);
   IRTemp  oldc  = newTemp(Ity_I64);   /* old carry flag, 0 or 1, as I64 */
   IRTemp  oldcn = newTemp(ty);        /* old carry, narrowed to ty */
   IROp    minus = mkSizedOp(ty, Iop_Sub8);
   IROp    xor   = mkSizedOp(ty, Iop_Xor8);

   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);

   /* NOTE(review): restart_point is Addr32 here but casLE takes an
      Addr64 (implicitly zero-widened) -- presumably callers only pass
      guest addresses below 4GB; verify at call sites. */
   switch (sz) {
      case 8:  thunkOp = AMD64G_CC_OP_SBBQ; break;
      case 4:  thunkOp = AMD64G_CC_OP_SBBL; break;
      case 2:  thunkOp = AMD64G_CC_OP_SBBW; break;
      case 1:  thunkOp = AMD64G_CC_OP_SBBB; break;
      default: vassert(0);
   }

   /* oldc = old carry flag, 0 or 1 */
   assign( oldc, binop(Iop_And64,
                       mk_amd64g_calculate_rflags_c(),
                       mkU64(1)) );

   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );

   /* tres = ta1 - ta2 - old carry */
   assign( tres, binop(minus,
                       binop(minus,mkexpr(ta1),mkexpr(ta2)),
                       mkexpr(oldcn)) );

   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
      start of this function. */
   if (taddr != IRTemp_INVALID) {
      if (texpVal == IRTemp_INVALID) {
         vassert(restart_point == 0);
         storeLE( mkexpr(taddr), mkexpr(tres) );
      } else {
         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
         /* .. and hence 'texpVal' has the same type as 'tres'. */
         casLE( mkexpr(taddr),
                mkexpr(texpVal), mkexpr(tres), restart_point );
      }
   }

   /* Build the flags thunk: DEP2 is ta2 XORed with the old carry, so
      the flags helper can recover both from DEP1/DEP2/NDEP. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(thunkOp) ) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1) )) );
   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
                                                         mkexpr(oldcn)) )) );
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
}
   2105 
   2106 
   2107 /* -------------- Helpers for disassembly printing. -------------- */
   2108 
   2109 static const HChar* nameGrp1 ( Int opc_aux )
   2110 {
   2111    static const HChar* grp1_names[8]
   2112      = { "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" };
   2113    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp1(amd64)");
   2114    return grp1_names[opc_aux];
   2115 }
   2116 
   2117 static const HChar* nameGrp2 ( Int opc_aux )
   2118 {
   2119    static const HChar* grp2_names[8]
   2120      = { "rol", "ror", "rcl", "rcr", "shl", "shr", "shl", "sar" };
   2121    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp2(amd64)");
   2122    return grp2_names[opc_aux];
   2123 }
   2124 
   2125 static const HChar* nameGrp4 ( Int opc_aux )
   2126 {
   2127    static const HChar* grp4_names[8]
   2128      = { "inc", "dec", "???", "???", "???", "???", "???", "???" };
   2129    if (opc_aux < 0 || opc_aux > 1) vpanic("nameGrp4(amd64)");
   2130    return grp4_names[opc_aux];
   2131 }
   2132 
   2133 static const HChar* nameGrp5 ( Int opc_aux )
   2134 {
   2135    static const HChar* grp5_names[8]
   2136      = { "inc", "dec", "call*", "call*", "jmp*", "jmp*", "push", "???" };
   2137    if (opc_aux < 0 || opc_aux > 6) vpanic("nameGrp5(amd64)");
   2138    return grp5_names[opc_aux];
   2139 }
   2140 
   2141 static const HChar* nameGrp8 ( Int opc_aux )
   2142 {
   2143    static const HChar* grp8_names[8]
   2144       = { "???", "???", "???", "???", "bt", "bts", "btr", "btc" };
   2145    if (opc_aux < 4 || opc_aux > 7) vpanic("nameGrp8(amd64)");
   2146    return grp8_names[opc_aux];
   2147 }
   2148 
   2149 //.. static const HChar* nameSReg ( UInt sreg )
   2150 //.. {
   2151 //..    switch (sreg) {
   2152 //..       case R_ES: return "%es";
   2153 //..       case R_CS: return "%cs";
   2154 //..       case R_SS: return "%ss";
   2155 //..       case R_DS: return "%ds";
   2156 //..       case R_FS: return "%fs";
   2157 //..       case R_GS: return "%gs";
   2158 //..       default: vpanic("nameSReg(x86)");
   2159 //..    }
   2160 //.. }
   2161 
   2162 static const HChar* nameMMXReg ( Int mmxreg )
   2163 {
   2164    static const HChar* mmx_names[8]
   2165      = { "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" };
   2166    if (mmxreg < 0 || mmxreg > 7) vpanic("nameMMXReg(amd64,guest)");
   2167    return mmx_names[mmxreg];
   2168 }
   2169 
   2170 static const HChar* nameXMMReg ( Int xmmreg )
   2171 {
   2172    static const HChar* xmm_names[16]
   2173      = { "%xmm0",  "%xmm1",  "%xmm2",  "%xmm3",
   2174          "%xmm4",  "%xmm5",  "%xmm6",  "%xmm7",
   2175          "%xmm8",  "%xmm9",  "%xmm10", "%xmm11",
   2176          "%xmm12", "%xmm13", "%xmm14", "%xmm15" };
   2177    if (xmmreg < 0 || xmmreg > 15) vpanic("nameXMMReg(amd64)");
   2178    return xmm_names[xmmreg];
   2179 }
   2180 
   2181 static const HChar* nameMMXGran ( Int gran )
   2182 {
   2183    switch (gran) {
   2184       case 0: return "b";
   2185       case 1: return "w";
   2186       case 2: return "d";
   2187       case 3: return "q";
   2188       default: vpanic("nameMMXGran(amd64,guest)");
   2189    }
   2190 }
   2191 
   2192 static HChar nameISize ( Int size )
   2193 {
   2194    switch (size) {
   2195       case 8: return 'q';
   2196       case 4: return 'l';
   2197       case 2: return 'w';
   2198       case 1: return 'b';
   2199       default: vpanic("nameISize(amd64)");
   2200    }
   2201 }
   2202 
   2203 static const HChar* nameYMMReg ( Int ymmreg )
   2204 {
   2205    static const HChar* ymm_names[16]
   2206      = { "%ymm0",  "%ymm1",  "%ymm2",  "%ymm3",
   2207          "%ymm4",  "%ymm5",  "%ymm6",  "%ymm7",
   2208          "%ymm8",  "%ymm9",  "%ymm10", "%ymm11",
   2209          "%ymm12", "%ymm13", "%ymm14", "%ymm15" };
   2210    if (ymmreg < 0 || ymmreg > 15) vpanic("nameYMMReg(amd64)");
   2211    return ymm_names[ymmreg];
   2212 }
   2213 
   2214 
   2215 /*------------------------------------------------------------*/
   2216 /*--- JMP helpers                                          ---*/
   2217 /*------------------------------------------------------------*/
   2218 
   2219 static void jmp_lit( /*MOD*/DisResult* dres,
   2220                      IRJumpKind kind, Addr64 d64 )
   2221 {
   2222    vassert(dres->whatNext    == Dis_Continue);
   2223    vassert(dres->len         == 0);
   2224    vassert(dres->continueAt  == 0);
   2225    vassert(dres->jk_StopHere == Ijk_INVALID);
   2226    dres->whatNext    = Dis_StopHere;
   2227    dres->jk_StopHere = kind;
   2228    stmt( IRStmt_Put( OFFB_RIP, mkU64(d64) ) );
   2229 }
   2230 
   2231 static void jmp_treg( /*MOD*/DisResult* dres,
   2232                       IRJumpKind kind, IRTemp t )
   2233 {
   2234    vassert(dres->whatNext    == Dis_Continue);
   2235    vassert(dres->len         == 0);
   2236    vassert(dres->continueAt  == 0);
   2237    vassert(dres->jk_StopHere == Ijk_INVALID);
   2238    dres->whatNext    = Dis_StopHere;
   2239    dres->jk_StopHere = kind;
   2240    stmt( IRStmt_Put( OFFB_RIP, mkexpr(t) ) );
   2241 }
   2242 
static
void jcc_01 ( /*MOD*/DisResult* dres,
              AMD64Condcode cond, Addr64 d64_false, Addr64 d64_true )
{
   /* Generate a two-way conditional branch: side-exit to d64_true if
      'cond' holds, otherwise continue at d64_false.  The condition is
      first "positivised" so the IR exit always tests an even-numbered
      condition code. */
   Bool          invert;
   AMD64Condcode condPos;
   vassert(dres->whatNext    == Dis_Continue);
   vassert(dres->len         == 0);
   vassert(dres->continueAt  == 0);
   vassert(dres->jk_StopHere == Ijk_INVALID);
   dres->whatNext    = Dis_StopHere;
   dres->jk_StopHere = Ijk_Boring;
   condPos = positiveIse_AMD64Condcode ( cond, &invert );
   if (invert) {
      /* cond was odd: exit when the positivised condition holds, i.e.
         when the original condition fails -> exit to d64_false. */
      stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
                         Ijk_Boring,
                         IRConst_U64(d64_false),
                         OFFB_RIP ) );
      stmt( IRStmt_Put( OFFB_RIP, mkU64(d64_true) ) );
   } else {
      /* cond was already positive: exit to d64_true when it holds. */
      stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
                         Ijk_Boring,
                         IRConst_U64(d64_true),
                         OFFB_RIP ) );
      stmt( IRStmt_Put( OFFB_RIP, mkU64(d64_false) ) );
   }
}
   2270 
   2271 /* Let new_rsp be the %rsp value after a call/return.  Let nia be the
   2272    guest address of the next instruction to be executed.
   2273 
   2274    This function generates an AbiHint to say that -128(%rsp)
   2275    .. -1(%rsp) should now be regarded as uninitialised.
   2276 */
static
void make_redzone_AbiHint ( VexAbiInfo* vbi,
                            IRTemp new_rsp, IRTemp nia, const HChar* who )
{
   /* 'who' is used only for optional debug printing. */
   Int szB = vbi->guest_stack_redzone_size;
   vassert(szB >= 0);

   /* A bit of a kludge.  Currently the only ABI we've guested AMD64
      for is ELF.  So just check it's the expected 128 value
      (paranoia). */
   vassert(szB == 128);

   if (0) vex_printf("AbiHint: %s\n", who);
   vassert(typeOfIRTemp(irsb->tyenv, new_rsp) == Ity_I64);
   vassert(typeOfIRTemp(irsb->tyenv, nia) == Ity_I64);
   /* Declare [new_rsp - szB .. new_rsp - 1] as undefined. */
   if (szB > 0)
      stmt( IRStmt_AbiHint(
               binop(Iop_Sub64, mkexpr(new_rsp), mkU64(szB)),
               szB,
               mkexpr(nia)
            ));
}
   2299 
   2300 
   2301 /*------------------------------------------------------------*/
   2302 /*--- Disassembling addressing modes                       ---*/
   2303 /*------------------------------------------------------------*/
   2304 
   2305 static
   2306 const HChar* segRegTxt ( Prefix pfx )
   2307 {
   2308    if (pfx & PFX_CS) return "%cs:";
   2309    if (pfx & PFX_DS) return "%ds:";
   2310    if (pfx & PFX_ES) return "%es:";
   2311    if (pfx & PFX_FS) return "%fs:";
   2312    if (pfx & PFX_GS) return "%gs:";
   2313    if (pfx & PFX_SS) return "%ss:";
   2314    return ""; /* no override */
   2315 }
   2316 
   2317 
   2318 /* 'virtual' is an IRExpr* holding a virtual address.  Convert it to a
   2319    linear address by adding any required segment override as indicated
   2320    by sorb, and also dealing with any address size override
   2321    present. */
static
IRExpr* handleAddrOverrides ( VexAbiInfo* vbi,
                              Prefix pfx, IRExpr* virtual )
{
   /* --- segment overrides --- */
   if (pfx & PFX_FS) {
      if (vbi->guest_amd64_assume_fs_is_zero) {
         /* Note that this is a linux-kernel specific hack that relies
            on the assumption that %fs is always zero. */
         /* return virtual + guest_FS_ZERO. */
         virtual = binop(Iop_Add64, virtual,
                                    IRExpr_Get(OFFB_FS_ZERO, Ity_I64));
      } else {
         /* Any other %fs usage is not supported. */
         unimplemented("amd64 %fs segment override");
      }
   }

   if (pfx & PFX_GS) {
      if (vbi->guest_amd64_assume_gs_is_0x60) {
         /* Note that this is a darwin-kernel specific hack that relies
            on the assumption that %gs is always 0x60. */
         /* return virtual + guest_GS_0x60. */
         virtual = binop(Iop_Add64, virtual,
                                    IRExpr_Get(OFFB_GS_0x60, Ity_I64));
      } else {
         /* Any other %gs usage is not supported. */
         unimplemented("amd64 %gs segment override");
      }
   }

   /* cs, ds, es and ss are simply ignored in 64-bit mode. */

   /* --- address size override --- */
   /* 0x67 prefix: truncate the address to 32 bits, then zero-extend
      back to 64. */
   if (haveASO(pfx))
      virtual = unop(Iop_32Uto64, unop(Iop_64to32, virtual));

   return virtual;
}
   2359 
   2360 //.. {
   2361 //..    Int    sreg;
   2362 //..    IRType hWordTy;
   2363 //..    IRTemp ldt_ptr, gdt_ptr, seg_selector, r64;
   2364 //..
   2365 //..    if (sorb == 0)
   2366 //..       /* the common case - no override */
   2367 //..       return virtual;
   2368 //..
   2369 //..    switch (sorb) {
   2370 //..       case 0x3E: sreg = R_DS; break;
   2371 //..       case 0x26: sreg = R_ES; break;
   2372 //..       case 0x64: sreg = R_FS; break;
   2373 //..       case 0x65: sreg = R_GS; break;
   2374 //..       default: vpanic("handleAddrOverrides(x86,guest)");
   2375 //..    }
   2376 //..
   2377 //..    hWordTy = sizeof(HWord)==4 ? Ity_I32 : Ity_I64;
   2378 //..
   2379 //..    seg_selector = newTemp(Ity_I32);
   2380 //..    ldt_ptr      = newTemp(hWordTy);
   2381 //..    gdt_ptr      = newTemp(hWordTy);
   2382 //..    r64          = newTemp(Ity_I64);
   2383 //..
   2384 //..    assign( seg_selector, unop(Iop_16Uto32, getSReg(sreg)) );
   2385 //..    assign( ldt_ptr, IRExpr_Get( OFFB_LDT, hWordTy ));
   2386 //..    assign( gdt_ptr, IRExpr_Get( OFFB_GDT, hWordTy ));
   2387 //..
   2388 //..    /*
   2389 //..    Call this to do the translation and limit checks:
   2390 //..    ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
   2391 //..                                  UInt seg_selector, UInt virtual_addr )
   2392 //..    */
   2393 //..    assign(
   2394 //..       r64,
   2395 //..       mkIRExprCCall(
   2396 //..          Ity_I64,
   2397 //..          0/*regparms*/,
   2398 //..          "x86g_use_seg_selector",
   2399 //..          &x86g_use_seg_selector,
   2400 //..          mkIRExprVec_4( mkexpr(ldt_ptr), mkexpr(gdt_ptr),
   2401 //..                         mkexpr(seg_selector), virtual)
   2402 //..       )
   2403 //..    );
   2404 //..
   2405 //..    /* If the high 32 of the result are non-zero, there was a
   2406 //..       failure in address translation.  In which case, make a
   2407 //..       quick exit.
   2408 //..    */
   2409 //..    stmt(
   2410 //..       IRStmt_Exit(
   2411 //..          binop(Iop_CmpNE32, unop(Iop_64HIto32, mkexpr(r64)), mkU32(0)),
   2412 //..          Ijk_MapFail,
   2413 //..          IRConst_U32( guest_eip_curr_instr )
   2414 //..       )
   2415 //..    );
   2416 //..
   2417 //..    /* otherwise, here's the translated result. */
   2418 //..    return unop(Iop_64to32, mkexpr(r64));
   2419 //.. }
   2420 
   2421 
   2422 /* Generate IR to calculate an address indicated by a ModRM and
   2423    following SIB bytes.  The expression, and the number of bytes in
   2424    the address mode, are returned (the latter in *len).  Note that
   2425    this fn should not be called if the R/M part of the address denotes
   a register instead of memory.  The text of the addressing mode is
   placed in buf.
   2428 
   2429    The computed address is stored in a new tempreg, and the
   2430    identity of the tempreg is returned.
   2431 
   2432    extra_bytes holds the number of bytes after the amode, as supplied
   2433    by the caller.  This is needed to make sense of %rip-relative
   2434    addresses.  Note that the value that *len is set to is only the
   2435    length of the amode itself and does not include the value supplied
   2436    in extra_bytes.
   2437  */
   2438 
   2439 static IRTemp disAMode_copy2tmp ( IRExpr* addr64 )
   2440 {
   2441    IRTemp tmp = newTemp(Ity_I64);
   2442    assign( tmp, addr64 );
   2443    return tmp;
   2444 }
   2445 
static
IRTemp disAMode ( /*OUT*/Int* len,
                  VexAbiInfo* vbi, Prefix pfx, Long delta,
                  /*OUT*/HChar* buf, Int extra_bytes )
{
   UChar mod_reg_rm = getUChar(delta);
   delta++;

   buf[0] = (UChar)0;
   vassert(extra_bytes >= 0 && extra_bytes < 10);

   /* squeeze out the reg field from mod_reg_rm, since a 256-entry
      jump table seems a bit excessive.
   */
   mod_reg_rm &= 0xC7;                         /* is now XX000YYY */
   mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
                                               /* is now XX0XXYYY */
   mod_reg_rm &= 0x1F;                         /* is now 000XXYYY */
   /* The switch value is therefore (mod << 3) | rm. */
   switch (mod_reg_rm) {

      /* REX.B==0: (%rax) .. (%rdi), not including (%rsp) or (%rbp).
         REX.B==1: (%r8)  .. (%r15), not including (%r12) or (%r13).
      */
      case 0x00: case 0x01: case 0x02: case 0x03:
      /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
         { UChar rm = toUChar(mod_reg_rm & 7);
           DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,rm));
           *len = 1;
           return disAMode_copy2tmp(
                  handleAddrOverrides(vbi, pfx, getIRegRexB(8,pfx,rm)));
         }

      /* REX.B==0: d8(%rax) ... d8(%rdi), not including d8(%rsp)
         REX.B==1: d8(%r8)  ... d8(%r15), not including d8(%r12)
      */
      case 0x08: case 0x09: case 0x0A: case 0x0B:
      /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
         { UChar rm = toUChar(mod_reg_rm & 7);
           Long d   = getSDisp8(delta);
           if (d == 0) {
              /* print "(%reg)" rather than the noisier "0(%reg)" */
              DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,rm));
           } else {
              DIS(buf, "%s%lld(%s)", segRegTxt(pfx), d, nameIRegRexB(8,pfx,rm));
           }
           *len = 2;
           return disAMode_copy2tmp(
                  handleAddrOverrides(vbi, pfx,
                     binop(Iop_Add64,getIRegRexB(8,pfx,rm),mkU64(d))));
         }

      /* REX.B==0: d32(%rax) ... d32(%rdi), not including d32(%rsp)
         REX.B==1: d32(%r8)  ... d32(%r15), not including d32(%r12)
      */
      case 0x10: case 0x11: case 0x12: case 0x13:
      /* ! 14 */ case 0x15: case 0x16: case 0x17:
         { UChar rm = toUChar(mod_reg_rm & 7);
           Long  d  = getSDisp32(delta);
           DIS(buf, "%s%lld(%s)", segRegTxt(pfx), d, nameIRegRexB(8,pfx,rm));
           *len = 5;
           return disAMode_copy2tmp(
                  handleAddrOverrides(vbi, pfx,
                     binop(Iop_Add64,getIRegRexB(8,pfx,rm),mkU64(d))));
         }

      /* REX.B==0: a register, %rax .. %rdi.  This shouldn't happen. */
      /* REX.B==1: a register, %r8  .. %r16.  This shouldn't happen. */
      case 0x18: case 0x19: case 0x1A: case 0x1B:
      case 0x1C: case 0x1D: case 0x1E: case 0x1F:
         /* mod == 3 denotes a register operand; callers must not pass
            such a modRM byte to this function. */
         vpanic("disAMode(amd64): not an addr!");

      /* RIP + disp32.  This assumes that guest_RIP_curr_instr is set
         correctly at the start of handling each instruction. */
      case 0x05:
         { Long d = getSDisp32(delta);
           *len = 5;
           DIS(buf, "%s%lld(%%rip)", segRegTxt(pfx), d);
           /* We need to know the next instruction's start address.
              Try and figure out what it is, record the guess, and ask
              the top-level driver logic (bbToIR_AMD64) to check we
              guessed right, after the instruction is completely
              decoded. */
           guest_RIP_next_mustcheck = True;
           /* = bbstart + bytes decoded so far + the d32 itself
              + the caller-supplied bytes following the amode. */
           guest_RIP_next_assumed = guest_RIP_bbstart
                                    + delta+4 + extra_bytes;
           return disAMode_copy2tmp(
                     handleAddrOverrides(vbi, pfx,
                        binop(Iop_Add64, mkU64(guest_RIP_next_assumed),
                                         mkU64(d))));
         }

      case 0x04: {
         /* SIB, with no displacement.  Special cases:
            -- %rsp cannot act as an index value.
               If index_r indicates %rsp, zero is used for the index.
            -- when mod is zero and base indicates RBP or R13, base is
               instead a 32-bit sign-extended literal.
            It's all madness, I tell you.  Extract %index, %base and
            scale from the SIB byte.  The value denoted is then:
               | %index == %RSP && (%base == %RBP || %base == %R13)
               = d32 following SIB byte
               | %index == %RSP && !(%base == %RBP || %base == %R13)
               = %base
               | %index != %RSP && (%base == %RBP || %base == %R13)
               = d32 following SIB byte + (%index << scale)
               | %index != %RSP && !(%base == %RBP || %base == %R13)
               = %base + (%index << scale)
         */
         UChar sib     = getUChar(delta);
         UChar scale   = toUChar((sib >> 6) & 3);
         UChar index_r = toUChar((sib >> 3) & 7);
         UChar base_r  = toUChar(sib & 7);
         /* correct since #(R13) == 8 + #(RBP) */
         Bool  base_is_BPor13 = toBool(base_r == R_RBP);
         /* Only RSP proper (REX.X clear) is excluded as an index;
            R12 (same low bits, REX.X set) is a valid index. */
         Bool  index_is_SP    = toBool(index_r == R_RSP && 0==getRexX(pfx));
         delta++;

         if ((!index_is_SP) && (!base_is_BPor13)) {
            if (scale == 0) {
               DIS(buf, "%s(%s,%s)", segRegTxt(pfx),
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r));
            } else {
               DIS(buf, "%s(%s,%s,%d)", segRegTxt(pfx),
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r), 1<<scale);
            }
            *len = 2;
            return
               disAMode_copy2tmp(
               handleAddrOverrides(vbi, pfx,
                  binop(Iop_Add64,
                        getIRegRexB(8,pfx,base_r),
                        binop(Iop_Shl64, getIReg64rexX(pfx,index_r),
                              mkU8(scale)))));
         }

         if ((!index_is_SP) && base_is_BPor13) {
            Long d = getSDisp32(delta);
            DIS(buf, "%s%lld(,%s,%d)", segRegTxt(pfx), d,
                      nameIReg64rexX(pfx,index_r), 1<<scale);
            *len = 6;
            return
               disAMode_copy2tmp(
               handleAddrOverrides(vbi, pfx,
                  binop(Iop_Add64,
                        binop(Iop_Shl64, getIReg64rexX(pfx,index_r),
                                         mkU8(scale)),
                        mkU64(d))));
         }

         if (index_is_SP && (!base_is_BPor13)) {
            /* index suppressed: address is just the base register */
            DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,base_r));
            *len = 2;
            return disAMode_copy2tmp(
                   handleAddrOverrides(vbi, pfx, getIRegRexB(8,pfx,base_r)));
         }

         if (index_is_SP && base_is_BPor13) {
            /* both suppressed: address is the d32 literal alone */
            Long d = getSDisp32(delta);
            DIS(buf, "%s%lld", segRegTxt(pfx), d);
            *len = 6;
            return disAMode_copy2tmp(
                   handleAddrOverrides(vbi, pfx, mkU64(d)));
         }

         vassert(0);
      }

      /* SIB, with 8-bit displacement.  Special cases:
         -- %esp cannot act as an index value.
            If index_r indicates %esp, zero is used for the index.
         Denoted value is:
            | %index == %ESP
            = d8 + %base
            | %index != %ESP
            = d8 + %base + (%index << scale)
      */
      case 0x0C: {
         UChar sib     = getUChar(delta);
         UChar scale   = toUChar((sib >> 6) & 3);
         UChar index_r = toUChar((sib >> 3) & 7);
         UChar base_r  = toUChar(sib & 7);
         Long d        = getSDisp8(delta+1);

         if (index_r == R_RSP && 0==getRexX(pfx)) {
            /* index suppressed: d8 + base only */
            DIS(buf, "%s%lld(%s)", segRegTxt(pfx),
                                   d, nameIRegRexB(8,pfx,base_r));
            *len = 3;
            return disAMode_copy2tmp(
                   handleAddrOverrides(vbi, pfx,
                      binop(Iop_Add64, getIRegRexB(8,pfx,base_r), mkU64(d)) ));
         } else {
            if (scale == 0) {
               DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r));
            } else {
               DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r), 1<<scale);
            }
            *len = 3;
            return
                disAMode_copy2tmp(
                handleAddrOverrides(vbi, pfx,
                  binop(Iop_Add64,
                        binop(Iop_Add64,
                              getIRegRexB(8,pfx,base_r),
                              binop(Iop_Shl64,
                                    getIReg64rexX(pfx,index_r), mkU8(scale))),
                        mkU64(d))));
         }
         vassert(0); /*NOTREACHED*/
      }

      /* SIB, with 32-bit displacement.  Special cases:
         -- %rsp cannot act as an index value.
            If index_r indicates %rsp, zero is used for the index.
         Denoted value is:
            | %index == %RSP
            = d32 + %base
            | %index != %RSP
            = d32 + %base + (%index << scale)
      */
      case 0x14: {
         UChar sib     = getUChar(delta);
         UChar scale   = toUChar((sib >> 6) & 3);
         UChar index_r = toUChar((sib >> 3) & 7);
         UChar base_r  = toUChar(sib & 7);
         Long d        = getSDisp32(delta+1);

         if (index_r == R_RSP && 0==getRexX(pfx)) {
            /* index suppressed: d32 + base only */
            DIS(buf, "%s%lld(%s)", segRegTxt(pfx),
                                   d, nameIRegRexB(8,pfx,base_r));
            *len = 6;
            return disAMode_copy2tmp(
                   handleAddrOverrides(vbi, pfx,
                      binop(Iop_Add64, getIRegRexB(8,pfx,base_r), mkU64(d)) ));
         } else {
            if (scale == 0) {
               DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r));
            } else {
               DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r), 1<<scale);
            }
            *len = 6;
            return
                disAMode_copy2tmp(
                handleAddrOverrides(vbi, pfx,
                  binop(Iop_Add64,
                        binop(Iop_Add64,
                              getIRegRexB(8,pfx,base_r),
                              binop(Iop_Shl64,
                                    getIReg64rexX(pfx,index_r), mkU8(scale))),
                        mkU64(d))));
         }
         vassert(0); /*NOTREACHED*/
      }

      default:
         vpanic("disAMode(amd64)");
         return 0; /*notreached*/
   }
}
   2713 
   2714 
   2715 /* Similarly for VSIB addressing.  This returns just the addend,
   2716    and fills in *rI and *vscale with the register number of the vector
   2717    index and its multiplicand.  */
static
IRTemp disAVSIBMode ( /*OUT*/Int* len,
                      VexAbiInfo* vbi, Prefix pfx, Long delta,
                      /*OUT*/HChar* buf, /*OUT*/UInt* rI,
                      IRType ty, /*OUT*/Int* vscale )
{
   UChar mod_reg_rm = getUChar(delta);
   const HChar *vindex;

   *len = 0;
   *rI = 0;
   *vscale = 0;
   buf[0] = (UChar)0;
   /* VSIB addressing requires a SIB byte (rm == 4) and a memory
      operand; anything else is rejected. */
   if ((mod_reg_rm & 7) != 4 || epartIsReg(mod_reg_rm))
      return IRTemp_INVALID;

   UChar sib     = getUChar(delta+1);
   UChar scale   = toUChar((sib >> 6) & 3);
   UChar index_r = toUChar((sib >> 3) & 7);
   UChar base_r  = toUChar(sib & 7);
   Long  d       = 0;
   /* correct since #(R13) == 8 + #(RBP) */
   Bool  base_is_BPor13 = toBool(base_r == R_RBP);
   delta += 2;
   *len = 2;

   /* The vector index register number includes REX.X as bit 3. */
   *rI = index_r | (getRexX(pfx) << 3);
   if (ty == Ity_V128)
      vindex = nameXMMReg(*rI);
   else
      vindex = nameYMMReg(*rI);
   *vscale = 1<<scale;

   switch (mod_reg_rm >> 6) {
   case 0:
      if (base_is_BPor13) {
         /* mod == 0 with base RBP/R13: no base register at all; the
            addend is just the following d32. */
         d = getSDisp32(delta);
         *len += 4;
         if (scale == 0) {
            DIS(buf, "%s%lld(,%s)", segRegTxt(pfx), d, vindex);
         } else {
            DIS(buf, "%s%lld(,%s,%d)", segRegTxt(pfx), d, vindex, 1<<scale);
         }
         return disAMode_copy2tmp( mkU64(d) );
      } else {
         if (scale == 0) {
            DIS(buf, "%s(%s,%s)", segRegTxt(pfx),
                     nameIRegRexB(8,pfx,base_r), vindex);
         } else {
            DIS(buf, "%s(%s,%s,%d)", segRegTxt(pfx),
                     nameIRegRexB(8,pfx,base_r), vindex, 1<<scale);
         }
      }
      break;
   case 1:
      /* d8 displacement; printing is shared with the d32 case. */
      d = getSDisp8(delta);
      *len += 1;
      goto have_disp;
   case 2:
      d = getSDisp32(delta);
      *len += 4;
   have_disp:
      if (scale == 0) {
         DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
                  nameIRegRexB(8,pfx,base_r), vindex);
      } else {
         DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
                  nameIRegRexB(8,pfx,base_r), vindex, 1<<scale);
      }
      break;
   }

   /* Addend is base (+ displacement, if nonzero).  NOTE(review):
      unlike disAMode, handleAddrOverrides is not applied here -- the
      caller presumably deals with any segment override; confirm. */
   if (!d)
      return disAMode_copy2tmp( getIRegRexB(8,pfx,base_r) );
   return disAMode_copy2tmp( binop(Iop_Add64, getIRegRexB(8,pfx,base_r),
                                   mkU64(d)) );
}
   2795 
   2796 
   2797 /* Figure out the number of (insn-stream) bytes constituting the amode
   2798    beginning at delta.  Is useful for getting hold of literals beyond
   2799    the end of the amode before it has been disassembled.  */
   2800 
   2801 static UInt lengthAMode ( Prefix pfx, Long delta )
   2802 {
   2803    UChar mod_reg_rm = getUChar(delta);
   2804    delta++;
   2805 
   2806    /* squeeze out the reg field from mod_reg_rm, since a 256-entry
   2807       jump table seems a bit excessive.
   2808    */
   2809    mod_reg_rm &= 0xC7;                         /* is now XX000YYY */
   2810    mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
   2811                                                /* is now XX0XXYYY */
   2812    mod_reg_rm &= 0x1F;                         /* is now 000XXYYY */
   2813    switch (mod_reg_rm) {
   2814 
   2815       /* REX.B==0: (%rax) .. (%rdi), not including (%rsp) or (%rbp).
   2816          REX.B==1: (%r8)  .. (%r15), not including (%r12) or (%r13).
   2817       */
   2818       case 0x00: case 0x01: case 0x02: case 0x03:
   2819       /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
   2820          return 1;
   2821 
   2822       /* REX.B==0: d8(%rax) ... d8(%rdi), not including d8(%rsp)
   2823          REX.B==1: d8(%r8)  ... d8(%r15), not including d8(%r12)
   2824       */
   2825       case 0x08: case 0x09: case 0x0A: case 0x0B:
   2826       /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
   2827          return 2;
   2828 
   2829       /* REX.B==0: d32(%rax) ... d32(%rdi), not including d32(%rsp)
   2830          REX.B==1: d32(%r8)  ... d32(%r15), not including d32(%r12)
   2831       */
   2832       case 0x10: case 0x11: case 0x12: case 0x13:
   2833       /* ! 14 */ case 0x15: case 0x16: case 0x17:
   2834          return 5;
   2835 
   2836       /* REX.B==0: a register, %rax .. %rdi.  This shouldn't happen. */
   2837       /* REX.B==1: a register, %r8  .. %r16.  This shouldn't happen. */
   2838       /* Not an address, but still handled. */
   2839       case 0x18: case 0x19: case 0x1A: case 0x1B:
   2840       case 0x1C: case 0x1D: case 0x1E: case 0x1F:
   2841          return 1;
   2842 
   2843       /* RIP + disp32. */
   2844       case 0x05:
   2845          return 5;
   2846 
   2847       case 0x04: {
   2848          /* SIB, with no displacement. */
   2849          UChar sib     = getUChar(delta);
   2850          UChar base_r  = toUChar(sib & 7);
   2851          /* correct since #(R13) == 8 + #(RBP) */
   2852          Bool  base_is_BPor13 = toBool(base_r == R_RBP);
   2853 
   2854          if (base_is_BPor13) {
   2855             return 6;
   2856          } else {
   2857             return 2;
   2858          }
   2859       }
   2860 
   2861       /* SIB, with 8-bit displacement. */
   2862       case 0x0C:
   2863          return 3;
   2864 
   2865       /* SIB, with 32-bit displacement. */
   2866       case 0x14:
   2867          return 6;
   2868 
   2869       default:
   2870          vpanic("lengthAMode(amd64)");
   2871          return 0; /*notreached*/
   2872    }
   2873 }
   2874 
   2875 
   2876 /*------------------------------------------------------------*/
   2877 /*--- Disassembling common idioms                          ---*/
   2878 /*------------------------------------------------------------*/
   2879 
   2880 /* Handle binary integer instructions of the form
   2881       op E, G  meaning
   2882       op reg-or-mem, reg
   2883    Is passed the a ptr to the modRM byte, the actual operation, and the
   2884    data size.  Returns the address advanced completely over this
   2885    instruction.
   2886 
   2887    E(src) is reg-or-mem
   2888    G(dst) is reg.
   2889 
   2890    If E is reg, -->    GET %G,  tmp
   2891                        OP %E,   tmp
   2892                        PUT tmp, %G
   2893 
   2894    If E is mem and OP is not reversible,
   2895                 -->    (getAddr E) -> tmpa
   2896                        LD (tmpa), tmpa
   2897                        GET %G, tmp2
   2898                        OP tmpa, tmp2
   2899                        PUT tmp2, %G
   2900 
   2901    If E is mem and OP is reversible
   2902                 -->    (getAddr E) -> tmpa
   2903                        LD (tmpa), tmpa
   2904                        OP %G, tmpa
   2905                        PUT tmpa, %G
   2906 */
static
ULong dis_op2_E_G ( VexAbiInfo* vbi,
                    Prefix      pfx,
                    Bool        addSubCarry,
                    IROp        op8,
                    Bool        keep,
                    Int         size,
                    Long        delta0,
                    const HChar* t_amd64opc )
{
   HChar   dis_buf[50];
   Int     len;
   IRType  ty   = szToITy(size);
   IRTemp  dst1 = newTemp(ty);          /* the computed result */
   IRTemp  src  = newTemp(ty);          /* the E (reg-or-mem) operand */
   IRTemp  dst0 = newTemp(ty);          /* the original G value */
   UChar   rm   = getUChar(delta0);
   IRTemp  addr = IRTemp_INVALID;

   /* addSubCarry == True indicates the intended operation is
      add-with-carry or subtract-with-borrow. */
   if (addSubCarry) {
      vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
      vassert(keep);
   }

   if (epartIsReg(rm)) {
      /* Specially handle XOR reg,reg, because that doesn't really
         depend on reg, and doing the obvious thing potentially
         generates a spurious value check failure due to the bogus
         dependency. */
      if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
          && offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
         /* deliberately-disabled debug printout */
         if (False && op8 == Iop_Sub8)
            vex_printf("vex amd64->IR: sbb %%r,%%r optimisation(1)\n");
         /* Zero the register first; dst0/src below then read back
            zero, breaking the false dependency on the old value. */
         putIRegG(size,pfx,rm, mkU(ty,0));
      }

      assign( dst0, getIRegG(size,pfx,rm) );
      assign( src,  getIRegE(size,pfx,rm) );

      if (addSubCarry && op8 == Iop_Add8) {
         /* ADC: result and flags come from the helper; no store. */
         helper_ADC( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         /* SBB */
         helper_SBB( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else {
         assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
         if (keep)   /* if !keep, the result is discarded (flags only) */
            putIRegG(size, pfx, rm, mkexpr(dst1));
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          nameIRegE(size,pfx,rm),
                          nameIRegG(size,pfx,rm));
      return 1+delta0;
   } else {
      /* E refers to memory */
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign( dst0, getIRegG(size,pfx,rm) );
      assign( src,  loadLE(szToITy(size), mkexpr(addr)) );

      if (addSubCarry && op8 == Iop_Add8) {
         helper_ADC( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         helper_SBB( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else {
         assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
         if (keep)
            putIRegG(size, pfx, rm, mkexpr(dst1));
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          dis_buf, nameIRegG(size, pfx, rm));
      return len+delta0;
   }
}
   3001 
   3002 
   3003 
   3004 /* Handle binary integer instructions of the form
   3005       op G, E  meaning
   3006       op reg, reg-or-mem
   3007    Is passed the a ptr to the modRM byte, the actual operation, and the
   3008    data size.  Returns the address advanced completely over this
   3009    instruction.
   3010 
   3011    G(src) is reg.
   3012    E(dst) is reg-or-mem
   3013 
   3014    If E is reg, -->    GET %E,  tmp
   3015                        OP %G,   tmp
   3016                        PUT tmp, %E
   3017 
   3018    If E is mem, -->    (getAddr E) -> tmpa
   3019                        LD (tmpa), tmpv
   3020                        OP %G, tmpv
   3021                        ST tmpv, (tmpa)
   3022 */
static
ULong dis_op2_G_E ( VexAbiInfo* vbi,
                    Prefix      pfx,
                    Bool        addSubCarry,
                    IROp        op8,
                    Bool        keep,
                    Int         size,
                    Long        delta0,
                    const HChar* t_amd64opc )
{
   HChar   dis_buf[50];
   Int     len;
   IRType  ty   = szToITy(size);
   IRTemp  dst1 = newTemp(ty);          /* the computed result */
   IRTemp  src  = newTemp(ty);          /* the G (reg) operand */
   IRTemp  dst0 = newTemp(ty);          /* the original E value */
   UChar   rm   = getUChar(delta0);
   IRTemp  addr = IRTemp_INVALID;

   /* addSubCarry == True indicates the intended operation is
      add-with-carry or subtract-with-borrow. */
   if (addSubCarry) {
      vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
      vassert(keep);
   }

   if (epartIsReg(rm)) {
      /* Specially handle XOR reg,reg, because that doesn't really
         depend on reg, and doing the obvious thing potentially
         generates a spurious value check failure due to the bogus
         dependency.  Ditto SBB reg,reg. */
      if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
          && offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
         /* Zero E first; dst0/src below then read back zero, breaking
            the false dependency on the old value. */
         putIRegE(size,pfx,rm, mkU(ty,0));
      }

      assign(dst0, getIRegE(size,pfx,rm));
      assign(src,  getIRegG(size,pfx,rm));

      if (addSubCarry && op8 == Iop_Add8) {
         helper_ADC( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegE(size, pfx, rm, mkexpr(dst1));
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         helper_SBB( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegE(size, pfx, rm, mkexpr(dst1));
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
         if (keep)   /* if !keep, the result is discarded (flags only) */
            putIRegE(size, pfx, rm, mkexpr(dst1));
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          nameIRegG(size,pfx,rm),
                          nameIRegE(size,pfx,rm));
      return 1+delta0;
   }

   /* E refers to memory */
   {
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign(dst0, loadLE(ty,mkexpr(addr)));
      assign(src,  getIRegG(size,pfx,rm));

      if (addSubCarry && op8 == Iop_Add8) {
         if (haveLOCK(pfx)) {
            /* cas-style store: the expected old value and the current
               insn address are passed so a failing CAS can be handled
               (presumably by restarting the insn -- see helper_ADC). */
            helper_ADC( size, dst1, dst0, src,
                        /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_ADC( size, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         if (haveLOCK(pfx)) {
            /* cas-style store */
            helper_SBB( size, dst1, dst0, src,
                        /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_SBB( size, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (keep) {
            if (haveLOCK(pfx)) {
               if (0) vex_printf("locked case\n" );
               /* LOCK prefix: store via compare-and-swap against the
                  value read above, rather than a plain store. */
               casLE( mkexpr(addr),
                      mkexpr(dst0)/*expval*/,
                      mkexpr(dst1)/*newval*/, guest_RIP_curr_instr );
            } else {
               if (0) vex_printf("nonlocked case\n");
               storeLE(mkexpr(addr), mkexpr(dst1));
            }
         }
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          nameIRegG(size,pfx,rm), dis_buf);
      return len+delta0;
   }
}
   3138 
   3139 
   3140 /* Handle move instructions of the form
   3141       mov E, G  meaning
   3142       mov reg-or-mem, reg
   3143    Is passed the a ptr to the modRM byte, and the data size.  Returns
   3144    the address advanced completely over this instruction.
   3145 
   3146    E(src) is reg-or-mem
   3147    G(dst) is reg.
   3148 
   3149    If E is reg, -->    GET %E,  tmpv
   3150                        PUT tmpv, %G
   3151 
   3152    If E is mem  -->    (getAddr E) -> tmpa
   3153                        LD (tmpa), tmpb
   3154                        PUT tmpb, %G
   3155 */
   3156 static
   3157 ULong dis_mov_E_G ( VexAbiInfo* vbi,
   3158                     Prefix      pfx,
   3159                     Int         size,
   3160                     Long        delta0 )
   3161 {
   3162    Int len;
   3163    UChar rm = getUChar(delta0);
   3164    HChar dis_buf[50];
   3165 
   3166    if (epartIsReg(rm)) {
   3167       putIRegG(size, pfx, rm, getIRegE(size, pfx, rm));
   3168       DIP("mov%c %s,%s\n", nameISize(size),
   3169                            nameIRegE(size,pfx,rm),
   3170                            nameIRegG(size,pfx,rm));
   3171       return 1+delta0;
   3172    }
   3173 
   3174    /* E refers to memory */
   3175    {
   3176       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   3177       putIRegG(size, pfx, rm, loadLE(szToITy(size), mkexpr(addr)));
   3178       DIP("mov%c %s,%s\n", nameISize(size),
   3179                            dis_buf,
   3180                            nameIRegG(size,pfx,rm));
   3181       return delta0+len;
   3182    }
   3183 }
   3184 
   3185 
   3186 /* Handle move instructions of the form
   3187       mov G, E  meaning
   3188       mov reg, reg-or-mem
   Is passed a ptr to the modRM byte, and the data size.  Returns
   3190    the address advanced completely over this instruction.
   3191    We have to decide here whether F2 or F3 are acceptable.  F2 never is.
   3192 
   3193    G(src) is reg.
   3194    E(dst) is reg-or-mem
   3195 
   3196    If E is reg, -->    GET %G,  tmp
   3197                        PUT tmp, %E
   3198 
   3199    If E is mem, -->    (getAddr E) -> tmpa
   3200                        GET %G, tmpv
   3201                        ST tmpv, (tmpa)
   3202 */
   3203 static
   3204 ULong dis_mov_G_E ( VexAbiInfo*  vbi,
   3205                     Prefix       pfx,
   3206                     Int          size,
   3207                     Long         delta0,
   3208                     /*OUT*/Bool* ok )
   3209 {
   3210    Int   len;
   3211    UChar rm = getUChar(delta0);
   3212    HChar dis_buf[50];
   3213 
   3214    *ok = True;
   3215 
   3216    if (epartIsReg(rm)) {
   3217       if (haveF2orF3(pfx)) { *ok = False; return delta0; }
   3218       putIRegE(size, pfx, rm, getIRegG(size, pfx, rm));
   3219       DIP("mov%c %s,%s\n", nameISize(size),
   3220                            nameIRegG(size,pfx,rm),
   3221                            nameIRegE(size,pfx,rm));
   3222       return 1+delta0;
   3223    }
   3224 
   3225    /* E refers to memory */
   3226    {
   3227       if (haveF2(pfx)) { *ok = False; return delta0; }
   3228       /* F3(XRELEASE) is acceptable, though. */
   3229       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   3230       storeLE( mkexpr(addr), getIRegG(size, pfx, rm) );
   3231       DIP("mov%c %s,%s\n", nameISize(size),
   3232                            nameIRegG(size,pfx,rm),
   3233                            dis_buf);
   3234       return len+delta0;
   3235    }
   3236 }
   3237 
   3238 
   3239 /* op $immediate, AL/AX/EAX/RAX. */
static
ULong dis_op_imm_A ( Int    size,
                     Bool   carrying,
                     IROp   op8,
                     Bool   keep,
                     Long   delta,
                     const HChar* t_amd64opc )
{
   /* Disassemble "op $imm, rAX" forms against the accumulator.  'op8'
      names the operation as its 8-bit IR variant; 'carrying' selects
      the ADC/SBB helpers; 'keep' is False for flag-only ops (CMP),
      in which case rAX is not written back.  't_amd64opc' is the
      mnemonic used for tracing.  Returns delta advanced past the
      immediate. */
   Int    size4 = imin(size,4);   /* immediate is at most 4 bytes, sign-extended */
   IRType ty    = szToITy(size);
   IRTemp dst0  = newTemp(ty);    /* old rAX value */
   IRTemp src   = newTemp(ty);    /* the immediate */
   IRTemp dst1  = newTemp(ty);    /* the result */
   Long  lit    = getSDisp(size4,delta);
   assign(dst0, getIRegRAX(size));
   assign(src,  mkU(ty,lit & mkSizeMask(size)));

   if (isAddSub(op8) && !carrying) {
      /* Plain ADD/SUB: flags depend on both operands. */
      assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
      setFlags_DEP1_DEP2(op8, dst0, src, ty);
   }
   else
   if (isLogic(op8)) {
      /* AND/OR/XOR: flags depend only on the result. */
      vassert(!carrying);
      assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
      setFlags_DEP1(op8, dst1, ty);
   }
   else
   if (op8 == Iop_Add8 && carrying) {
      /* ADC: helper computes both the result and the flag thunk. */
      helper_ADC( size, dst1, dst0, src,
                  /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   }
   else
   if (op8 == Iop_Sub8 && carrying) {
      /* SBB: as ADC, but subtract-with-borrow. */
      helper_SBB( size, dst1, dst0, src,
                  /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   }
   else
      vpanic("dis_op_imm_A(amd64,guest)");

   /* Flag-only ops (CMP) skip the writeback. */
   if (keep)
      putIRegRAX(size, mkexpr(dst1));

   DIP("%s%c $%lld, %s\n", t_amd64opc, nameISize(size),
                           lit, nameIRegRAX(size));
   return delta+size4;
}
   3287 
   3288 
   3289 /* Sign- and Zero-extending moves. */
   3290 static
   3291 ULong dis_movx_E_G ( VexAbiInfo* vbi,
   3292                      Prefix pfx,
   3293                      Long delta, Int szs, Int szd, Bool sign_extend )
   3294 {
   3295    UChar rm = getUChar(delta);
   3296    if (epartIsReg(rm)) {
   3297       putIRegG(szd, pfx, rm,
   3298                     doScalarWidening(
   3299                        szs,szd,sign_extend,
   3300                        getIRegE(szs,pfx,rm)));
   3301       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
   3302                                nameISize(szs),
   3303                                nameISize(szd),
   3304                                nameIRegE(szs,pfx,rm),
   3305                                nameIRegG(szd,pfx,rm));
   3306       return 1+delta;
   3307    }
   3308 
   3309    /* E refers to memory */
   3310    {
   3311       Int    len;
   3312       HChar  dis_buf[50];
   3313       IRTemp addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
   3314       putIRegG(szd, pfx, rm,
   3315                     doScalarWidening(
   3316                        szs,szd,sign_extend,
   3317                        loadLE(szToITy(szs),mkexpr(addr))));
   3318       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
   3319                                nameISize(szs),
   3320                                nameISize(szd),
   3321                                dis_buf,
   3322                                nameIRegG(szd,pfx,rm));
   3323       return len+delta;
   3324    }
   3325 }
   3326 
   3327 
   3328 /* Generate code to divide ArchRegs RDX:RAX / EDX:EAX / DX:AX / AX by
   3329    the 64 / 32 / 16 / 8 bit quantity in the given IRTemp.  */
static
void codegen_div ( Int sz, IRTemp t, Bool signed_divide )
{
   /* special-case the 64-bit case */
   if (sz == 8) {
      /* RDX:RAX / t(64): low 64 bits of the DivMod result go to RAX
         (quotient) and the high 64 bits to RDX (remainder), per the
         x86 DIV/IDIV convention. */
      IROp   op     = signed_divide ? Iop_DivModS128to64
                                    : Iop_DivModU128to64;
      IRTemp src128 = newTemp(Ity_I128);
      IRTemp dst128 = newTemp(Ity_I128);
      assign( src128, binop(Iop_64HLto128,
                            getIReg64(R_RDX),
                            getIReg64(R_RAX)) );
      assign( dst128, binop(op, mkexpr(src128), mkexpr(t)) );
      putIReg64( R_RAX, unop(Iop_128to64,mkexpr(dst128)) );
      putIReg64( R_RDX, unop(Iop_128HIto64,mkexpr(dst128)) );
   } else {
      /* The 32/16/8-bit cases all go through the 64/32 -> 32 divide
         primitive, widening operands as needed with the signedness
         matching the divide. */
      IROp   op    = signed_divide ? Iop_DivModS64to32
                                   : Iop_DivModU64to32;
      IRTemp src64 = newTemp(Ity_I64);
      IRTemp dst64 = newTemp(Ity_I64);
      switch (sz) {
      case 4:
         /* EDX:EAX / t(32) -> quotient in EAX, remainder in EDX. */
         assign( src64,
                 binop(Iop_32HLto64, getIRegRDX(4), getIRegRAX(4)) );
         assign( dst64,
                 binop(op, mkexpr(src64), mkexpr(t)) );
         putIRegRAX( 4, unop(Iop_64to32,mkexpr(dst64)) );
         putIRegRDX( 4, unop(Iop_64HIto32,mkexpr(dst64)) );
         break;
      case 2: {
         /* DX:AX / t(16): widen DX:AX to 64 bits and t to 32 bits,
            then narrow quotient to AX and remainder to DX. */
         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
         assign( src64, unop(widen3264,
                             binop(Iop_16HLto32,
                                   getIRegRDX(2),
                                   getIRegRAX(2))) );
         assign( dst64, binop(op, mkexpr(src64), unop(widen1632,mkexpr(t))) );
         putIRegRAX( 2, unop(Iop_32to16,unop(Iop_64to32,mkexpr(dst64))) );
         putIRegRDX( 2, unop(Iop_32to16,unop(Iop_64HIto32,mkexpr(dst64))) );
         break;
      }
      case 1: {
         /* AX / t(8): the 16-bit dividend lives entirely in AX;
            quotient lands in AL, remainder in AH. */
         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
         IROp widen816  = signed_divide ? Iop_8Sto16  : Iop_8Uto16;
         assign( src64, unop(widen3264,
                        unop(widen1632, getIRegRAX(2))) );
         assign( dst64,
                 binop(op, mkexpr(src64),
                           unop(widen1632, unop(widen816, mkexpr(t)))) );
         putIRegRAX( 1, unop(Iop_16to8,
                        unop(Iop_32to16,
                        unop(Iop_64to32,mkexpr(dst64)))) );
         putIRegAH( unop(Iop_16to8,
                    unop(Iop_32to16,
                    unop(Iop_64HIto32,mkexpr(dst64)))) );
         break;
      }
      default:
         vpanic("codegen_div(amd64)");
      }
   }
}
   3393 
static
ULong dis_Grp1 ( VexAbiInfo* vbi,
                 Prefix pfx,
                 Long delta, UChar modrm,
                 Int am_sz, Int d_sz, Int sz, Long d64 )
{
   /* Group 1: "op $imm, E" where the greg field of the modRM byte
      selects ADD/OR/ADC/SBB/AND/SUB/XOR/CMP.  am_sz and d_sz are the
      sizes of the address mode and the immediate; d64 is the
      (sign-extended) immediate itself.  Returns the updated delta. */
   Int     len;
   HChar   dis_buf[50];
   IRType  ty   = szToITy(sz);
   IRTemp  dst1 = newTemp(ty);
   IRTemp  src  = newTemp(ty);
   IRTemp  dst0 = newTemp(ty);
   IRTemp  addr = IRTemp_INVALID;
   IROp    op8  = Iop_INVALID;
   ULong   mask = mkSizeMask(sz);

   /* Note: case 7 (CMP) is computed as a SUB whose result is then
      discarded by the "< 7" guards below, so only the flags stick. */
   switch (gregLO3ofRM(modrm)) {
      case 0: op8 = Iop_Add8; break;  case 1: op8 = Iop_Or8;  break;
      case 2: break;  // ADC
      case 3: break;  // SBB
      case 4: op8 = Iop_And8; break;  case 5: op8 = Iop_Sub8; break;
      case 6: op8 = Iop_Xor8; break;  case 7: op8 = Iop_Sub8; break;
      /*NOTREACHED*/
      default: vpanic("dis_Grp1(amd64): unhandled case");
   }

   if (epartIsReg(modrm)) {
      vassert(am_sz == 1);

      assign(dst0, getIRegE(sz,pfx,modrm));
      assign(src,  mkU(ty,d64 & mask));

      if (gregLO3ofRM(modrm) == 2 /* ADC */) {
         /* ADC/SBB helpers compute both the result and the flags. */
         helper_ADC( sz, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
      } else
      if (gregLO3ofRM(modrm) == 3 /* SBB */) {
         helper_SBB( sz, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      /* CMP (case 7) sets flags only; skip the register writeback. */
      if (gregLO3ofRM(modrm) < 7)
         putIRegE(sz, pfx, modrm, mkexpr(dst1));

      delta += (am_sz + d_sz);
      DIP("%s%c $%lld, %s\n",
          nameGrp1(gregLO3ofRM(modrm)), nameISize(sz), d64,
          nameIRegE(sz,pfx,modrm));
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /*xtra*/d_sz );

      assign(dst0, loadLE(ty,mkexpr(addr)));
      assign(src, mkU(ty,d64 & mask));

      if (gregLO3ofRM(modrm) == 2 /* ADC */) {
         if (haveLOCK(pfx)) {
            /* cas-style store */
            helper_ADC( sz, dst1, dst0, src,
                       /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_ADC( sz, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else
      if (gregLO3ofRM(modrm) == 3 /* SBB */) {
         if (haveLOCK(pfx)) {
            /* cas-style store */
            helper_SBB( sz, dst1, dst0, src,
                       /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_SBB( sz, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         /* CMP (case 7) performs no store at all. */
         if (gregLO3ofRM(modrm) < 7) {
            if (haveLOCK(pfx)) {
               /* LOCK prefix: emit a compare-and-swap so the
                  load/op/store is atomic. */
               casLE( mkexpr(addr), mkexpr(dst0)/*expVal*/,
                                    mkexpr(dst1)/*newVal*/,
                                    guest_RIP_curr_instr );
            } else {
               storeLE(mkexpr(addr), mkexpr(dst1));
            }
         }
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      delta += (len+d_sz);
      DIP("%s%c $%lld, %s\n",
          nameGrp1(gregLO3ofRM(modrm)), nameISize(sz),
          d64, dis_buf);
   }
   return delta;
}
   3499 
   3500 
   3501 /* Group 2 extended opcodes.  shift_expr must be an 8-bit typed
   3502    expression. */
   3503 
static
ULong dis_Grp2 ( VexAbiInfo* vbi,
                 Prefix pfx,
                 Long delta, UChar modrm,
                 Int am_sz, Int d_sz, Int sz, IRExpr* shift_expr,
                 const HChar* shift_expr_txt, Bool* decode_OK )
{
   /* delta on entry points at the modrm byte. */
   /* Group 2: rotates and shifts (ROL/ROR/RCL/RCR/SHL/SHR/SAL/SAR),
      selected by the greg field of modrm.  shift_expr gives the
      shift/rotate amount (8-bit typed); shift_expr_txt, if non-NULL,
      is its printable form for tracing.  Returns the updated delta. */
   HChar  dis_buf[50];
   Int    len;
   Bool   isShift, isRotate, isRotateC;
   IRType ty    = szToITy(sz);
   IRTemp dst0  = newTemp(ty);
   IRTemp dst1  = newTemp(ty);
   IRTemp addr  = IRTemp_INVALID;

   *decode_OK = True;

   vassert(sz == 1 || sz == 2 || sz == 4 || sz == 8);

   /* Put value to shift/rotate in dst0. */
   if (epartIsReg(modrm)) {
      assign(dst0, getIRegE(sz, pfx, modrm));
      delta += (am_sz + d_sz);
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /*xtra*/d_sz );
      assign(dst0, loadLE(ty,mkexpr(addr)));
      delta += len + d_sz;
   }

   /* Classify the sub-opcode.  Each bodyless switch relies on all
      listed cases falling through to the single assignment. */
   isShift = False;
   switch (gregLO3ofRM(modrm)) { case 4: case 5: case 6: case 7: isShift = True; }

   isRotate = False;
   switch (gregLO3ofRM(modrm)) { case 0: case 1: isRotate = True; }

   isRotateC = False;
   switch (gregLO3ofRM(modrm)) { case 2: case 3: isRotateC = True; }

   if (!isShift && !isRotate && !isRotateC) {
      /*NOTREACHED*/
      vpanic("dis_Grp2(Reg): unhandled case(amd64)");
   }

   if (isRotateC) {
      /* Call a helper; this insn is so ridiculous it does not deserve
         better.  One problem is, the helper has to calculate both the
         new value and the new flags.  This is more than 64 bits, and
         there is no way to return more than 64 bits from the helper.
         Hence the crude and obvious solution is to call it twice,
         using the sign of the sz field to indicate whether it is the
         value or rflags result we want.
      */
      Bool     left = toBool(gregLO3ofRM(modrm) == 2);
      IRExpr** argsVALUE;
      IRExpr** argsRFLAGS;

      IRTemp new_value  = newTemp(Ity_I64);
      IRTemp new_rflags = newTemp(Ity_I64);
      IRTemp old_rflags = newTemp(Ity_I64);

      assign( old_rflags, widenUto64(mk_amd64g_calculate_rflags_all()) );

      /* First call: positive sz selects the value result. */
      argsVALUE
         = mkIRExprVec_4( widenUto64(mkexpr(dst0)), /* thing to rotate */
                          widenUto64(shift_expr),   /* rotate amount */
                          mkexpr(old_rflags),
                          mkU64(sz) );
      assign( new_value,
                 mkIRExprCCall(
                    Ity_I64,
                    0/*regparm*/,
                    left ? "amd64g_calculate_RCL" : "amd64g_calculate_RCR",
                    left ? &amd64g_calculate_RCL  : &amd64g_calculate_RCR,
                    argsVALUE
                 )
            );

      /* Second call: negated sz selects the rflags result. */
      argsRFLAGS
         = mkIRExprVec_4( widenUto64(mkexpr(dst0)), /* thing to rotate */
                          widenUto64(shift_expr),   /* rotate amount */
                          mkexpr(old_rflags),
                          mkU64(-sz) );
      assign( new_rflags,
                 mkIRExprCCall(
                    Ity_I64,
                    0/*regparm*/,
                    left ? "amd64g_calculate_RCL" : "amd64g_calculate_RCR",
                    left ? &amd64g_calculate_RCL  : &amd64g_calculate_RCR,
                    argsRFLAGS
                 )
            );

      /* Install the result and the already-computed flags (COPY
         thunk: DEP1 holds the literal rflags value). */
      assign( dst1, narrowTo(ty, mkexpr(new_value)) );
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(new_rflags) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   }

   else
   if (isShift) {

      IRTemp pre64     = newTemp(Ity_I64);
      IRTemp res64     = newTemp(Ity_I64);
      IRTemp res64ss   = newTemp(Ity_I64);
      IRTemp shift_amt = newTemp(Ity_I8);
      UChar  mask      = toUChar(sz==8 ? 63 : 31);
      IROp   op64;

      switch (gregLO3ofRM(modrm)) {
         case 4: op64 = Iop_Shl64; break;
         case 5: op64 = Iop_Shr64; break;
         case 6: op64 = Iop_Shl64; break;  /* SAL is an alias of SHL */
         case 7: op64 = Iop_Sar64; break;
         /*NOTREACHED*/
         default: vpanic("dis_Grp2:shift"); break;
      }

      /* Widen the value to be shifted to 64 bits, do the shift, and
         narrow back down.  This seems surprisingly long-winded, but
         unfortunately the AMD semantics requires that 8/16/32-bit
         shifts give defined results for shift values all the way up
         to 32, and this seems the simplest way to do it.  It has the
         advantage that the only IR level shifts generated are of 64
         bit values, and the shift amount is guaranteed to be in the
         range 0 .. 63, thereby observing the IR semantics requiring
         all shift values to be in the range 0 .. 2^word_size-1.

         Therefore the shift amount is masked with 63 for 64-bit shifts
         and 31 for all others.
      */
      /* shift_amt = shift_expr & MASK, regardless of operation size */
      assign( shift_amt, binop(Iop_And8, shift_expr, mkU8(mask)) );

      /* suitably widen the value to be shifted to 64 bits. */
      assign( pre64, op64==Iop_Sar64 ? widenSto64(mkexpr(dst0))
                                     : widenUto64(mkexpr(dst0)) );

      /* res64 = pre64 `shift` shift_amt */
      assign( res64, binop(op64, mkexpr(pre64), mkexpr(shift_amt)) );

      /* res64ss = pre64 `shift` ((shift_amt - 1) & MASK) */
      /* (the "shift by one less" value, needed by the flag thunk) */
      assign( res64ss,
              binop(op64,
                    mkexpr(pre64),
                    binop(Iop_And8,
                          binop(Iop_Sub8,
                                mkexpr(shift_amt), mkU8(1)),
                          mkU8(mask))) );

      /* Build the flags thunk. */
      setFlags_DEP1_DEP2_shift(op64, res64, res64ss, ty, shift_amt);

      /* Narrow the result back down. */
      assign( dst1, narrowTo(ty, mkexpr(res64)) );

   } /* if (isShift) */

   else
   if (isRotate) {
      /* ccOp indexes the size-specific ROL/ROR thunk op (B/W/L/Q). */
      Int    ccOp      = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1
                                        : (ty==Ity_I32 ? 2 : 3));
      Bool   left      = toBool(gregLO3ofRM(modrm) == 0);
      IRTemp rot_amt   = newTemp(Ity_I8);
      IRTemp rot_amt64 = newTemp(Ity_I8);
      IRTemp oldFlags  = newTemp(Ity_I64);
      UChar  mask      = toUChar(sz==8 ? 63 : 31);

      /* rot_amt = shift_expr & mask */
      /* By masking the rotate amount thusly, the IR-level Shl/Shr
         expressions never shift beyond the word size and thus remain
         well defined. */
      assign(rot_amt64, binop(Iop_And8, shift_expr, mkU8(mask)));

      if (ty == Ity_I64)
         assign(rot_amt, mkexpr(rot_amt64));
      else
         assign(rot_amt, binop(Iop_And8, mkexpr(rot_amt64), mkU8(8*sz-1)));

      if (left) {

         /* dst1 = (dst0 << rot_amt) | (dst0 >>u (wordsize-rot_amt)) */
         assign(dst1,
            binop( mkSizedOp(ty,Iop_Or8),
                   binop( mkSizedOp(ty,Iop_Shl8),
                          mkexpr(dst0),
                          mkexpr(rot_amt)
                   ),
                   binop( mkSizedOp(ty,Iop_Shr8),
                          mkexpr(dst0),
                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
                   )
            )
         );
         ccOp += AMD64G_CC_OP_ROLB;

      } else { /* right */

         /* dst1 = (dst0 >>u rot_amt) | (dst0 << (wordsize-rot_amt)) */
         assign(dst1,
            binop( mkSizedOp(ty,Iop_Or8),
                   binop( mkSizedOp(ty,Iop_Shr8),
                          mkexpr(dst0),
                          mkexpr(rot_amt)
                   ),
                   binop( mkSizedOp(ty,Iop_Shl8),
                          mkexpr(dst0),
                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
                   )
            )
         );
         ccOp += AMD64G_CC_OP_RORB;

      }

      /* dst1 now holds the rotated value.  Build flag thunk.  We
         need the resulting value for this, and the previous flags.
         Except don't set it if the rotate count is zero. */

      assign(oldFlags, mk_amd64g_calculate_rflags_all());

      /* rot_amt64 :: Ity_I8.  We need to convert it to I1. */
      IRTemp rot_amt64b = newTemp(Ity_I1);
      assign(rot_amt64b, binop(Iop_CmpNE8, mkexpr(rot_amt64), mkU8(0)) );

      /* CC_DEP1 is the rotated value.  CC_NDEP is flags before. */
      /* Each field is conditionally updated: a zero rotate count
         leaves the existing thunk (and hence the flags) untouched. */
      stmt( IRStmt_Put( OFFB_CC_OP,
                        IRExpr_ITE( mkexpr(rot_amt64b),
                                    mkU64(ccOp),
                                    IRExpr_Get(OFFB_CC_OP,Ity_I64) ) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1,
                        IRExpr_ITE( mkexpr(rot_amt64b),
                                    widenUto64(mkexpr(dst1)),
                                    IRExpr_Get(OFFB_CC_DEP1,Ity_I64) ) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2,
                        IRExpr_ITE( mkexpr(rot_amt64b),
                                    mkU64(0),
                                    IRExpr_Get(OFFB_CC_DEP2,Ity_I64) ) ));
      stmt( IRStmt_Put( OFFB_CC_NDEP,
                        IRExpr_ITE( mkexpr(rot_amt64b),
                                    mkexpr(oldFlags),
                                    IRExpr_Get(OFFB_CC_NDEP,Ity_I64) ) ));
   } /* if (isRotate) */

   /* Save result, and finish up. */
   if (epartIsReg(modrm)) {
      putIRegE(sz, pfx, modrm, mkexpr(dst1));
      if (vex_traceflags & VEX_TRACE_FE) {
         vex_printf("%s%c ",
                    nameGrp2(gregLO3ofRM(modrm)), nameISize(sz) );
         if (shift_expr_txt)
            vex_printf("%s", shift_expr_txt);
         else
            ppIRExpr(shift_expr);
         vex_printf(", %s\n", nameIRegE(sz,pfx,modrm));
      }
   } else {
      storeLE(mkexpr(addr), mkexpr(dst1));
      if (vex_traceflags & VEX_TRACE_FE) {
         vex_printf("%s%c ",
                    nameGrp2(gregLO3ofRM(modrm)), nameISize(sz) );
         if (shift_expr_txt)
            vex_printf("%s", shift_expr_txt);
         else
            ppIRExpr(shift_expr);
         vex_printf(", %s\n", dis_buf);
      }
   }
   return delta;
}
   3775 
   3776 
   3777 /* Group 8 extended opcodes (but BT/BTS/BTC/BTR only). */
static
ULong dis_Grp8_Imm ( VexAbiInfo* vbi,
                     Prefix pfx,
                     Long delta, UChar modrm,
                     Int am_sz, Int sz, ULong src_val,
                     Bool* decode_OK )
{
   /* src_val denotes a d8.
      And delta on entry points at the modrm byte. */
   /* Group 8 with an immediate bit offset: BT/BTS/BTR/BTC, selected
      by the greg field of modrm.  Sets *decode_OK to False on an
      invalid prefix combination, operand size, or sub-opcode. */

   IRType ty     = szToITy(sz);
   IRTemp t2     = newTemp(Ity_I64);   /* the original value, widened to 64 bits */
   IRTemp t2m    = newTemp(Ity_I64);   /* the modified value (unused for BT) */
   IRTemp t_addr = IRTemp_INVALID;
   HChar  dis_buf[50];
   ULong  mask;

   /* we're optimists :-) */
   *decode_OK = True;

   /* Check whether F2 or F3 are acceptable. */
   if (epartIsReg(modrm)) {
      /* F2 or F3 are not allowed in the register case. */
      if (haveF2orF3(pfx)) {
         *decode_OK = False;
         return delta;
     }
   } else {
      /* F2 or F3 (but not both) are allowable provided LOCK is also
         present. */
      if (haveF2orF3(pfx)) {
         if (haveF2andF3(pfx) || !haveLOCK(pfx)) {
            *decode_OK = False;
            return delta;
         }
      }
   }

   /* Limit src_val -- the bit offset -- to something within a word.
      The Intel docs say that literal offsets larger than a word are
      masked in this way. */
   switch (sz) {
      case 2:  src_val &= 15; break;
      case 4:  src_val &= 31; break;
      case 8:  src_val &= 63; break;
      default: *decode_OK = False; return delta;
   }

   /* Invent a mask suitable for the operation. */
   switch (gregLO3ofRM(modrm)) {
      case 4: /* BT */  mask = 0;                  break;
      case 5: /* BTS */ mask = 1ULL << src_val;    break;
      case 6: /* BTR */ mask = ~(1ULL << src_val); break;
      case 7: /* BTC */ mask = 1ULL << src_val;    break;
         /* If this needs to be extended, probably simplest to make a
            new function to handle the other cases (0 .. 3).  The
            Intel docs do however not indicate any use for 0 .. 3, so
            we don't expect this to happen. */
      default: *decode_OK = False; return delta;
   }

   /* Fetch the value to be tested and modified into t2, which is
      64-bits wide regardless of sz. */
   if (epartIsReg(modrm)) {
      vassert(am_sz == 1);
      assign( t2, widenUto64(getIRegE(sz, pfx, modrm)) );
      delta += (am_sz + 1);
      DIP("%s%c $0x%llx, %s\n", nameGrp8(gregLO3ofRM(modrm)),
                                nameISize(sz),
                                src_val, nameIRegE(sz,pfx,modrm));
   } else {
      Int len;
      t_addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 1 );
      delta  += (len+1);
      assign( t2, widenUto64(loadLE(ty, mkexpr(t_addr))) );
      DIP("%s%c $0x%llx, %s\n", nameGrp8(gregLO3ofRM(modrm)),
                                nameISize(sz),
                                src_val, dis_buf);
   }

   /* Compute the new value into t2m, if non-BT. */
   switch (gregLO3ofRM(modrm)) {
      case 4: /* BT */
         break;
      case 5: /* BTS */
         assign( t2m, binop(Iop_Or64, mkU64(mask), mkexpr(t2)) );
         break;
      case 6: /* BTR */
         assign( t2m, binop(Iop_And64, mkU64(mask), mkexpr(t2)) );
         break;
      case 7: /* BTC */
         assign( t2m, binop(Iop_Xor64, mkU64(mask), mkexpr(t2)) );
         break;
     default:
         /*NOTREACHED*/ /*the previous switch guards this*/
         vassert(0);
   }

   /* Write the result back, if non-BT. */
   if (gregLO3ofRM(modrm) != 4 /* BT */) {
      if (epartIsReg(modrm)) {
        putIRegE(sz, pfx, modrm, narrowTo(ty, mkexpr(t2m)));
      } else {
         if (haveLOCK(pfx)) {
            /* LOCK prefix: compare-and-swap for atomicity. */
            casLE( mkexpr(t_addr),
                   narrowTo(ty, mkexpr(t2))/*expd*/,
                   narrowTo(ty, mkexpr(t2m))/*new*/,
                   guest_RIP_curr_instr );
         } else {
            storeLE(mkexpr(t_addr), narrowTo(ty, mkexpr(t2m)));
         }
      }
   }

   /* Copy relevant bit from t2 into the carry flag. */
   /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop(Iop_And64,
                  binop(Iop_Shr64, mkexpr(t2), mkU8(src_val)),
                  mkU64(1))
       ));
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

   return delta;
}
   3908 
   3909 
   3910 /* Signed/unsigned widening multiply.  Generate IR to multiply the
   3911    value in RAX/EAX/AX/AL by the given IRTemp, and park the result in
   3912    RDX:RAX/EDX:EAX/DX:AX/AX.
   3913 */
   3914 static void codegen_mulL_A_D ( Int sz, Bool syned,
   3915                                IRTemp tmp, const HChar* tmp_txt )
   3916 {
   3917    IRType ty = szToITy(sz);
   3918    IRTemp t1 = newTemp(ty);
   3919 
   3920    assign( t1, getIRegRAX(sz) );
   3921 
   3922    switch (ty) {
   3923       case Ity_I64: {
   3924          IRTemp res128  = newTemp(Ity_I128);
   3925          IRTemp resHi   = newTemp(Ity_I64);
   3926          IRTemp resLo   = newTemp(Ity_I64);
   3927          IROp   mulOp   = syned ? Iop_MullS64 : Iop_MullU64;
   3928          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   3929          setFlags_MUL ( Ity_I64, t1, tmp, tBaseOp );
   3930          assign( res128, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   3931          assign( resHi, unop(Iop_128HIto64,mkexpr(res128)));
   3932          assign( resLo, unop(Iop_128to64,mkexpr(res128)));
   3933          putIReg64(R_RDX, mkexpr(resHi));
   3934          putIReg64(R_RAX, mkexpr(resLo));
   3935          break;
   3936       }
   3937       case Ity_I32: {
   3938          IRTemp res64   = newTemp(Ity_I64);
   3939          IRTemp resHi   = newTemp(Ity_I32);
   3940          IRTemp resLo   = newTemp(Ity_I32);
   3941          IROp   mulOp   = syned ? Iop_MullS32 : Iop_MullU32;
   3942          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   3943          setFlags_MUL ( Ity_I32, t1, tmp, tBaseOp );
   3944          assign( res64, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   3945          assign( resHi, unop(Iop_64HIto32,mkexpr(res64)));
   3946          assign( resLo, unop(Iop_64to32,mkexpr(res64)));
   3947          putIRegRDX(4, mkexpr(resHi));
   3948          putIRegRAX(4, mkexpr(resLo));
   3949          break;
   3950       }
   3951       case Ity_I16: {
   3952          IRTemp res32   = newTemp(Ity_I32);
   3953          IRTemp resHi   = newTemp(Ity_I16);
   3954          IRTemp resLo   = newTemp(Ity_I16);
   3955          IROp   mulOp   = syned ? Iop_MullS16 : Iop_MullU16;
   3956          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   3957          setFlags_MUL ( Ity_I16, t1, tmp, tBaseOp );
   3958          assign( res32, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   3959          assign( resHi, unop(Iop_32HIto16,mkexpr(res32)));
   3960          assign( resLo, unop(Iop_32to16,mkexpr(res32)));
   3961          putIRegRDX(2, mkexpr(resHi));
   3962          putIRegRAX(2, mkexpr(resLo));
   3963          break;
   3964       }
   3965       case Ity_I8: {
   3966          IRTemp res16   = newTemp(Ity_I16);
   3967          IRTemp resHi   = newTemp(Ity_I8);
   3968          IRTemp resLo   = newTemp(Ity_I8);
   3969          IROp   mulOp   = syned ? Iop_MullS8 : Iop_MullU8;
   3970          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   3971          setFlags_MUL ( Ity_I8, t1, tmp, tBaseOp );
   3972          assign( res16, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   3973          assign( resHi, unop(Iop_16HIto8,mkexpr(res16)));
   3974          assign( resLo, unop(Iop_16to8,mkexpr(res16)));
   3975          putIRegRAX(2, mkexpr(res16));
   3976          break;
   3977       }
   3978       default:
   3979          ppIRType(ty);
   3980          vpanic("codegen_mulL_A_D(amd64)");
   3981    }
   3982    DIP("%s%c %s\n", syned ? "imul" : "mul", nameISize(sz), tmp_txt);
   3983 }
   3984 
   3985 
   3986 /* Group 3 extended opcodes.  We have to decide here whether F2 and F3
   3987    might be valid.*/
static
ULong dis_Grp3 ( VexAbiInfo* vbi,
                 Prefix pfx, Int sz, Long delta, Bool* decode_OK )
{
   /* Grp3 (opcodes 0xF6/0xF7): TEST/NOT/NEG/MUL/IMUL/DIV/IDIV,
      selected by bits 5:3 of the modRM byte.  Returns the updated
      instruction offset; on an undecodable form sets *decode_OK to
      False instead. */
   Long    d64;
   UChar   modrm;
   HChar   dis_buf[50];
   Int     len;
   IRTemp  addr;
   IRType  ty = szToITy(sz);
   IRTemp  t1 = newTemp(ty);
   IRTemp dst1, src, dst0;
   *decode_OK = True;
   modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      /* F2/XACQ and F3/XREL are always invalid in the non-mem case. */
      if (haveF2orF3(pfx)) goto unhandled;
      switch (gregLO3ofRM(modrm)) {
         case 0: { /* TEST */
            delta++;
            /* Immediate is at most 4 bytes, sign-extended to the
               operand size; mask to the operand width before use. */
            d64 = getSDisp(imin(4,sz), delta);
            delta += imin(4,sz);
            dst1 = newTemp(ty);
            assign(dst1, binop(mkSizedOp(ty,Iop_And8),
                               getIRegE(sz,pfx,modrm),
                               mkU(ty, d64 & mkSizeMask(sz))));
            setFlags_DEP1( Iop_And8, dst1, ty );
            DIP("test%c $%lld, %s\n",
                nameISize(sz), d64,
                nameIRegE(sz, pfx, modrm));
            break;
         }
         case 1:
            /* /1 is an undefined encoding in Grp3. */
            *decode_OK = False;
            return delta;
         case 2: /* NOT */
            delta++;
            putIRegE(sz, pfx, modrm,
                              unop(mkSizedOp(ty,Iop_Not8),
                                   getIRegE(sz, pfx, modrm)));
            DIP("not%c %s\n", nameISize(sz),
                              nameIRegE(sz, pfx, modrm));
            break;
         case 3: /* NEG */
            delta++;
            dst0 = newTemp(ty);
            src  = newTemp(ty);
            dst1 = newTemp(ty);
            /* NEG is modelled as 0 - src so the standard SUB flag
               thunk can be reused. */
            assign(dst0, mkU(ty,0));
            assign(src,  getIRegE(sz, pfx, modrm));
            assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0),
                                                       mkexpr(src)));
            setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
            putIRegE(sz, pfx, modrm, mkexpr(dst1));
            DIP("neg%c %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm));
            break;
         case 4: /* MUL (unsigned widening) */
            delta++;
            src = newTemp(ty);
            assign(src, getIRegE(sz,pfx,modrm));
            codegen_mulL_A_D ( sz, False, src,
                               nameIRegE(sz,pfx,modrm) );
            break;
         case 5: /* IMUL (signed widening) */
            delta++;
            src = newTemp(ty);
            assign(src, getIRegE(sz,pfx,modrm));
            codegen_mulL_A_D ( sz, True, src,
                               nameIRegE(sz,pfx,modrm) );
            break;
         case 6: /* DIV */
            delta++;
            assign( t1, getIRegE(sz, pfx, modrm) );
            codegen_div ( sz, t1, False );
            DIP("div%c %s\n", nameISize(sz),
                              nameIRegE(sz, pfx, modrm));
            break;
         case 7: /* IDIV */
            delta++;
            assign( t1, getIRegE(sz, pfx, modrm) );
            codegen_div ( sz, t1, True );
            DIP("idiv%c %s\n", nameISize(sz),
                               nameIRegE(sz, pfx, modrm));
            break;
         default:
            /*NOTREACHED*/
            vpanic("Grp3(amd64,R)");
      }
   } else {
      /* Decide if F2/XACQ or F3/XREL might be valid. */
      Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
      /* A single F2 or F3 is tolerated for memory-destination
         NOT/NEG when LOCK is also present (XACQUIRE/XRELEASE hints). */
      if ((gregLO3ofRM(modrm) == 3/*NEG*/ || gregLO3ofRM(modrm) == 2/*NOT*/)
          && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
         validF2orF3 = True;
      }
      if (!validF2orF3) goto unhandled;
      /* */
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf,
                        /* we have to inform disAMode of any immediate
                           bytes used */
                        gregLO3ofRM(modrm)==0/*TEST*/
                           ? imin(4,sz)
                           : 0
                      );
      /* Note: this re-news t1; the temp created at the top of the
         function is unused on this path. */
      t1   = newTemp(ty);
      delta += len;
      assign(t1, loadLE(ty,mkexpr(addr)));
      switch (gregLO3ofRM(modrm)) {
         case 0: { /* TEST */
            d64 = getSDisp(imin(4,sz), delta);
            delta += imin(4,sz);
            dst1 = newTemp(ty);
            assign(dst1, binop(mkSizedOp(ty,Iop_And8),
                               mkexpr(t1),
                               mkU(ty, d64 & mkSizeMask(sz))));
            setFlags_DEP1( Iop_And8, dst1, ty );
            DIP("test%c $%lld, %s\n", nameISize(sz), d64, dis_buf);
            break;
         }
         case 1:
            /* /1 is an undefined encoding in Grp3. */
            *decode_OK = False;
            return delta;
         case 2: /* NOT */
            dst1 = newTemp(ty);
            assign(dst1, unop(mkSizedOp(ty,Iop_Not8), mkexpr(t1)));
            if (haveLOCK(pfx)) {
               /* Atomic read-modify-write via compare-and-swap. */
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
                                    guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(dst1) );
            }
            DIP("not%c %s\n", nameISize(sz), dis_buf);
            break;
         case 3: /* NEG */
            dst0 = newTemp(ty);
            src  = newTemp(ty);
            dst1 = newTemp(ty);
            /* As above, NEG is 0 - src for flag-thunk reuse. */
            assign(dst0, mkU(ty,0));
            assign(src,  mkexpr(t1));
            assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0),
                                                       mkexpr(src)));
            if (haveLOCK(pfx)) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
                                    guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(dst1) );
            }
            setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
            DIP("neg%c %s\n", nameISize(sz), dis_buf);
            break;
         case 4: /* MUL (unsigned widening) */
            codegen_mulL_A_D ( sz, False, t1, dis_buf );
            break;
         case 5: /* IMUL */
            codegen_mulL_A_D ( sz, True, t1, dis_buf );
            break;
         case 6: /* DIV */
            codegen_div ( sz, t1, False );
            DIP("div%c %s\n", nameISize(sz), dis_buf);
            break;
         case 7: /* IDIV */
            codegen_div ( sz, t1, True );
            DIP("idiv%c %s\n", nameISize(sz), dis_buf);
            break;
         default:
            /*NOTREACHED*/
            vpanic("Grp3(amd64,M)");
      }
   }
   return delta;
  unhandled:
   *decode_OK = False;
   return delta;
}
   4162 
   4163 
   4164 /* Group 4 extended opcodes.  We have to decide here whether F2 and F3
   4165    might be valid. */
static
ULong dis_Grp4 ( VexAbiInfo* vbi,
                 Prefix pfx, Long delta, Bool* decode_OK )
{
   /* Grp4 (opcode 0xFE): byte-sized INC/DEC of the E operand,
      selected by bits 5:3 of the modRM byte.  All other /reg values
      are undefined and reported via *decode_OK. */
   Int   alen;
   UChar modrm;
   HChar dis_buf[50];
   IRType ty = Ity_I8;
   IRTemp t1 = newTemp(ty);
   IRTemp t2 = newTemp(ty);

   *decode_OK = True;

   modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      /* F2/XACQ and F3/XREL are always invalid in the non-mem case. */
      if (haveF2orF3(pfx)) goto unhandled;
      assign(t1, getIRegE(1, pfx, modrm));
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
            putIRegE(1, pfx, modrm, mkexpr(t2));
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
            putIRegE(1, pfx, modrm, mkexpr(t2));
            setFlags_INC_DEC( False, t2, ty );
            break;
         default:
            *decode_OK = False;
            return delta;
      }
      delta++;
      DIP("%sb %s\n", nameGrp4(gregLO3ofRM(modrm)),
                      nameIRegE(1, pfx, modrm));
   } else {
      /* Decide if F2/XACQ or F3/XREL might be valid. */
      Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
      /* A single F2 or F3 is tolerated for memory INC/DEC when LOCK
         is also present (XACQUIRE/XRELEASE hints). */
      if ((gregLO3ofRM(modrm) == 0/*INC*/ || gregLO3ofRM(modrm) == 1/*DEC*/)
          && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
         validF2orF3 = True;
      }
      if (!validF2orF3) goto unhandled;
      /* */
      IRTemp addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( t1, loadLE(ty, mkexpr(addr)) );
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
            if (haveLOCK(pfx)) {
               /* Atomic read-modify-write via compare-and-swap. */
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
                      guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(t2) );
            }
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
            if (haveLOCK(pfx)) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
                      guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(t2) );
            }
            setFlags_INC_DEC( False, t2, ty );
            break;
         default:
            *decode_OK = False;
            return delta;
      }
      delta += alen;
      DIP("%sb %s\n", nameGrp4(gregLO3ofRM(modrm)), dis_buf);
   }
   return delta;
  unhandled:
   *decode_OK = False;
   return delta;
}
   4246 
   4247 
   4248 /* Group 5 extended opcodes.  We have to decide here whether F2 and F3
   4249    might be valid. */
static
ULong dis_Grp5 ( VexAbiInfo* vbi,
                 Prefix pfx, Int sz, Long delta,
                 /*MOD*/DisResult* dres, /*OUT*/Bool* decode_OK )
{
   /* Grp5 (opcode 0xFF): INC/DEC/CALL-Ev/JMP-Ev/PUSH-Ev, selected by
      bits 5:3 of the modRM byte.  CALL and JMP end the basic block
      (dres is set to Dis_StopHere).  Unhandled forms are reported via
      *decode_OK. */
   Int     len;
   UChar   modrm;
   HChar   dis_buf[50];
   IRTemp  addr = IRTemp_INVALID;
   IRType  ty = szToITy(sz);
   IRTemp  t1 = newTemp(ty);
   IRTemp  t2 = IRTemp_INVALID;
   IRTemp  t3 = IRTemp_INVALID;
   Bool    showSz = True;   /* print a size suffix in the disasm? */

   *decode_OK = True;

   modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      /* F2/XACQ and F3/XREL are always invalid in the non-mem case.
         F2/CALL and F2/JMP may have bnd prefix. */
     if (haveF2orF3(pfx)
         && ! (haveF2(pfx)
               && (gregLO3ofRM(modrm) == 2 || gregLO3ofRM(modrm) == 4)))
        goto unhandledR;
      assign(t1, getIRegE(sz,pfx,modrm));
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Add8),
                             mkexpr(t1), mkU(ty,1)));
            setFlags_INC_DEC( True, t2, ty );
            putIRegE(sz,pfx,modrm, mkexpr(t2));
            break;
         case 1: /* DEC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
                             mkexpr(t1), mkU(ty,1)));
            setFlags_INC_DEC( False, t2, ty );
            putIRegE(sz,pfx,modrm, mkexpr(t2));
            break;
         case 2: /* call Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandledR;
            if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, getIRegE(sz,pfx,modrm));
            /* Push the return address (next insn) and jump to t3. */
            t2 = newTemp(Ity_I64);
            assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
            putIReg64(R_RSP, mkexpr(t2));
            storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+1));
            make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(reg)");
            jmp_treg(dres, Ijk_Call, t3);
            vassert(dres->whatNext == Dis_StopHere);
            showSz = False;
            break;
         case 4: /* jmp Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandledR;
            if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, getIRegE(sz,pfx,modrm));
            jmp_treg(dres, Ijk_Boring, t3);
            vassert(dres->whatNext == Dis_StopHere);
            showSz = False;
            break;
         case 6: /* PUSH Ev */
            /* There is no encoding for 32-bit operand size; hence ... */
            if (sz == 4) sz = 8;
            if (sz == 8 || sz == 2) {
               ty = szToITy(sz); /* redo it, since sz might have changed */
               t3 = newTemp(ty);
               assign(t3, getIRegE(sz,pfx,modrm));
               t2 = newTemp(Ity_I64);
               assign( t2, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
               putIReg64(R_RSP, mkexpr(t2) );
               storeLE( mkexpr(t2), mkexpr(t3) );
               break;
            } else {
               goto unhandledR; /* awaiting test case */
            }
         default:
         unhandledR:
            *decode_OK = False;
            return delta;
      }
      delta++;
      DIP("%s%c %s\n", nameGrp5(gregLO3ofRM(modrm)),
                       showSz ? nameISize(sz) : ' ',
                       nameIRegE(sz, pfx, modrm));
   } else {
      /* Decide if F2/XACQ, F3/XREL, F2/CALL or F2/JMP might be valid. */
      Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
      /* Single F2/F3 with LOCK for INC/DEC (XACQUIRE/XRELEASE), or a
         lone F2 for CALL/JMP (MPX bnd). */
      if ((gregLO3ofRM(modrm) == 0/*INC*/ || gregLO3ofRM(modrm) == 1/*DEC*/)
          && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
         validF2orF3 = True;
      } else if ((gregLO3ofRM(modrm) == 2 || gregLO3ofRM(modrm) == 4)
                 && (haveF2(pfx) && !haveF3(pfx))) {
         validF2orF3 = True;
      }
      if (!validF2orF3) goto unhandledM;
      /* */
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
      /* CALL/JMP/PUSH load at their own width below; only the
         INC/DEC cases consume t1. */
      if (gregLO3ofRM(modrm) != 2 && gregLO3ofRM(modrm) != 4
                                  && gregLO3ofRM(modrm) != 6) {
         assign(t1, loadLE(ty,mkexpr(addr)));
      }
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Add8),
                             mkexpr(t1), mkU(ty,1)));
            if (haveLOCK(pfx)) {
               /* Atomic read-modify-write via compare-and-swap. */
               casLE( mkexpr(addr),
                      mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
            } else {
               storeLE(mkexpr(addr),mkexpr(t2));
            }
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
                             mkexpr(t1), mkU(ty,1)));
            if (haveLOCK(pfx)) {
               casLE( mkexpr(addr),
                      mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
            } else {
               storeLE(mkexpr(addr),mkexpr(t2));
            }
            setFlags_INC_DEC( False, t2, ty );
            break;
         case 2: /* call Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandledM;
            if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, loadLE(Ity_I64,mkexpr(addr)));
            /* Push the return address (next insn) and jump to t3. */
            t2 = newTemp(Ity_I64);
            assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
            putIReg64(R_RSP, mkexpr(t2));
            storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+len));
            make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(mem)");
            jmp_treg(dres, Ijk_Call, t3);
            vassert(dres->whatNext == Dis_StopHere);
            showSz = False;
            break;
         case 4: /* JMP Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandledM;
            if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, loadLE(Ity_I64,mkexpr(addr)));
            jmp_treg(dres, Ijk_Boring, t3);
            vassert(dres->whatNext == Dis_StopHere);
            showSz = False;
            break;
         case 6: /* PUSH Ev */
            /* There is no encoding for 32-bit operand size; hence ... */
            if (sz == 4) sz = 8;
            if (sz == 8 || sz == 2) {
               ty = szToITy(sz); /* redo it, since sz might have changed */
               t3 = newTemp(ty);
               assign(t3, loadLE(ty,mkexpr(addr)));
               t2 = newTemp(Ity_I64);
               assign( t2, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
               putIReg64(R_RSP, mkexpr(t2) );
               storeLE( mkexpr(t2), mkexpr(t3) );
               break;
            } else {
               goto unhandledM; /* awaiting test case */
            }
         default:
         unhandledM:
            *decode_OK = False;
            return delta;
      }
      delta += len;
      DIP("%s%c %s\n", nameGrp5(gregLO3ofRM(modrm)),
                       showSz ? nameISize(sz) : ' ',
                       dis_buf);
   }
   return delta;
}
   4438 
   4439 
   4440 /*------------------------------------------------------------*/
   4441 /*--- Disassembling string ops (including REP prefixes)    ---*/
   4442 /*------------------------------------------------------------*/
   4443 
   4444 /* Code shared by all the string ops */
   4445 static
   4446 void dis_string_op_increment ( Int sz, IRTemp t_inc )
   4447 {
   4448    UChar logSz;
   4449    if (sz == 8 || sz == 4 || sz == 2) {
   4450       logSz = 1;
   4451       if (sz == 4) logSz = 2;
   4452       if (sz == 8) logSz = 3;
   4453       assign( t_inc,
   4454               binop(Iop_Shl64, IRExpr_Get( OFFB_DFLAG, Ity_I64 ),
   4455                                mkU8(logSz) ) );
   4456    } else {
   4457       assign( t_inc,
   4458               IRExpr_Get( OFFB_DFLAG, Ity_I64 ) );
   4459    }
   4460 }
   4461 
   4462 static
   4463 void dis_string_op( void (*dis_OP)( Int, IRTemp, Prefix pfx ),
   4464                     Int sz, const HChar* name, Prefix pfx )
   4465 {
   4466    IRTemp t_inc = newTemp(Ity_I64);
   4467    /* Really we ought to inspect the override prefixes, but we don't.
   4468       The following assertion catches any resulting sillyness. */
   4469    vassert(pfx == clearSegBits(pfx));
   4470    dis_string_op_increment(sz, t_inc);
   4471    dis_OP( sz, t_inc, pfx );
   4472    DIP("%s%c\n", name, nameISize(sz));
   4473 }
   4474 
   4475 static
   4476 void dis_MOVS ( Int sz, IRTemp t_inc, Prefix pfx )
   4477 {
   4478    IRType ty = szToITy(sz);
   4479    IRTemp td = newTemp(Ity_I64);   /* RDI */
   4480    IRTemp ts = newTemp(Ity_I64);   /* RSI */
   4481    IRExpr *incd, *incs;
   4482 
   4483    if (haveASO(pfx)) {
   4484       assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   4485       assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
   4486    } else {
   4487       assign( td, getIReg64(R_RDI) );
   4488       assign( ts, getIReg64(R_RSI) );
   4489    }
   4490 
   4491    storeLE( mkexpr(td), loadLE(ty,mkexpr(ts)) );
   4492 
   4493    incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   4494    incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
   4495    if (haveASO(pfx)) {
   4496       incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   4497       incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
   4498    }
   4499    putIReg64( R_RDI, incd );
   4500    putIReg64( R_RSI, incs );
   4501 }
   4502 
   4503 static
   4504 void dis_LODS ( Int sz, IRTemp t_inc, Prefix pfx )
   4505 {
   4506    IRType ty = szToITy(sz);
   4507    IRTemp ts = newTemp(Ity_I64);   /* RSI */
   4508    IRExpr *incs;
   4509 
   4510    if (haveASO(pfx))
   4511       assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
   4512    else
   4513       assign( ts, getIReg64(R_RSI) );
   4514 
   4515    putIRegRAX ( sz, loadLE(ty, mkexpr(ts)) );
   4516 
   4517    incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
   4518    if (haveASO(pfx))
   4519       incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
   4520    putIReg64( R_RSI, incs );
   4521 }
   4522 
   4523 static
   4524 void dis_STOS ( Int sz, IRTemp t_inc, Prefix pfx )
   4525 {
   4526    IRType ty = szToITy(sz);
   4527    IRTemp ta = newTemp(ty);        /* rAX */
   4528    IRTemp td = newTemp(Ity_I64);   /* RDI */
   4529    IRExpr *incd;
   4530 
   4531    assign( ta, getIRegRAX(sz) );
   4532 
   4533    if (haveASO(pfx))
   4534       assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   4535    else
   4536       assign( td, getIReg64(R_RDI) );
   4537 
   4538    storeLE( mkexpr(td), mkexpr(ta) );
   4539 
   4540    incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   4541    if (haveASO(pfx))
   4542       incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   4543    putIReg64( R_RDI, incd );
   4544 }
   4545 
   4546 static
   4547 void dis_CMPS ( Int sz, IRTemp t_inc, Prefix pfx )
   4548 {
   4549    IRType ty  = szToITy(sz);
   4550    IRTemp tdv = newTemp(ty);      /* (RDI) */
   4551    IRTemp tsv = newTemp(ty);      /* (RSI) */
   4552    IRTemp td  = newTemp(Ity_I64); /*  RDI  */
   4553    IRTemp ts  = newTemp(Ity_I64); /*  RSI  */
   4554    IRExpr *incd, *incs;
   4555 
   4556    if (haveASO(pfx)) {
   4557       assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   4558       assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
   4559    } else {
   4560       assign( td, getIReg64(R_RDI) );
   4561       assign( ts, getIReg64(R_RSI) );
   4562    }
   4563 
   4564    assign( tdv, loadLE(ty,mkexpr(td)) );
   4565 
   4566    assign( tsv, loadLE(ty,mkexpr(ts)) );
   4567 
   4568    setFlags_DEP1_DEP2 ( Iop_Sub8, tsv, tdv, ty );
   4569 
   4570    incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   4571    incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
   4572    if (haveASO(pfx)) {
   4573       incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   4574       incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
   4575    }
   4576    putIReg64( R_RDI, incd );
   4577    putIReg64( R_RSI, incs );
   4578 }
   4579 
   4580 static
   4581 void dis_SCAS ( Int sz, IRTemp t_inc, Prefix pfx )
   4582 {
   4583    IRType ty  = szToITy(sz);
   4584    IRTemp ta  = newTemp(ty);       /*  rAX  */
   4585    IRTemp td  = newTemp(Ity_I64);  /*  RDI  */
   4586    IRTemp tdv = newTemp(ty);       /* (RDI) */
   4587    IRExpr *incd;
   4588 
   4589    assign( ta, getIRegRAX(sz) );
   4590 
   4591    if (haveASO(pfx))
   4592       assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   4593    else
   4594       assign( td, getIReg64(R_RDI) );
   4595 
   4596    assign( tdv, loadLE(ty,mkexpr(td)) );
   4597 
   4598    setFlags_DEP1_DEP2 ( Iop_Sub8, ta, tdv, ty );
   4599 
   4600    incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   4601    if (haveASO(pfx))
   4602       incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   4603    putIReg64( R_RDI, incd );
   4604 }
   4605 
   4606 
   4607 /* Wrap the appropriate string op inside a REP/REPE/REPNE.  We assume
   4608    the insn is the last one in the basic block, and so emit a jump to
   4609    the next insn, rather than just falling through. */
   4610 static
   4611 void dis_REP_op ( /*MOD*/DisResult* dres,
   4612                   AMD64Condcode cond,
   4613                   void (*dis_OP)(Int, IRTemp, Prefix),
   4614                   Int sz, Addr64 rip, Addr64 rip_next, const HChar* name,
   4615                   Prefix pfx )
   4616 {
   4617    IRTemp t_inc = newTemp(Ity_I64);
   4618    IRTemp tc;
   4619    IRExpr* cmp;
   4620 
   4621    /* Really we ought to inspect the override prefixes, but we don't.
   4622       The following assertion catches any resulting sillyness. */
   4623    vassert(pfx == clearSegBits(pfx));
   4624 
   4625    if (haveASO(pfx)) {
   4626       tc = newTemp(Ity_I32);  /*  ECX  */
   4627       assign( tc, getIReg32(R_RCX) );
   4628       cmp = binop(Iop_CmpEQ32, mkexpr(tc), mkU32(0));
   4629    } else {
   4630       tc = newTemp(Ity_I64);  /*  RCX  */
   4631       assign( tc, getIReg64(R_RCX) );
   4632       cmp = binop(Iop_CmpEQ64, mkexpr(tc), mkU64(0));
   4633    }
   4634 
   4635    stmt( IRStmt_Exit( cmp, Ijk_Boring,
   4636                       IRConst_U64(rip_next), OFFB_RIP ) );
   4637 
   4638    if (haveASO(pfx))
   4639       putIReg32(R_RCX, binop(Iop_Sub32, mkexpr(tc), mkU32(1)) );
   4640   else
   4641       putIReg64(R_RCX, binop(Iop_Sub64, mkexpr(tc), mkU64(1)) );
   4642 
   4643    dis_string_op_increment(sz, t_inc);
   4644    dis_OP (sz, t_inc, pfx);
   4645 
   4646    if (cond == AMD64CondAlways) {
   4647       jmp_lit(dres, Ijk_Boring, rip);
   4648       vassert(dres->whatNext == Dis_StopHere);
   4649    } else {
   4650       stmt( IRStmt_Exit( mk_amd64g_calculate_condition(cond),
   4651                          Ijk_Boring,
   4652                          IRConst_U64(rip),
   4653                          OFFB_RIP ) );
   4654       jmp_lit(dres, Ijk_Boring, rip_next);
   4655       vassert(dres->whatNext == Dis_StopHere);
   4656    }
   4657    DIP("%s%c\n", name, nameISize(sz));
   4658 }
   4659 
   4660 
   4661 /*------------------------------------------------------------*/
   4662 /*--- Arithmetic, etc.                                     ---*/
   4663 /*------------------------------------------------------------*/
   4664 
   4665 /* IMUL E, G.  Supplied eip points to the modR/M byte. */
   4666 static
   4667 ULong dis_mul_E_G ( VexAbiInfo* vbi,
   4668                     Prefix      pfx,
   4669                     Int         size,
   4670                     Long        delta0 )
   4671 {
   4672    Int    alen;
   4673    HChar  dis_buf[50];
   4674    UChar  rm = getUChar(delta0);
   4675    IRType ty = szToITy(size);
   4676    IRTemp te = newTemp(ty);
   4677    IRTemp tg = newTemp(ty);
   4678    IRTemp resLo = newTemp(ty);
   4679 
   4680    assign( tg, getIRegG(size, pfx, rm) );
   4681    if (epartIsReg(rm)) {
   4682       assign( te, getIRegE(size, pfx, rm) );
   4683    } else {
   4684       IRTemp addr = disAMode( &alen, vbi, pfx, delta0, dis_buf, 0 );
   4685       assign( te, loadLE(ty,mkexpr(addr)) );
   4686    }
   4687 
   4688    setFlags_MUL ( ty, te, tg, AMD64G_CC_OP_SMULB );
   4689 
   4690    assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tg) ) );
   4691 
   4692    putIRegG(size, pfx, rm, mkexpr(resLo) );
   4693 
   4694    if (epartIsReg(rm)) {
   4695       DIP("imul%c %s, %s\n", nameISize(size),
   4696                              nameIRegE(size,pfx,rm),
   4697                              nameIRegG(size,pfx,rm));
   4698       return 1+delta0;
   4699    } else {
   4700       DIP("imul%c %s, %s\n", nameISize(size),
   4701                              dis_buf,
   4702                              nameIRegG(size,pfx,rm));
   4703       return alen+delta0;
   4704    }
   4705 }
   4706 
   4707 
   4708 /* IMUL I * E -> G.  Supplied rip points to the modR/M byte. */
   4709 static
   4710 ULong dis_imul_I_E_G ( VexAbiInfo* vbi,
   4711                        Prefix      pfx,
   4712                        Int         size,
   4713                        Long        delta,
   4714                        Int         litsize )
   4715 {
   4716    Long   d64;
   4717    Int    alen;
   4718    HChar  dis_buf[50];
   4719    UChar  rm = getUChar(delta);
   4720    IRType ty = szToITy(size);
   4721    IRTemp te = newTemp(ty);
   4722    IRTemp tl = newTemp(ty);
   4723    IRTemp resLo = newTemp(ty);
   4724 
   4725    vassert(/*size == 1 ||*/ size == 2 || size == 4 || size == 8);
   4726 
   4727    if (epartIsReg(rm)) {
   4728       assign(te, getIRegE(size, pfx, rm));
   4729       delta++;
   4730    } else {
   4731       IRTemp addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   4732                                      imin(4,litsize) );
   4733       assign(te, loadLE(ty, mkexpr(addr)));
   4734       delta += alen;
   4735    }
   4736    d64 = getSDisp(imin(4,litsize),delta);
   4737    delta += imin(4,litsize);
   4738 
   4739    d64 &= mkSizeMask(size);
   4740    assign(tl, mkU(ty,d64));
   4741 
   4742    assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tl) ));
   4743 
   4744    setFlags_MUL ( ty, te, tl, AMD64G_CC_OP_SMULB );
   4745 
   4746    putIRegG(size, pfx, rm, mkexpr(resLo));
   4747 
   4748    DIP("imul%c $%lld, %s, %s\n",
   4749        nameISize(size), d64,
   4750        ( epartIsReg(rm) ? nameIRegE(size,pfx,rm) : dis_buf ),
   4751        nameIRegG(size,pfx,rm) );
   4752    return delta;
   4753 }
   4754 
   4755 
   4756 /* Generate an IR sequence to do a popcount operation on the supplied
   4757    IRTemp, and return a new IRTemp holding the result.  'ty' may be
   4758    Ity_I16, Ity_I32 or Ity_I64 only. */
   4759 static IRTemp gen_POPCOUNT ( IRType ty, IRTemp src )
   4760 {
   4761    Int i;
   4762    if (ty == Ity_I16) {
   4763       IRTemp old = IRTemp_INVALID;
   4764       IRTemp nyu = IRTemp_INVALID;
   4765       IRTemp mask[4], shift[4];
   4766       for (i = 0; i < 4; i++) {
   4767          mask[i]  = newTemp(ty);
   4768          shift[i] = 1 << i;
   4769       }
   4770       assign(mask[0], mkU16(0x5555));
   4771       assign(mask[1], mkU16(0x3333));
   4772       assign(mask[2], mkU16(0x0F0F));
   4773       assign(mask[3], mkU16(0x00FF));
   4774       old = src;
   4775       for (i = 0; i < 4; i++) {
   4776          nyu = newTemp(ty);
   4777          assign(nyu,
   4778                 binop(Iop_Add16,
   4779                       binop(Iop_And16,
   4780                             mkexpr(old),
   4781                             mkexpr(mask[i])),
   4782                       binop(Iop_And16,
   4783                             binop(Iop_Shr16, mkexpr(old), mkU8(shift[i])),
   4784                             mkexpr(mask[i]))));
   4785          old = nyu;
   4786       }
   4787       return nyu;
   4788    }
   4789    if (ty == Ity_I32) {
   4790       IRTemp old = IRTemp_INVALID;
   4791       IRTemp nyu = IRTemp_INVALID;
   4792       IRTemp mask[5], shift[5];
   4793       for (i = 0; i < 5; i++) {
   4794          mask[i]  = newTemp(ty);
   4795          shift[i] = 1 << i;
   4796       }
   4797       assign(mask[0], mkU32(0x55555555));
   4798       assign(mask[1], mkU32(0x33333333));
   4799       assign(mask[2], mkU32(0x0F0F0F0F));
   4800       assign(mask[3], mkU32(0x00FF00FF));
   4801       assign(mask[4], mkU32(0x0000FFFF));
   4802       old = src;
   4803       for (i = 0; i < 5; i++) {
   4804          nyu = newTemp(ty);
   4805          assign(nyu,
   4806                 binop(Iop_Add32,
   4807                       binop(Iop_And32,
   4808                             mkexpr(old),
   4809                             mkexpr(mask[i])),
   4810                       binop(Iop_And32,
   4811                             binop(Iop_Shr32, mkexpr(old), mkU8(shift[i])),
   4812                             mkexpr(mask[i]))));
   4813          old = nyu;
   4814       }
   4815       return nyu;
   4816    }
   4817    if (ty == Ity_I64) {
   4818       IRTemp old = IRTemp_INVALID;
   4819       IRTemp nyu = IRTemp_INVALID;
   4820       IRTemp mask[6], shift[6];
   4821       for (i = 0; i < 6; i++) {
   4822          mask[i]  = newTemp(ty);
   4823          shift[i] = 1 << i;
   4824       }
   4825       assign(mask[0], mkU64(0x5555555555555555ULL));
   4826       assign(mask[1], mkU64(0x3333333333333333ULL));
   4827       assign(mask[2], mkU64(0x0F0F0F0F0F0F0F0FULL));
   4828       assign(mask[3], mkU64(0x00FF00FF00FF00FFULL));
   4829       assign(mask[4], mkU64(0x0000FFFF0000FFFFULL));
   4830       assign(mask[5], mkU64(0x00000000FFFFFFFFULL));
   4831       old = src;
   4832       for (i = 0; i < 6; i++) {
   4833          nyu = newTemp(ty);
   4834          assign(nyu,
   4835                 binop(Iop_Add64,
   4836                       binop(Iop_And64,
   4837                             mkexpr(old),
   4838                             mkexpr(mask[i])),
   4839                       binop(Iop_And64,
   4840                             binop(Iop_Shr64, mkexpr(old), mkU8(shift[i])),
   4841                             mkexpr(mask[i]))));
   4842          old = nyu;
   4843       }
   4844       return nyu;
   4845    }
   4846    /*NOTREACHED*/
   4847    vassert(0);
   4848 }
   4849 
   4850 
   4851 /* Generate an IR sequence to do a count-leading-zeroes operation on
   4852    the supplied IRTemp, and return a new IRTemp holding the result.
   4853    'ty' may be Ity_I16, Ity_I32 or Ity_I64 only.  In the case where
   4854    the argument is zero, return the number of bits in the word (the
   4855    natural semantics). */
   4856 static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
   4857 {
   4858    vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16);
   4859 
   4860    IRTemp src64 = newTemp(Ity_I64);
   4861    assign(src64, widenUto64( mkexpr(src) ));
   4862 
   4863    IRTemp src64x = newTemp(Ity_I64);
   4864    assign(src64x,
   4865           binop(Iop_Shl64, mkexpr(src64),
   4866                            mkU8(64 - 8 * sizeofIRType(ty))));
   4867 
   4868    // Clz64 has undefined semantics when its input is zero, so
   4869    // special-case around that.
   4870    IRTemp res64 = newTemp(Ity_I64);
   4871    assign(res64,
   4872           IRExpr_ITE(
   4873              binop(Iop_CmpEQ64, mkexpr(src64x), mkU64(0)),
   4874              mkU64(8 * sizeofIRType(ty)),
   4875              unop(Iop_Clz64, mkexpr(src64x))
   4876    ));
   4877 
   4878    IRTemp res = newTemp(ty);
   4879    assign(res, narrowTo(ty, mkexpr(res64)));
   4880    return res;
   4881 }
   4882 
   4883 
   4884 /* Generate an IR sequence to do a count-trailing-zeroes operation on
   4885    the supplied IRTemp, and return a new IRTemp holding the result.
   4886    'ty' may be Ity_I16, Ity_I32 or Ity_I64 only.  In the case where
   4887    the argument is zero, return the number of bits in the word (the
   4888    natural semantics). */
   4889 static IRTemp gen_TZCNT ( IRType ty, IRTemp src )
   4890 {
   4891    vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16);
   4892 
   4893    IRTemp src64 = newTemp(Ity_I64);
   4894    assign(src64, widenUto64( mkexpr(src) ));
   4895 
   4896    // Ctz64 has undefined semantics when its input is zero, so
   4897    // special-case around that.
   4898    IRTemp res64 = newTemp(Ity_I64);
   4899    assign(res64,
   4900           IRExpr_ITE(
   4901              binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0)),
   4902              mkU64(8 * sizeofIRType(ty)),
   4903              unop(Iop_Ctz64, mkexpr(src64))
   4904    ));
   4905 
   4906    IRTemp res = newTemp(ty);
   4907    assign(res, narrowTo(ty, mkexpr(res64)));
   4908    return res;
   4909 }
   4910 
   4911 
   4912 /*------------------------------------------------------------*/
   4913 /*---                                                      ---*/
   4914 /*--- x87 FLOATING POINT INSTRUCTIONS                      ---*/
   4915 /*---                                                      ---*/
   4916 /*------------------------------------------------------------*/
   4917 
   4918 /* --- Helper functions for dealing with the register stack. --- */
   4919 
   4920 /* --- Set the emulation-warning pseudo-register. --- */
   4921 
/* Write the I32 expression 'e' to the guest's emulation-note
   pseudo-register (OFFB_EMNOTE). */
static void put_emwarn ( IRExpr* e /* :: Ity_I32 */ )
{
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   stmt( IRStmt_Put( OFFB_EMNOTE, e ) );
}
   4927 
   4928 /* --- Produce an IRExpr* denoting a 64-bit QNaN. --- */
   4929 
/* Return an F64 constant holding the canonical 64-bit QNaN. */
static IRExpr* mkQNaN64 ( void )
{
  /* QNaN is sign=0, exponent=2047 (all ones), top mantissa bit=1,
     remaining 51 mantissa bits=0
     == 0b 0 11111111111 1 0(51times)
     == 0x7FF8 0000 0000 0000
   */
   return IRExpr_Const(IRConst_F64i(0x7FF8000000000000ULL));
}
   4938 
   4939 /* --------- Get/put the top-of-stack pointer :: Ity_I32 --------- */
   4940 
/* Read the x87 top-of-stack pointer (FTOP) as an I32 expression. */
static IRExpr* get_ftop ( void )
{
   return IRExpr_Get( OFFB_FTOP, Ity_I32 );
}
   4945 
/* Write the I32 expression 'e' to the x87 top-of-stack pointer. */
static void put_ftop ( IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   stmt( IRStmt_Put( OFFB_FTOP, e ) );
}
   4951 
   4952 /* --------- Get/put the C3210 bits. --------- */
   4953 
/* Read the FPU C3..C0 condition bits (guest FC3210) as an I64. */
static IRExpr*  /* :: Ity_I64 */ get_C3210 ( void )
{
   return IRExpr_Get( OFFB_FC3210, Ity_I64 );
}
   4958 
/* Write the I64 expression 'e' to the FPU C3..C0 condition bits. */
static void put_C3210 ( IRExpr* e  /* :: Ity_I64 */ )
{
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
   stmt( IRStmt_Put( OFFB_FC3210, e ) );
}
   4964 
   4965 /* --------- Get/put the FPU rounding mode. --------- */
/* Read the guest FPROUND field (stored as I64) narrowed to I32. */
static IRExpr* /* :: Ity_I32 */ get_fpround ( void )
{
   return unop(Iop_64to32, IRExpr_Get( OFFB_FPROUND, Ity_I64 ));
}
   4970 
/* Write the I32 expression 'e', zero-widened to I64, to FPROUND. */
static void put_fpround ( IRExpr* /* :: Ity_I32 */ e )
{
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   stmt( IRStmt_Put( OFFB_FPROUND, unop(Iop_32Uto64,e) ) );
}
   4976 
   4977 
   4978 /* --------- Synthesise a 2-bit FPU rounding mode. --------- */
   4979 /* Produces a value in 0 .. 3, which is encoded as per the type
   4980    IRRoundingMode.  Since the guest_FPROUND value is also encoded as
   4981    per IRRoundingMode, we merely need to get it and mask it for
   4982    safety.
   4983 */
/* FPROUND is already encoded per IRRoundingMode, so masking to two
   bits is all that is needed to produce a value in 0 .. 3. */
static IRExpr* /* :: Ity_I32 */ get_roundingmode ( void )
{
   return binop( Iop_And32, get_fpround(), mkU32(3) );
}
   4988 
/* Deliberately ignores the guest rounding mode and always yields
   round-to-nearest; used where honouring it has not been
   implemented (see XXXROUNDINGFIXME call sites). */
static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
{
   return mkU32(Irrm_NEAREST);
}
   4993 
   4994 
   4995 /* --------- Get/set FP register tag bytes. --------- */
   4996 
   4997 /* Given i, and some expression e, generate 'ST_TAG(i) = e'. */
   4998 
   4999 static void put_ST_TAG ( Int i, IRExpr* value )
   5000 {
   5001    IRRegArray* descr;
   5002    vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_I8);
   5003    descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   5004    stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
   5005 }
   5006 
   5007 /* Given i, generate an expression yielding 'ST_TAG(i)'.  This will be
   5008    zero to indicate "Empty" and nonzero to indicate "NonEmpty".  */
   5009 
/* Read tag slot i (relative to the current FTOP) as an I8. */
static IRExpr* get_ST_TAG ( Int i )
{
   IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   return IRExpr_GetI( descr, get_ftop(), i );
}
   5015 
   5016 
   5017 /* --------- Get/set FP registers. --------- */
   5018 
   5019 /* Given i, and some expression e, emit 'ST(i) = e' and set the
   5020    register's tag to indicate the register is full.  The previous
   5021    state of the register is not checked. */
   5022 
   5023 static void put_ST_UNCHECKED ( Int i, IRExpr* value )
   5024 {
   5025    IRRegArray* descr;
   5026    vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_F64);
   5027    descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   5028    stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
   5029    /* Mark the register as in-use. */
   5030    put_ST_TAG(i, mkU8(1));
   5031 }
   5032 
   5033 /* Given i, and some expression e, emit
   5034       ST(i) = is_full(i) ? NaN : e
   5035    and set the tag accordingly.
   5036 */
   5037 
   5038 static void put_ST ( Int i, IRExpr* value )
   5039 {
   5040    put_ST_UNCHECKED(
   5041       i,
   5042       IRExpr_ITE( binop(Iop_CmpNE8, get_ST_TAG(i), mkU8(0)),
   5043                   /* non-0 means full */
   5044                   mkQNaN64(),
   5045                   /* 0 means empty */
   5046                   value
   5047       )
   5048    );
   5049 }
   5050 
   5051 
   5052 /* Given i, generate an expression yielding 'ST(i)'. */
   5053 
/* Read ST(i) as an F64, without consulting its tag. */
static IRExpr* get_ST_UNCHECKED ( Int i )
{
   IRRegArray* descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   return IRExpr_GetI( descr, get_ftop(), i );
}
   5059 
   5060 
   5061 /* Given i, generate an expression yielding
   5062   is_full(i) ? ST(i) : NaN
   5063 */
   5064 
   5065 static IRExpr* get_ST ( Int i )
   5066 {
   5067    return
   5068       IRExpr_ITE( binop(Iop_CmpNE8, get_ST_TAG(i), mkU8(0)),
   5069                   /* non-0 means full */
   5070                   get_ST_UNCHECKED(i),
   5071                   /* 0 means empty */
   5072                   mkQNaN64());
   5073 }
   5074 
   5075 
   5076 /* Given i, and some expression e, and a condition cond, generate IR
   5077    which has the same effect as put_ST(i,e) when cond is true and has
   5078    no effect when cond is false.  Given the lack of proper
   5079    if-then-else in the IR, this is pretty tricky.
   5080 */
   5081 
   5082 static void maybe_put_ST ( IRTemp cond, Int i, IRExpr* value )
   5083 {
   5084    // new_tag = if cond then FULL else old_tag
   5085    // new_val = if cond then (if old_tag==FULL then NaN else val)
   5086    //                   else old_val
   5087 
   5088    IRTemp old_tag = newTemp(Ity_I8);
   5089    assign(old_tag, get_ST_TAG(i));
   5090    IRTemp new_tag = newTemp(Ity_I8);
   5091    assign(new_tag,
   5092           IRExpr_ITE(mkexpr(cond), mkU8(1)/*FULL*/, mkexpr(old_tag)));
   5093 
   5094    IRTemp old_val = newTemp(Ity_F64);
   5095    assign(old_val, get_ST_UNCHECKED(i));
   5096    IRTemp new_val = newTemp(Ity_F64);
   5097    assign(new_val,
   5098           IRExpr_ITE(mkexpr(cond),
   5099                      IRExpr_ITE(binop(Iop_CmpNE8, mkexpr(old_tag), mkU8(0)),
   5100                                 /* non-0 means full */
   5101                                 mkQNaN64(),
   5102                                 /* 0 means empty */
   5103                                 value),
   5104                      mkexpr(old_val)));
   5105 
   5106    put_ST_UNCHECKED(i, mkexpr(new_val));
   5107    // put_ST_UNCHECKED incorrectly sets tag(i) to always be FULL.  So
   5108    // now set it to new_tag instead.
   5109    put_ST_TAG(i, mkexpr(new_tag));
   5110 }
   5111 
   5112 /* Adjust FTOP downwards by one register. */
   5113 
/* Push: decrement FTOP by one (the x87 stack grows downwards). */
static void fp_push ( void )
{
   put_ftop( binop(Iop_Sub32, get_ftop(), mkU32(1)) );
}
   5118 
   5119 /* Adjust FTOP downwards by one register when COND is 1:I1.  Else
   5120    don't change it. */
   5121 
/* Conditional push: subtract cond (an I1, widened to 0 or 1) from
   FTOP, so FTOP only moves when cond is 1. */
static void maybe_fp_push ( IRTemp cond )
{
   put_ftop( binop(Iop_Sub32, get_ftop(), unop(Iop_1Uto32,mkexpr(cond))) );
}
   5126 
   5127 /* Adjust FTOP upwards by one register, and mark the vacated register
   5128    as empty.  */
   5129 
/* Pop: mark the current top register empty (tag slot 0 relative to
   the old FTOP), then increment FTOP. */
static void fp_pop ( void )
{
   put_ST_TAG(0, mkU8(0));
   put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
}
   5135 
   5136 /* Set the C2 bit of the FPU status register to e[0].  Assumes that
   5137    e[31:1] == 0.
   5138 */
   5139 static void set_C2 ( IRExpr* e )
   5140 {
   5141    IRExpr* cleared = binop(Iop_And64, get_C3210(), mkU64(~AMD64G_FC_MASK_C2));
   5142    put_C3210( binop(Iop_Or64,
   5143                     cleared,
   5144                     binop(Iop_Shl64, e, mkU8(AMD64G_FC_SHIFT_C2))) );
   5145 }
   5146 
   5147 /* Generate code to check that abs(d64) < 2^63 and is finite.  This is
   5148    used to do the range checks for FSIN, FCOS, FSINCOS and FPTAN.  The
   5149    test is simple, but the derivation of it is not so simple.
   5150 
   5151    The exponent field for an IEEE754 double is 11 bits.  That means it
   5152    can take values 0 through 0x7FF.  If the exponent has value 0x7FF,
   5153    the number is either a NaN or an Infinity and so is not finite.
   5154    Furthermore, a finite value of exactly 2^63 is the smallest value
   5155    that has exponent value 0x43E.  Hence, what we need to do is
   5156    extract the exponent, ignoring the sign bit and mantissa, and check
   5157    it is < 0x43E, or <= 0x43D.
   5158 
   5159    To make this easily applicable to 32- and 64-bit targets, a
   5160    roundabout approach is used.  First the number is converted to I64,
   5161    then the top 32 bits are taken.  Shifting them right by 20 bits
   5162    places the sign bit and exponent in the bottom 12 bits.  Anding
   5163    with 0x7FF gets rid of the sign bit, leaving just the exponent
   5164    available for comparison.
   5165 */
static IRTemp math_IS_TRIG_ARG_FINITE_AND_IN_RANGE ( IRTemp d64 )
{
   /* Reinterpret the F64 bit pattern as an I64 so the exponent field
      can be examined directly. */
   IRTemp i64 = newTemp(Ity_I64);
   assign(i64, unop(Iop_ReinterpF64asI64, mkexpr(d64)) );
   /* Take the top 32 bits, shift right 20 so sign+exponent occupy the
      bottom 12 bits, then mask off the sign, leaving the 11-bit
      exponent (see derivation in the comment above). */
   IRTemp exponent = newTemp(Ity_I32);
   assign(exponent,
          binop(Iop_And32,
                binop(Iop_Shr32, unop(Iop_64HIto32, mkexpr(i64)), mkU8(20)),
                mkU32(0x7FF)));
   /* exponent <= 0x43D  <=>  |d64| < 2^63 and finite */
   IRTemp in_range_and_finite = newTemp(Ity_I1);
   assign(in_range_and_finite,
          binop(Iop_CmpLE32U, mkexpr(exponent), mkU32(0x43D)));
   return in_range_and_finite;
}
   5180 
   5181 /* Invent a plausible-looking FPU status word value:
   5182       ((ftop & 7) << 11) | (c3210 & 0x4700)
   5183  */
   5184 static IRExpr* get_FPU_sw ( void )
   5185 {
   5186    return
   5187       unop(Iop_32to16,
   5188            binop(Iop_Or32,
   5189                  binop(Iop_Shl32,
   5190                        binop(Iop_And32, get_ftop(), mkU32(7)),
   5191                              mkU8(11)),
   5192                        binop(Iop_And32, unop(Iop_64to32, get_C3210()),
   5193                                         mkU32(0x4700))
   5194       ));
   5195 }
   5196 
   5197 
   5198 /* ------------------------------------------------------- */
   5199 /* Given all that stack-mangling junk, we can now go ahead
   5200    and describe FP instructions.
   5201 */
   5202 
   5203 /* ST(0) = ST(0) `op` mem64/32(addr)
   5204    Need to check ST(0)'s tag on read, but not on write.
   5205 */
   5206 static
   5207 void fp_do_op_mem_ST_0 ( IRTemp addr, const HChar* op_txt, HChar* dis_buf,
   5208                          IROp op, Bool dbl )
   5209 {
   5210    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   5211    if (dbl) {
   5212       put_ST_UNCHECKED(0,
   5213          triop( op,
   5214                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5215                 get_ST(0),
   5216                 loadLE(Ity_F64,mkexpr(addr))
   5217          ));
   5218    } else {
   5219       put_ST_UNCHECKED(0,
   5220          triop( op,
   5221                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5222                 get_ST(0),
   5223                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr)))
   5224          ));
   5225    }
   5226 }
   5227 
   5228 
   5229 /* ST(0) = mem64/32(addr) `op` ST(0)
   5230    Need to check ST(0)'s tag on read, but not on write.
   5231 */
   5232 static
   5233 void fp_do_oprev_mem_ST_0 ( IRTemp addr, const HChar* op_txt, HChar* dis_buf,
   5234                             IROp op, Bool dbl )
   5235 {
   5236    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   5237    if (dbl) {
   5238       put_ST_UNCHECKED(0,
   5239          triop( op,
   5240                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5241                 loadLE(Ity_F64,mkexpr(addr)),
   5242                 get_ST(0)
   5243          ));
   5244    } else {
   5245       put_ST_UNCHECKED(0,
   5246          triop( op,
   5247                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5248                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr))),
   5249                 get_ST(0)
   5250          ));
   5251    }
   5252 }
   5253 
   5254 
   5255 /* ST(dst) = ST(dst) `op` ST(src).
   5256    Check dst and src tags when reading but not on write.
   5257 */
static
void fp_do_op_ST_ST ( const HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
                      Bool pop_after )
{
   DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"", st_src, st_dst );
   /* Reads via get_ST consult the tags (an empty register reads as a
      QNaN); the write deliberately skips the tag check. */
   put_ST_UNCHECKED(
      st_dst,
      triop( op,
             get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
             get_ST(st_dst),
             get_ST(st_src) )
   );
   if (pop_after)
      fp_pop();
}
   5273 
   5274 /* ST(dst) = ST(src) `op` ST(dst).
   5275    Check dst and src tags when reading but not on write.
   5276 */
static
void fp_do_oprev_ST_ST ( const HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
                         Bool pop_after )
{
   DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"", st_src, st_dst );
   /* Same as fp_do_op_ST_ST, but with the operands swapped: the
      source register supplies the first argument. */
   put_ST_UNCHECKED(
      st_dst,
      triop( op,
             get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
             get_ST(st_src),
             get_ST(st_dst) )
   );
   if (pop_after)
      fp_pop();
}
   5292 
   5293 /* %rflags(Z,P,C) = UCOMI( st(0), st(i) ) */
/* %rflags(Z,P,C) = UCOMI( st(0), st(i) ) */
static void fp_do_ucomi_ST0_STi ( UInt i, Bool pop_after )
{
   DIP("fucomi%s %%st(0),%%st(%u)\n", pop_after ? "p" : "", i);
   /* This is a bit of a hack (and isn't really right).  It sets
      Z,P,C,O correctly, but forces A and S to zero, whereas the Intel
      documentation implies A and S are unchanged.
   */
   /* It's also fishy in that it is used both for COMIP and
      UCOMIP, and they aren't the same (although similar). */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop( Iop_And64,
                   unop( Iop_32Uto64,
                         binop(Iop_CmpF64, get_ST(0), get_ST(i))),
                   /* 0x45 keeps bits 0, 2 and 6 (the C, P and Z flag
                      positions) of the CmpF64 result. */
                   mkU64(0x45)
        )));
   if (pop_after)
      fp_pop();
}
   5315 
   5316 
   5317 /* returns
   5318    32to16( if e32 <s -32768 || e32 >s 32767 then -32768 else e32 )
   5319 */
static IRExpr* x87ishly_qnarrow_32_to_16 ( IRExpr* e32 )
{
   IRTemp t32 = newTemp(Ity_I32);
   assign( t32, e32 );
   /* Range test via biasing: t32 is in [-32768, 32767] exactly when
      the unsigned value (t32 + 32768) is below 65536.  The addition
      is done at 64 bits (via 32Uto64) so it cannot wrap. */
   return
      IRExpr_ITE(
         binop(Iop_CmpLT64U,
               unop(Iop_32Uto64,
                    binop(Iop_Add32, mkexpr(t32), mkU32(32768))),
               mkU64(65536)),
         unop(Iop_32to16, mkexpr(t32)),
         /* out of range: saturate to -32768, x87-style */
         mkU16( 0x8000 ) );
}
   5333 
   5334 
   5335 static
   5336 ULong dis_FPU ( /*OUT*/Bool* decode_ok,
   5337                 VexAbiInfo* vbi, Prefix pfx, Long delta )
   5338 {
   5339    Int    len;
   5340    UInt   r_src, r_dst;
   5341    HChar  dis_buf[50];
   5342    IRTemp t1, t2;
   5343 
   5344    /* On entry, delta points at the second byte of the insn (the modrm
   5345       byte).*/
   5346    UChar first_opcode = getUChar(delta-1);
   5347    UChar modrm        = getUChar(delta+0);
   5348 
   5349    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD8 opcodes +-+-+-+-+-+-+-+ */
   5350 
   5351    if (first_opcode == 0xD8) {
   5352       if (modrm < 0xC0) {
   5353 
   5354          /* bits 5,4,3 are an opcode extension, and the modRM also
   5355            specifies an address. */
   5356          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   5357          delta += len;
   5358 
   5359          switch (gregLO3ofRM(modrm)) {
   5360 
   5361             case 0: /* FADD single-real */
   5362                fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, False );
   5363                break;
   5364 
   5365             case 1: /* FMUL single-real */
   5366                fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, False );
   5367                break;
   5368 
   5369             case 2: /* FCOM single-real */
   5370                DIP("fcoms %s\n", dis_buf);
   5371                /* This forces C1 to zero, which isn't right. */
   5372                /* The AMD documentation suggests that forcing C1 to
   5373                   zero is correct (Eliot Moss) */
   5374                put_C3210(
   5375                    unop( Iop_32Uto64,
   5376                        binop( Iop_And32,
   5377                               binop(Iop_Shl32,
   5378                                     binop(Iop_CmpF64,
   5379                                           get_ST(0),
   5380                                           unop(Iop_F32toF64,
   5381                                                loadLE(Ity_F32,mkexpr(addr)))),
   5382                                     mkU8(8)),
   5383                               mkU32(0x4500)
   5384                    )));
   5385                break;
   5386 
   5387             case 3: /* FCOMP single-real */
   5388                /* The AMD documentation suggests that forcing C1 to
   5389                   zero is correct (Eliot Moss) */
   5390                DIP("fcomps %s\n", dis_buf);
   5391                /* This forces C1 to zero, which isn't right. */
   5392                put_C3210(
   5393                    unop( Iop_32Uto64,
   5394                        binop( Iop_And32,
   5395                               binop(Iop_Shl32,
   5396                                     binop(Iop_CmpF64,
   5397                                           get_ST(0),
   5398                                           unop(Iop_F32toF64,
   5399                                                loadLE(Ity_F32,mkexpr(addr)))),
   5400                                     mkU8(8)),
   5401                               mkU32(0x4500)
   5402                    )));
   5403                fp_pop();
   5404                break;
   5405 
   5406             case 4: /* FSUB single-real */
   5407                fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, False );
   5408                break;
   5409 
   5410             case 5: /* FSUBR single-real */
   5411                fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, False );
   5412                break;
   5413 
   5414             case 6: /* FDIV single-real */
   5415                fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, False );
   5416                break;
   5417 
   5418             case 7: /* FDIVR single-real */
   5419                fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, False );
   5420                break;
   5421 
   5422             default:
   5423                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   5424                vex_printf("first_opcode == 0xD8\n");
   5425                goto decode_fail;
   5426          }
   5427       } else {
   5428          delta++;
   5429          switch (modrm) {
   5430 
   5431             case 0xC0 ... 0xC7: /* FADD %st(?),%st(0) */
   5432                fp_do_op_ST_ST ( "add", Iop_AddF64, modrm - 0xC0, 0, False );
   5433                break;
   5434 
   5435             case 0xC8 ... 0xCF: /* FMUL %st(?),%st(0) */
   5436                fp_do_op_ST_ST ( "mul", Iop_MulF64, modrm - 0xC8, 0, False );
   5437                break;
   5438 
   5439             /* Dunno if this is right */
   5440             case 0xD0 ... 0xD7: /* FCOM %st(?),%st(0) */
   5441                r_dst = (UInt)modrm - 0xD0;
   5442                DIP("fcom %%st(0),%%st(%d)\n", r_dst);
   5443                /* This forces C1 to zero, which isn't right. */
   5444                put_C3210(
   5445                    unop(Iop_32Uto64,
   5446                    binop( Iop_And32,
   5447                           binop(Iop_Shl32,
   5448                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   5449                                 mkU8(8)),
   5450                           mkU32(0x4500)
   5451                    )));
   5452                break;
   5453 
   5454             /* Dunno if this is right */
   5455             case 0xD8 ... 0xDF: /* FCOMP %st(?),%st(0) */
   5456                r_dst = (UInt)modrm - 0xD8;
   5457                DIP("fcomp %%st(0),%%st(%d)\n", r_dst);
   5458                /* This forces C1 to zero, which isn't right. */
   5459                put_C3210(
   5460                    unop(Iop_32Uto64,
   5461                    binop( Iop_And32,
   5462                           binop(Iop_Shl32,
   5463                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   5464                                 mkU8(8)),
   5465                           mkU32(0x4500)
   5466                    )));
   5467                fp_pop();
   5468                break;
   5469 
   5470             case 0xE0 ... 0xE7: /* FSUB %st(?),%st(0) */
   5471                fp_do_op_ST_ST ( "sub", Iop_SubF64, modrm - 0xE0, 0, False );
   5472                break;
   5473 
   5474             case 0xE8 ... 0xEF: /* FSUBR %st(?),%st(0) */
   5475                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, modrm - 0xE8, 0, False );
   5476                break;
   5477 
   5478             case 0xF0 ... 0xF7: /* FDIV %st(?),%st(0) */
   5479                fp_do_op_ST_ST ( "div", Iop_DivF64, modrm - 0xF0, 0, False );
   5480                break;
   5481 
   5482             case 0xF8 ... 0xFF: /* FDIVR %st(?),%st(0) */
   5483                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, modrm - 0xF8, 0, False );
   5484                break;
   5485 
   5486             default:
   5487                goto decode_fail;
   5488          }
   5489       }
   5490    }
   5491 
   5492    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD9 opcodes +-+-+-+-+-+-+-+ */
   5493    else
   5494    if (first_opcode == 0xD9) {
   5495       if (modrm < 0xC0) {
   5496 
   5497          /* bits 5,4,3 are an opcode extension, and the modRM also
   5498             specifies an address. */
   5499          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   5500          delta += len;
   5501 
   5502          switch (gregLO3ofRM(modrm)) {
   5503 
   5504             case 0: /* FLD single-real */
   5505                DIP("flds %s\n", dis_buf);
   5506                fp_push();
   5507                put_ST(0, unop(Iop_F32toF64,
   5508                               loadLE(Ity_F32, mkexpr(addr))));
   5509                break;
   5510 
   5511             case 2: /* FST single-real */
   5512                DIP("fsts %s\n", dis_buf);
   5513                storeLE(mkexpr(addr),
   5514                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
   5515                break;
   5516 
   5517             case 3: /* FSTP single-real */
   5518                DIP("fstps %s\n", dis_buf);
   5519                storeLE(mkexpr(addr),
   5520                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
   5521                fp_pop();
   5522                break;
   5523 
   5524             case 4: { /* FLDENV m28 */
   5525                /* Uses dirty helper:
   5526                      VexEmNote amd64g_do_FLDENV ( VexGuestX86State*, HWord ) */
   5527                IRTemp    ew = newTemp(Ity_I32);
   5528                IRTemp   w64 = newTemp(Ity_I64);
   5529                IRDirty*   d = unsafeIRDirty_0_N (
   5530                                  0/*regparms*/,
   5531                                  "amd64g_dirtyhelper_FLDENV",
   5532                                  &amd64g_dirtyhelper_FLDENV,
   5533                                  mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   5534                               );
   5535                d->tmp       = w64;
   5536                /* declare we're reading memory */
   5537                d->mFx   = Ifx_Read;
   5538                d->mAddr = mkexpr(addr);
   5539                d->mSize = 28;
   5540 
   5541                /* declare we're writing guest state */
   5542                d->nFxState = 4;
   5543                vex_bzero(&d->fxState, sizeof(d->fxState));
   5544 
   5545                d->fxState[0].fx     = Ifx_Write;
   5546                d->fxState[0].offset = OFFB_FTOP;
   5547                d->fxState[0].size   = sizeof(UInt);
   5548 
   5549                d->fxState[1].fx     = Ifx_Write;
   5550                d->fxState[1].offset = OFFB_FPTAGS;
   5551                d->fxState[1].size   = 8 * sizeof(UChar);
   5552 
   5553                d->fxState[2].fx     = Ifx_Write;
   5554                d->fxState[2].offset = OFFB_FPROUND;
   5555                d->fxState[2].size   = sizeof(ULong);
   5556 
   5557                d->fxState[3].fx     = Ifx_Write;
   5558                d->fxState[3].offset = OFFB_FC3210;
   5559                d->fxState[3].size   = sizeof(ULong);
   5560 
   5561                stmt( IRStmt_Dirty(d) );
   5562 
   5563                /* ew contains any emulation warning we may need to
   5564                   issue.  If needed, side-exit to the next insn,
   5565                   reporting the warning, so that Valgrind's dispatcher
   5566                   sees the warning. */
   5567                assign(ew, unop(Iop_64to32,mkexpr(w64)) );
   5568                put_emwarn( mkexpr(ew) );
   5569                stmt(
   5570                   IRStmt_Exit(
   5571                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   5572                      Ijk_EmWarn,
   5573                      IRConst_U64( guest_RIP_bbstart+delta ),
   5574                      OFFB_RIP
   5575                   )
   5576                );
   5577 
   5578                DIP("fldenv %s\n", dis_buf);
   5579                break;
   5580             }
   5581 
   5582             case 5: {/* FLDCW */
   5583                /* The only thing we observe in the control word is the
   5584                   rounding mode.  Therefore, pass the 16-bit value
   5585                   (x87 native-format control word) to a clean helper,
   5586                   getting back a 64-bit value, the lower half of which
   5587                   is the FPROUND value to store, and the upper half of
   5588                   which is the emulation-warning token which may be
   5589                   generated.
   5590                */
   5591                /* ULong amd64h_check_fldcw ( ULong ); */
   5592                IRTemp t64 = newTemp(Ity_I64);
   5593                IRTemp ew = newTemp(Ity_I32);
   5594                DIP("fldcw %s\n", dis_buf);
   5595                assign( t64, mkIRExprCCall(
   5596                                Ity_I64, 0/*regparms*/,
   5597                                "amd64g_check_fldcw",
   5598                                &amd64g_check_fldcw,
   5599                                mkIRExprVec_1(
   5600                                   unop( Iop_16Uto64,
   5601                                         loadLE(Ity_I16, mkexpr(addr)))
   5602                                )
   5603                             )
   5604                      );
   5605 
   5606                put_fpround( unop(Iop_64to32, mkexpr(t64)) );
   5607                assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
   5608                put_emwarn( mkexpr(ew) );
   5609                /* Finally, if an emulation warning was reported,
   5610                   side-exit to the next insn, reporting the warning,
   5611                   so that Valgrind's dispatcher sees the warning. */
   5612                stmt(
   5613                   IRStmt_Exit(
   5614                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   5615                      Ijk_EmWarn,
   5616                      IRConst_U64( guest_RIP_bbstart+delta ),
   5617                      OFFB_RIP
   5618                   )
   5619                );
   5620                break;
   5621             }
   5622 
   5623             case 6: { /* FNSTENV m28 */
   5624                /* Uses dirty helper:
   5625                      void amd64g_do_FSTENV ( VexGuestAMD64State*, HWord ) */
   5626                IRDirty* d = unsafeIRDirty_0_N (
   5627                                0/*regparms*/,
   5628                                "amd64g_dirtyhelper_FSTENV",
   5629                                &amd64g_dirtyhelper_FSTENV,
   5630                                mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   5631                             );
   5632                /* declare we're writing memory */
   5633                d->mFx   = Ifx_Write;
   5634                d->mAddr = mkexpr(addr);
   5635                d->mSize = 28;
   5636 
   5637                /* declare we're reading guest state */
   5638                d->nFxState = 4;
   5639                vex_bzero(&d->fxState, sizeof(d->fxState));
   5640 
   5641                d->fxState[0].fx     = Ifx_Read;
   5642                d->fxState[0].offset = OFFB_FTOP;
   5643                d->fxState[0].size   = sizeof(UInt);
   5644 
   5645                d->fxState[1].fx     = Ifx_Read;
   5646                d->fxState[1].offset = OFFB_FPTAGS;
   5647                d->fxState[1].size   = 8 * sizeof(UChar);
   5648 
   5649                d->fxState[2].fx     = Ifx_Read;
   5650                d->fxState[2].offset = OFFB_FPROUND;
   5651                d->fxState[2].size   = sizeof(ULong);
   5652 
   5653                d->fxState[3].fx     = Ifx_Read;
   5654                d->fxState[3].offset = OFFB_FC3210;
   5655                d->fxState[3].size   = sizeof(ULong);
   5656 
   5657                stmt( IRStmt_Dirty(d) );
   5658 
   5659                DIP("fnstenv %s\n", dis_buf);
   5660                break;
   5661             }
   5662 
   5663             case 7: /* FNSTCW */
   5664                /* Fake up a native x87 FPU control word.  The only
   5665                   thing it depends on is FPROUND[1:0], so call a clean
   5666                   helper to cook it up. */
   5667                /* ULong amd64g_create_fpucw ( ULong fpround ) */
   5668                DIP("fnstcw %s\n", dis_buf);
   5669                storeLE(
   5670                   mkexpr(addr),
   5671                   unop( Iop_64to16,
   5672                         mkIRExprCCall(
   5673                            Ity_I64, 0/*regp*/,
   5674                            "amd64g_create_fpucw", &amd64g_create_fpucw,
   5675                            mkIRExprVec_1( unop(Iop_32Uto64, get_fpround()) )
   5676                         )
   5677                   )
   5678                );
   5679                break;
   5680 
   5681             default:
   5682                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   5683                vex_printf("first_opcode == 0xD9\n");
   5684                goto decode_fail;
   5685          }
   5686 
   5687       } else {
   5688          delta++;
   5689          switch (modrm) {
   5690 
   5691             case 0xC0 ... 0xC7: /* FLD %st(?) */
   5692                r_src = (UInt)modrm - 0xC0;
   5693                DIP("fld %%st(%u)\n", r_src);
   5694                t1 = newTemp(Ity_F64);
   5695                assign(t1, get_ST(r_src));
   5696                fp_push();
   5697                put_ST(0, mkexpr(t1));
   5698                break;
   5699 
   5700             case 0xC8 ... 0xCF: /* FXCH %st(?) */
   5701                r_src = (UInt)modrm - 0xC8;
   5702                DIP("fxch %%st(%u)\n", r_src);
   5703                t1 = newTemp(Ity_F64);
   5704                t2 = newTemp(Ity_F64);
   5705                assign(t1, get_ST(0));
   5706                assign(t2, get_ST(r_src));
   5707                put_ST_UNCHECKED(0, mkexpr(t2));
   5708                put_ST_UNCHECKED(r_src, mkexpr(t1));
   5709                break;
   5710 
   5711             case 0xE0: /* FCHS */
   5712                DIP("fchs\n");
   5713                put_ST_UNCHECKED(0, unop(Iop_NegF64, get_ST(0)));
   5714                break;
   5715 
   5716             case 0xE1: /* FABS */
   5717                DIP("fabs\n");
   5718                put_ST_UNCHECKED(0, unop(Iop_AbsF64, get_ST(0)));
   5719                break;
   5720 
   5721             case 0xE5: { /* FXAM */
   5722                /* This is an interesting one.  It examines %st(0),
   5723                   regardless of whether the tag says it's empty or not.
   5724                   Here, just pass both the tag (in our format) and the
   5725                   value (as a double, actually a ULong) to a helper
   5726                   function. */
   5727                IRExpr** args
   5728                   = mkIRExprVec_2( unop(Iop_8Uto64, get_ST_TAG(0)),
   5729                                    unop(Iop_ReinterpF64asI64,
   5730                                         get_ST_UNCHECKED(0)) );
   5731                put_C3210(mkIRExprCCall(
   5732                             Ity_I64,
   5733                             0/*regparm*/,
   5734                             "amd64g_calculate_FXAM", &amd64g_calculate_FXAM,
   5735                             args
   5736                         ));
   5737                DIP("fxam\n");
   5738                break;
   5739             }
   5740 
   5741             case 0xE8: /* FLD1 */
   5742                DIP("fld1\n");
   5743                fp_push();
   5744                /* put_ST(0, IRExpr_Const(IRConst_F64(1.0))); */
   5745                put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff0000000000000ULL)));
   5746                break;
   5747 
   5748             case 0xE9: /* FLDL2T */
   5749                DIP("fldl2t\n");
   5750                fp_push();
   5751                /* put_ST(0, IRExpr_Const(IRConst_F64(3.32192809488736234781))); */
   5752                put_ST(0, IRExpr_Const(IRConst_F64i(0x400a934f0979a371ULL)));
   5753                break;
   5754 
   5755             case 0xEA: /* FLDL2E */
   5756                DIP("fldl2e\n");
   5757                fp_push();
   5758                /* put_ST(0, IRExpr_Const(IRConst_F64(1.44269504088896340739))); */
   5759                put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff71547652b82feULL)));
   5760                break;
   5761 
   5762             case 0xEB: /* FLDPI */
   5763                DIP("fldpi\n");
   5764                fp_push();
   5765                /* put_ST(0, IRExpr_Const(IRConst_F64(3.14159265358979323851))); */
   5766                put_ST(0, IRExpr_Const(IRConst_F64i(0x400921fb54442d18ULL)));
   5767                break;
   5768 
   5769             case 0xEC: /* FLDLG2 */
   5770                DIP("fldlg2\n");
   5771                fp_push();
   5772                /* put_ST(0, IRExpr_Const(IRConst_F64(0.301029995663981143))); */
   5773                put_ST(0, IRExpr_Const(IRConst_F64i(0x3fd34413509f79ffULL)));
   5774                break;
   5775 
   5776             case 0xED: /* FLDLN2 */
   5777                DIP("fldln2\n");
   5778                fp_push();
   5779                /* put_ST(0, IRExpr_Const(IRConst_F64(0.69314718055994530942))); */
   5780                put_ST(0, IRExpr_Const(IRConst_F64i(0x3fe62e42fefa39efULL)));
   5781                break;
   5782 
   5783             case 0xEE: /* FLDZ */
   5784                DIP("fldz\n");
   5785                fp_push();
   5786                /* put_ST(0, IRExpr_Const(IRConst_F64(0.0))); */
   5787                put_ST(0, IRExpr_Const(IRConst_F64i(0x0000000000000000ULL)));
   5788                break;
   5789 
   5790             case 0xF0: /* F2XM1 */
   5791                DIP("f2xm1\n");
   5792                put_ST_UNCHECKED(0,
   5793                   binop(Iop_2xm1F64,
   5794                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5795                         get_ST(0)));
   5796                break;
   5797 
   5798             case 0xF1: /* FYL2X */
   5799                DIP("fyl2x\n");
   5800                put_ST_UNCHECKED(1,
   5801                   triop(Iop_Yl2xF64,
   5802                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5803                         get_ST(1),
   5804                         get_ST(0)));
   5805                fp_pop();
   5806                break;
   5807 
   5808             case 0xF2: { /* FPTAN */
   5809                DIP("fptan\n");
   5810                IRTemp argD = newTemp(Ity_F64);
   5811                assign(argD, get_ST(0));
   5812                IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
   5813                IRTemp resD = newTemp(Ity_F64);
   5814                assign(resD,
   5815                   IRExpr_ITE(
   5816                      mkexpr(argOK),
   5817                      binop(Iop_TanF64,
   5818                            get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5819                            mkexpr(argD)),
   5820                      mkexpr(argD))
   5821                );
   5822                put_ST_UNCHECKED(0, mkexpr(resD));
   5823                /* Conditionally push 1.0 on the stack, if the arg is
   5824                   in range */
   5825                maybe_fp_push(argOK);
   5826                maybe_put_ST(argOK, 0,
   5827                             IRExpr_Const(IRConst_F64(1.0)));
   5828                set_C2( binop(Iop_Xor64,
   5829                              unop(Iop_1Uto64, mkexpr(argOK)),
   5830                              mkU64(1)) );
   5831                break;
   5832             }
   5833 
   5834             case 0xF3: /* FPATAN */
   5835                DIP("fpatan\n");
   5836                put_ST_UNCHECKED(1,
   5837                   triop(Iop_AtanF64,
   5838                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5839                         get_ST(1),
   5840                         get_ST(0)));
   5841                fp_pop();
   5842                break;
   5843 
   5844             case 0xF4: { /* FXTRACT */
   5845                IRTemp argF = newTemp(Ity_F64);
   5846                IRTemp sigF = newTemp(Ity_F64);
   5847                IRTemp expF = newTemp(Ity_F64);
   5848                IRTemp argI = newTemp(Ity_I64);
   5849                IRTemp sigI = newTemp(Ity_I64);
   5850                IRTemp expI = newTemp(Ity_I64);
   5851                DIP("fxtract\n");
   5852                assign( argF, get_ST(0) );
   5853                assign( argI, unop(Iop_ReinterpF64asI64, mkexpr(argF)));
   5854                assign( sigI,
   5855                        mkIRExprCCall(
   5856                           Ity_I64, 0/*regparms*/,
   5857                           "x86amd64g_calculate_FXTRACT",
   5858                           &x86amd64g_calculate_FXTRACT,
   5859                           mkIRExprVec_2( mkexpr(argI),
   5860                                          mkIRExpr_HWord(0)/*sig*/ ))
   5861                );
   5862                assign( expI,
   5863                        mkIRExprCCall(
   5864                           Ity_I64, 0/*regparms*/,
   5865                           "x86amd64g_calculate_FXTRACT",
   5866                           &x86amd64g_calculate_FXTRACT,
   5867                           mkIRExprVec_2( mkexpr(argI),
   5868                                          mkIRExpr_HWord(1)/*exp*/ ))
   5869                );
   5870                assign( sigF, unop(Iop_ReinterpI64asF64, mkexpr(sigI)) );
   5871                assign( expF, unop(Iop_ReinterpI64asF64, mkexpr(expI)) );
   5872                /* exponent */
   5873                put_ST_UNCHECKED(0, mkexpr(expF) );
   5874                fp_push();
   5875                /* significand */
   5876                put_ST(0, mkexpr(sigF) );
   5877                break;
   5878             }
   5879 
   5880             case 0xF5: { /* FPREM1 -- IEEE compliant */
   5881                IRTemp a1 = newTemp(Ity_F64);
   5882                IRTemp a2 = newTemp(Ity_F64);
   5883                DIP("fprem1\n");
   5884                /* Do FPREM1 twice, once to get the remainder, and once
   5885                   to get the C3210 flag values. */
   5886                assign( a1, get_ST(0) );
   5887                assign( a2, get_ST(1) );
   5888                put_ST_UNCHECKED(0,
   5889                   triop(Iop_PRem1F64,
   5890                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5891                         mkexpr(a1),
   5892                         mkexpr(a2)));
   5893                put_C3210(
   5894                   unop(Iop_32Uto64,
   5895                   triop(Iop_PRem1C3210F64,
   5896                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5897                         mkexpr(a1),
   5898                         mkexpr(a2)) ));
   5899                break;
   5900             }
   5901 
   5902             case 0xF7: /* FINCSTP */
   5903                DIP("fincstp\n");
   5904                put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
   5905                break;
   5906 
   5907             case 0xF8: { /* FPREM -- not IEEE compliant */
   5908                IRTemp a1 = newTemp(Ity_F64);
   5909                IRTemp a2 = newTemp(Ity_F64);
   5910                DIP("fprem\n");
   5911                /* Do FPREM twice, once to get the remainder, and once
   5912                   to get the C3210 flag values. */
   5913                assign( a1, get_ST(0) );
   5914                assign( a2, get_ST(1) );
   5915                put_ST_UNCHECKED(0,
   5916                   triop(Iop_PRemF64,
   5917                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5918                         mkexpr(a1),
   5919                         mkexpr(a2)));
   5920                put_C3210(
   5921                   unop(Iop_32Uto64,
   5922                   triop(Iop_PRemC3210F64,
   5923                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5924                         mkexpr(a1),
   5925                         mkexpr(a2)) ));
   5926                break;
   5927             }
   5928 
   5929             case 0xF9: /* FYL2XP1 */
   5930                DIP("fyl2xp1\n");
   5931                put_ST_UNCHECKED(1,
   5932                   triop(Iop_Yl2xp1F64,
   5933                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5934                         get_ST(1),
   5935                         get_ST(0)));
   5936                fp_pop();
   5937                break;
   5938 
   5939             case 0xFA: /* FSQRT */
   5940                DIP("fsqrt\n");
   5941                put_ST_UNCHECKED(0,
   5942                   binop(Iop_SqrtF64,
   5943                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5944                         get_ST(0)));
   5945                break;
   5946 
   5947             case 0xFB: { /* FSINCOS */
   5948                DIP("fsincos\n");
   5949                IRTemp argD = newTemp(Ity_F64);
   5950                assign(argD, get_ST(0));
   5951                IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
   5952                IRTemp resD = newTemp(Ity_F64);
   5953                assign(resD,
   5954                   IRExpr_ITE(
   5955                      mkexpr(argOK),
   5956                      binop(Iop_SinF64,
   5957                            get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5958                            mkexpr(argD)),
   5959                      mkexpr(argD))
   5960                );
   5961                put_ST_UNCHECKED(0, mkexpr(resD));
   5962                /* Conditionally push the cos value on the stack, if
   5963                   the arg is in range */
   5964                maybe_fp_push(argOK);
   5965                maybe_put_ST(argOK, 0,
   5966                   binop(Iop_CosF64,
   5967                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5968                         mkexpr(argD)));
   5969                set_C2( binop(Iop_Xor64,
   5970                              unop(Iop_1Uto64, mkexpr(argOK)),
   5971                              mkU64(1)) );
   5972                break;
   5973             }
   5974 
   5975             case 0xFC: /* FRNDINT */
   5976                DIP("frndint\n");
   5977                put_ST_UNCHECKED(0,
   5978                   binop(Iop_RoundF64toInt, get_roundingmode(), get_ST(0)) );
   5979                break;
   5980 
   5981             case 0xFD: /* FSCALE */
   5982                DIP("fscale\n");
   5983                put_ST_UNCHECKED(0,
   5984                   triop(Iop_ScaleF64,
   5985                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5986                         get_ST(0),
   5987                         get_ST(1)));
   5988                break;
   5989 
   5990             case 0xFE:   /* FSIN */
   5991             case 0xFF: { /* FCOS */
   5992                Bool isSIN = modrm == 0xFE;
   5993                DIP("%s\n", isSIN ? "fsin" : "fcos");
   5994                IRTemp argD = newTemp(Ity_F64);
   5995                assign(argD, get_ST(0));
   5996                IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
   5997                IRTemp resD = newTemp(Ity_F64);
   5998                assign(resD,
   5999                   IRExpr_ITE(
   6000                      mkexpr(argOK),
   6001                      binop(isSIN ? Iop_SinF64 : Iop_CosF64,
   6002                            get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6003                            mkexpr(argD)),
   6004                      mkexpr(argD))
   6005                );
   6006                put_ST_UNCHECKED(0, mkexpr(resD));
   6007                set_C2( binop(Iop_Xor64,
   6008                              unop(Iop_1Uto64, mkexpr(argOK)),
   6009                              mkU64(1)) );
   6010                break;
   6011             }
   6012 
   6013             default:
   6014                goto decode_fail;
   6015          }
   6016       }
   6017    }
   6018 
   6019    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDA opcodes +-+-+-+-+-+-+-+ */
   6020    else
   6021    if (first_opcode == 0xDA) {
   6022 
   6023       if (modrm < 0xC0) {
   6024 
   6025          /* bits 5,4,3 are an opcode extension, and the modRM also
   6026             specifies an address. */
   6027          IROp   fop;
   6028          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6029          delta += len;
   6030          switch (gregLO3ofRM(modrm)) {
   6031 
   6032             case 0: /* FIADD m32int */ /* ST(0) += m32int */
   6033                DIP("fiaddl %s\n", dis_buf);
   6034                fop = Iop_AddF64;
   6035                goto do_fop_m32;
   6036 
   6037             case 1: /* FIMUL m32int */ /* ST(0) *= m32int */
   6038                DIP("fimull %s\n", dis_buf);
   6039                fop = Iop_MulF64;
   6040                goto do_fop_m32;
   6041 
   6042             case 4: /* FISUB m32int */ /* ST(0) -= m32int */
   6043                DIP("fisubl %s\n", dis_buf);
   6044                fop = Iop_SubF64;
   6045                goto do_fop_m32;
   6046 
   6047             case 5: /* FISUBR m32int */ /* ST(0) = m32int - ST(0) */
   6048                DIP("fisubrl %s\n", dis_buf);
   6049                fop = Iop_SubF64;
   6050                goto do_foprev_m32;
   6051 
   6052             case 6: /* FIDIV m32int */ /* ST(0) /= m32int */
   6053                DIP("fisubl %s\n", dis_buf);
   6054                fop = Iop_DivF64;
   6055                goto do_fop_m32;
   6056 
   6057             case 7: /* FIDIVR m32int */ /* ST(0) = m32int / ST(0) */
   6058                DIP("fidivrl %s\n", dis_buf);
   6059                fop = Iop_DivF64;
   6060                goto do_foprev_m32;
   6061 
   6062             do_fop_m32:
   6063                put_ST_UNCHECKED(0,
   6064                   triop(fop,
   6065                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6066                         get_ST(0),
   6067                         unop(Iop_I32StoF64,
   6068                              loadLE(Ity_I32, mkexpr(addr)))));
   6069                break;
   6070 
   6071             do_foprev_m32:
   6072                put_ST_UNCHECKED(0,
   6073                   triop(fop,
   6074                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6075                         unop(Iop_I32StoF64,
   6076                              loadLE(Ity_I32, mkexpr(addr))),
   6077                         get_ST(0)));
   6078                break;
   6079 
   6080             default:
   6081                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   6082                vex_printf("first_opcode == 0xDA\n");
   6083                goto decode_fail;
   6084          }
   6085 
   6086       } else {
   6087 
   6088          delta++;
   6089          switch (modrm) {
   6090 
   6091             case 0xC0 ... 0xC7: /* FCMOVB ST(i), ST(0) */
   6092                r_src = (UInt)modrm - 0xC0;
   6093                DIP("fcmovb %%st(%u), %%st(0)\n", r_src);
   6094                put_ST_UNCHECKED(0,
   6095                                 IRExpr_ITE(
   6096                                     mk_amd64g_calculate_condition(AMD64CondB),
   6097                                     get_ST(r_src), get_ST(0)) );
   6098                break;
   6099 
   6100             case 0xC8 ... 0xCF: /* FCMOVE(Z) ST(i), ST(0) */
   6101                r_src = (UInt)modrm - 0xC8;
   6102                DIP("fcmovz %%st(%u), %%st(0)\n", r_src);
   6103                put_ST_UNCHECKED(0,
   6104                                 IRExpr_ITE(
   6105                                     mk_amd64g_calculate_condition(AMD64CondZ),
   6106                                     get_ST(r_src), get_ST(0)) );
   6107                break;
   6108 
   6109             case 0xD0 ... 0xD7: /* FCMOVBE ST(i), ST(0) */
   6110                r_src = (UInt)modrm - 0xD0;
   6111                DIP("fcmovbe %%st(%u), %%st(0)\n", r_src);
   6112                put_ST_UNCHECKED(0,
   6113                                 IRExpr_ITE(
   6114                                     mk_amd64g_calculate_condition(AMD64CondBE),
   6115                                     get_ST(r_src), get_ST(0)) );
   6116                break;
   6117 
   6118             case 0xD8 ... 0xDF: /* FCMOVU ST(i), ST(0) */
   6119                r_src = (UInt)modrm - 0xD8;
   6120                DIP("fcmovu %%st(%u), %%st(0)\n", r_src);
   6121                put_ST_UNCHECKED(0,
   6122                                 IRExpr_ITE(
   6123                                     mk_amd64g_calculate_condition(AMD64CondP),
   6124                                     get_ST(r_src), get_ST(0)) );
   6125                break;
   6126 
   6127             case 0xE9: /* FUCOMPP %st(0),%st(1) */
   6128                DIP("fucompp %%st(0),%%st(1)\n");
   6129                /* This forces C1 to zero, which isn't right. */
   6130                put_C3210(
   6131                    unop(Iop_32Uto64,
   6132                    binop( Iop_And32,
   6133                           binop(Iop_Shl32,
   6134                                 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
   6135                                 mkU8(8)),
   6136                           mkU32(0x4500)
   6137                    )));
   6138                fp_pop();
   6139                fp_pop();
   6140                break;
   6141 
   6142             default:
   6143                goto decode_fail;
   6144          }
   6145 
   6146       }
   6147    }
   6148 
   6149    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDB opcodes +-+-+-+-+-+-+-+ */
   6150    else
   6151    if (first_opcode == 0xDB) {
   6152       if (modrm < 0xC0) {
   6153 
   6154          /* bits 5,4,3 are an opcode extension, and the modRM also
   6155             specifies an address. */
   6156          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6157          delta += len;
   6158 
   6159          switch (gregLO3ofRM(modrm)) {
   6160 
   6161             case 0: /* FILD m32int */
   6162                DIP("fildl %s\n", dis_buf);
   6163                fp_push();
   6164                put_ST(0, unop(Iop_I32StoF64,
   6165                               loadLE(Ity_I32, mkexpr(addr))));
   6166                break;
   6167 
   6168             case 1: /* FISTTPL m32 (SSE3) */
   6169                DIP("fisttpl %s\n", dis_buf);
   6170                storeLE( mkexpr(addr),
   6171                         binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) );
   6172                fp_pop();
   6173                break;
   6174 
   6175             case 2: /* FIST m32 */
   6176                DIP("fistl %s\n", dis_buf);
   6177                storeLE( mkexpr(addr),
   6178                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
   6179                break;
   6180 
   6181             case 3: /* FISTP m32 */
   6182                DIP("fistpl %s\n", dis_buf);
   6183                storeLE( mkexpr(addr),
   6184                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
   6185                fp_pop();
   6186                break;
   6187 
   6188             case 5: { /* FLD extended-real */
   6189                /* Uses dirty helper:
   6190                      ULong amd64g_loadF80le ( ULong )
   6191                   addr holds the address.  First, do a dirty call to
   6192                   get hold of the data. */
   6193                IRTemp   val  = newTemp(Ity_I64);
   6194                IRExpr** args = mkIRExprVec_1 ( mkexpr(addr) );
   6195 
   6196                IRDirty* d = unsafeIRDirty_1_N (
   6197                                val,
   6198                                0/*regparms*/,
   6199                                "amd64g_dirtyhelper_loadF80le",
   6200                                &amd64g_dirtyhelper_loadF80le,
   6201                                args
   6202                             );
   6203                /* declare that we're reading memory */
   6204                d->mFx   = Ifx_Read;
   6205                d->mAddr = mkexpr(addr);
   6206                d->mSize = 10;
   6207 
   6208                /* execute the dirty call, dumping the result in val. */
   6209                stmt( IRStmt_Dirty(d) );
   6210                fp_push();
   6211                put_ST(0, unop(Iop_ReinterpI64asF64, mkexpr(val)));
   6212 
   6213                DIP("fldt %s\n", dis_buf);
   6214                break;
   6215             }
   6216 
   6217             case 7: { /* FSTP extended-real */
   6218                /* Uses dirty helper:
   6219                      void amd64g_storeF80le ( ULong addr, ULong data )
   6220                */
   6221                IRExpr** args
   6222                   = mkIRExprVec_2( mkexpr(addr),
   6223                                    unop(Iop_ReinterpF64asI64, get_ST(0)) );
   6224 
   6225                IRDirty* d = unsafeIRDirty_0_N (
   6226                                0/*regparms*/,
   6227                                "amd64g_dirtyhelper_storeF80le",
   6228                                &amd64g_dirtyhelper_storeF80le,
   6229                                args
   6230                             );
   6231                /* declare we're writing memory */
   6232                d->mFx   = Ifx_Write;
   6233                d->mAddr = mkexpr(addr);
   6234                d->mSize = 10;
   6235 
   6236                /* execute the dirty call. */
   6237                stmt( IRStmt_Dirty(d) );
   6238                fp_pop();
   6239 
   6240                DIP("fstpt\n %s", dis_buf);
   6241                break;
   6242             }
   6243 
   6244             default:
   6245                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   6246                vex_printf("first_opcode == 0xDB\n");
   6247                goto decode_fail;
   6248          }
   6249 
   6250       } else {
   6251 
   6252          delta++;
   6253          switch (modrm) {
   6254 
   6255             case 0xC0 ... 0xC7: /* FCMOVNB ST(i), ST(0) */
   6256                r_src = (UInt)modrm - 0xC0;
   6257                DIP("fcmovnb %%st(%u), %%st(0)\n", r_src);
   6258                put_ST_UNCHECKED(0,
   6259                                 IRExpr_ITE(
   6260                                     mk_amd64g_calculate_condition(AMD64CondNB),
   6261                                     get_ST(r_src), get_ST(0)) );
   6262                break;
   6263 
   6264             case 0xC8 ... 0xCF: /* FCMOVNE(NZ) ST(i), ST(0) */
   6265                r_src = (UInt)modrm - 0xC8;
   6266                DIP("fcmovnz %%st(%u), %%st(0)\n", r_src);
   6267                put_ST_UNCHECKED(
   6268                   0,
   6269                   IRExpr_ITE(
   6270                      mk_amd64g_calculate_condition(AMD64CondNZ),
   6271                      get_ST(r_src),
   6272                      get_ST(0)
   6273                   )
   6274                );
   6275                break;
   6276 
   6277             case 0xD0 ... 0xD7: /* FCMOVNBE ST(i), ST(0) */
   6278                r_src = (UInt)modrm - 0xD0;
   6279                DIP("fcmovnbe %%st(%u), %%st(0)\n", r_src);
   6280                put_ST_UNCHECKED(
   6281                   0,
   6282                   IRExpr_ITE(
   6283                      mk_amd64g_calculate_condition(AMD64CondNBE),
   6284                      get_ST(r_src),
   6285                      get_ST(0)
   6286                   )
   6287                );
   6288                break;
   6289 
   6290             case 0xD8 ... 0xDF: /* FCMOVNU ST(i), ST(0) */
   6291                r_src = (UInt)modrm - 0xD8;
   6292                DIP("fcmovnu %%st(%u), %%st(0)\n", r_src);
   6293                put_ST_UNCHECKED(
   6294                   0,
   6295                   IRExpr_ITE(
   6296                      mk_amd64g_calculate_condition(AMD64CondNP),
   6297                      get_ST(r_src),
   6298                      get_ST(0)
   6299                   )
   6300                );
   6301                break;
   6302 
   6303             case 0xE2:
   6304                DIP("fnclex\n");
   6305                break;
   6306 
   6307             case 0xE3: {
   6308                /* Uses dirty helper:
   6309                      void amd64g_do_FINIT ( VexGuestAMD64State* ) */
   6310                IRDirty* d  = unsafeIRDirty_0_N (
   6311                                 0/*regparms*/,
   6312                                 "amd64g_dirtyhelper_FINIT",
   6313                                 &amd64g_dirtyhelper_FINIT,
   6314                                 mkIRExprVec_1( IRExpr_BBPTR() )
   6315                              );
   6316 
   6317                /* declare we're writing guest state */
   6318                d->nFxState = 5;
   6319                vex_bzero(&d->fxState, sizeof(d->fxState));
   6320 
   6321                d->fxState[0].fx     = Ifx_Write;
   6322                d->fxState[0].offset = OFFB_FTOP;
   6323                d->fxState[0].size   = sizeof(UInt);
   6324 
   6325                d->fxState[1].fx     = Ifx_Write;
   6326                d->fxState[1].offset = OFFB_FPREGS;
   6327                d->fxState[1].size   = 8 * sizeof(ULong);
   6328 
   6329                d->fxState[2].fx     = Ifx_Write;
   6330                d->fxState[2].offset = OFFB_FPTAGS;
   6331                d->fxState[2].size   = 8 * sizeof(UChar);
   6332 
   6333                d->fxState[3].fx     = Ifx_Write;
   6334                d->fxState[3].offset = OFFB_FPROUND;
   6335                d->fxState[3].size   = sizeof(ULong);
   6336 
   6337                d->fxState[4].fx     = Ifx_Write;
   6338                d->fxState[4].offset = OFFB_FC3210;
   6339                d->fxState[4].size   = sizeof(ULong);
   6340 
   6341                stmt( IRStmt_Dirty(d) );
   6342 
   6343                DIP("fninit\n");
   6344                break;
   6345             }
   6346 
   6347             case 0xE8 ... 0xEF: /* FUCOMI %st(0),%st(?) */
   6348                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, False );
   6349                break;
   6350 
   6351             case 0xF0 ... 0xF7: /* FCOMI %st(0),%st(?) */
   6352                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, False );
   6353                break;
   6354 
   6355             default:
   6356                goto decode_fail;
   6357          }
   6358       }
   6359    }
   6360 
   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDC opcodes +-+-+-+-+-+-+-+ */
   else
   if (first_opcode == 0xDC) {
      if (modrm < 0xC0) {

         /* bits 5,4,3 are an opcode extension, and the modRM also
            specifies an address. */
         /* Memory-operand forms: ST(0) <op>= m64 double-real.  The
            trailing 'True' argument presumably selects a 64-bit
            (double-real) memory operand -- TODO confirm against the
            fp_do_op_mem_ST_0 definition. */
         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
         delta += len;

         switch (gregLO3ofRM(modrm)) {

            case 0: /* FADD double-real */
               fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, True );
               break;

            case 1: /* FMUL double-real */
               fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, True );
               break;

//..             case 2: /* FCOM double-real */
//..                DIP("fcoml %s\n", dis_buf);
//..                /* This forces C1 to zero, which isn't right. */
//..                put_C3210(
//..                    binop( Iop_And32,
//..                           binop(Iop_Shl32,
//..                                 binop(Iop_CmpF64,
//..                                       get_ST(0),
//..                                       loadLE(Ity_F64,mkexpr(addr))),
//..                                 mkU8(8)),
//..                           mkU32(0x4500)
//..                    ));
//..                break;

            case 3: /* FCOMP double-real */
               DIP("fcompl %s\n", dis_buf);
               /* This forces C1 to zero, which isn't right. */
               put_C3210(
                   unop(Iop_32Uto64,
                   binop( Iop_And32,
                          binop(Iop_Shl32,
                                binop(Iop_CmpF64,
                                      get_ST(0),
                                      loadLE(Ity_F64,mkexpr(addr))),
                                mkU8(8)),
                          mkU32(0x4500)
                   )));
               fp_pop();
               break;

            case 4: /* FSUB double-real */
               fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, True );
               break;

            case 5: /* FSUBR double-real */
               /* Reversed-operand form: ST(0) = m64 - ST(0). */
               fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, True );
               break;

            case 6: /* FDIV double-real */
               fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, True );
               break;

            case 7: /* FDIVR double-real */
               /* Reversed-operand form: ST(0) = m64 / ST(0). */
               fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, True );
               break;

            default:
               vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
               vex_printf("first_opcode == 0xDC\n");
               goto decode_fail;
         }

      } else {

         /* Register forms: ST(i) <op>= ST(0), no pop (the 'False'). */
         delta++;
         switch (modrm) {

            case 0xC0 ... 0xC7: /* FADD %st(0),%st(?) */
               fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, False );
               break;

            case 0xC8 ... 0xCF: /* FMUL %st(0),%st(?) */
               fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, False );
               break;

            case 0xE0 ... 0xE7: /* FSUBR %st(0),%st(?) */
               fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0, modrm - 0xE0, False );
               break;

            case 0xE8 ... 0xEF: /* FSUB %st(0),%st(?) */
               fp_do_op_ST_ST ( "sub", Iop_SubF64, 0, modrm - 0xE8, False );
               break;

            case 0xF0 ... 0xF7: /* FDIVR %st(0),%st(?) */
               fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, False );
               break;

            case 0xF8 ... 0xFF: /* FDIV %st(0),%st(?) */
               fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, False );
               break;

            default:
               goto decode_fail;
         }

      }
   }
   6468 
   6469    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDD opcodes +-+-+-+-+-+-+-+ */
   6470    else
   6471    if (first_opcode == 0xDD) {
   6472 
   6473       if (modrm < 0xC0) {
   6474 
   6475          /* bits 5,4,3 are an opcode extension, and the modRM also
   6476             specifies an address. */
   6477          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6478          delta += len;
   6479 
   6480          switch (gregLO3ofRM(modrm)) {
   6481 
   6482             case 0: /* FLD double-real */
   6483                DIP("fldl %s\n", dis_buf);
   6484                fp_push();
   6485                put_ST(0, loadLE(Ity_F64, mkexpr(addr)));
   6486                break;
   6487 
   6488             case 1: /* FISTTPQ m64 (SSE3) */
   6489                DIP("fistppll %s\n", dis_buf);
   6490                storeLE( mkexpr(addr),
   6491                         binop(Iop_F64toI64S, mkU32(Irrm_ZERO), get_ST(0)) );
   6492                fp_pop();
   6493                break;
   6494 
   6495             case 2: /* FST double-real */
   6496                DIP("fstl %s\n", dis_buf);
   6497                storeLE(mkexpr(addr), get_ST(0));
   6498                break;
   6499 
   6500             case 3: /* FSTP double-real */
   6501                DIP("fstpl %s\n", dis_buf);
   6502                storeLE(mkexpr(addr), get_ST(0));
   6503                fp_pop();
   6504                break;
   6505 
   6506             case 4: { /* FRSTOR m94/m108 */
   6507                IRTemp   ew = newTemp(Ity_I32);
   6508                IRTemp  w64 = newTemp(Ity_I64);
   6509                IRDirty*  d;
   6510                if ( have66(pfx) ) {
   6511                   /* Uses dirty helper:
   6512                      VexEmNote amd64g_dirtyhelper_FRSTORS
   6513                                   ( VexGuestAMD64State*, HWord ) */
   6514                   d = unsafeIRDirty_0_N (
   6515                          0/*regparms*/,
   6516                          "amd64g_dirtyhelper_FRSTORS",
   6517                          &amd64g_dirtyhelper_FRSTORS,
   6518                          mkIRExprVec_1( mkexpr(addr) )
   6519                       );
   6520                   d->mSize = 94;
   6521                } else {
   6522                   /* Uses dirty helper:
   6523                      VexEmNote amd64g_dirtyhelper_FRSTOR
   6524                                   ( VexGuestAMD64State*, HWord ) */
   6525                   d = unsafeIRDirty_0_N (
   6526                          0/*regparms*/,
   6527                          "amd64g_dirtyhelper_FRSTOR",
   6528                          &amd64g_dirtyhelper_FRSTOR,
   6529                          mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   6530                       );
   6531                   d->mSize = 108;
   6532                }
   6533 
   6534                d->tmp    = w64;
   6535                /* declare we're reading memory */
   6536                d->mFx   = Ifx_Read;
   6537                d->mAddr = mkexpr(addr);
   6538                /* d->mSize set above */
   6539 
   6540                /* declare we're writing guest state */
   6541                d->nFxState = 5;
   6542                vex_bzero(&d->fxState, sizeof(d->fxState));
   6543 
   6544                d->fxState[0].fx     = Ifx_Write;
   6545                d->fxState[0].offset = OFFB_FTOP;
   6546                d->fxState[0].size   = sizeof(UInt);
   6547 
   6548                d->fxState[1].fx     = Ifx_Write;
   6549                d->fxState[1].offset = OFFB_FPREGS;
   6550                d->fxState[1].size   = 8 * sizeof(ULong);
   6551 
   6552                d->fxState[2].fx     = Ifx_Write;
   6553                d->fxState[2].offset = OFFB_FPTAGS;
   6554                d->fxState[2].size   = 8 * sizeof(UChar);
   6555 
   6556                d->fxState[3].fx     = Ifx_Write;
   6557                d->fxState[3].offset = OFFB_FPROUND;
   6558                d->fxState[3].size   = sizeof(ULong);
   6559 
   6560                d->fxState[4].fx     = Ifx_Write;
   6561                d->fxState[4].offset = OFFB_FC3210;
   6562                d->fxState[4].size   = sizeof(ULong);
   6563 
   6564                stmt( IRStmt_Dirty(d) );
   6565 
   6566                /* ew contains any emulation warning we may need to
   6567                   issue.  If needed, side-exit to the next insn,
   6568                   reporting the warning, so that Valgrind's dispatcher
   6569                   sees the warning. */
   6570                assign(ew, unop(Iop_64to32,mkexpr(w64)) );
   6571                put_emwarn( mkexpr(ew) );
   6572                stmt(
   6573                   IRStmt_Exit(
   6574                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   6575                      Ijk_EmWarn,
   6576                      IRConst_U64( guest_RIP_bbstart+delta ),
   6577                      OFFB_RIP
   6578                   )
   6579                );
   6580 
   6581                if ( have66(pfx) ) {
   6582                   DIP("frstors %s\n", dis_buf);
   6583                } else {
   6584                   DIP("frstor %s\n", dis_buf);
   6585                }
   6586                break;
   6587             }
   6588 
   6589             case 6: { /* FNSAVE m94/m108 */
   6590                IRDirty *d;
   6591                if ( have66(pfx) ) {
   6592                  /* Uses dirty helper:
   6593                     void amd64g_dirtyhelper_FNSAVES ( VexGuestAMD64State*,
   6594                                                       HWord ) */
   6595                   d = unsafeIRDirty_0_N (
   6596                          0/*regparms*/,
   6597                          "amd64g_dirtyhelper_FNSAVES",
   6598                          &amd64g_dirtyhelper_FNSAVES,
   6599                          mkIRExprVec_1( mkexpr(addr) )
   6600                          );
   6601                   d->mSize = 94;
   6602                } else {
   6603                  /* Uses dirty helper:
   6604                     void amd64g_dirtyhelper_FNSAVE ( VexGuestAMD64State*,
   6605                                                      HWord ) */
   6606                   d = unsafeIRDirty_0_N (
   6607                          0/*regparms*/,
   6608                          "amd64g_dirtyhelper_FNSAVE",
   6609                          &amd64g_dirtyhelper_FNSAVE,
   6610                          mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   6611                       );
   6612                   d->mSize = 108;
   6613                }
   6614 
   6615                /* declare we're writing memory */
   6616                d->mFx   = Ifx_Write;
   6617                d->mAddr = mkexpr(addr);
   6618                /* d->mSize set above */
   6619 
   6620                /* declare we're reading guest state */
   6621                d->nFxState = 5;
   6622                vex_bzero(&d->fxState, sizeof(d->fxState));
   6623 
   6624                d->fxState[0].fx     = Ifx_Read;
   6625                d->fxState[0].offset = OFFB_FTOP;
   6626                d->fxState[0].size   = sizeof(UInt);
   6627 
   6628                d->fxState[1].fx     = Ifx_Read;
   6629                d->fxState[1].offset = OFFB_FPREGS;
   6630                d->fxState[1].size   = 8 * sizeof(ULong);
   6631 
   6632                d->fxState[2].fx     = Ifx_Read;
   6633                d->fxState[2].offset = OFFB_FPTAGS;
   6634                d->fxState[2].size   = 8 * sizeof(UChar);
   6635 
   6636                d->fxState[3].fx     = Ifx_Read;
   6637                d->fxState[3].offset = OFFB_FPROUND;
   6638                d->fxState[3].size   = sizeof(ULong);
   6639 
   6640                d->fxState[4].fx     = Ifx_Read;
   6641                d->fxState[4].offset = OFFB_FC3210;
   6642                d->fxState[4].size   = sizeof(ULong);
   6643 
   6644                stmt( IRStmt_Dirty(d) );
   6645 
   6646                if ( have66(pfx) ) {
   6647                  DIP("fnsaves %s\n", dis_buf);
   6648                } else {
   6649                  DIP("fnsave %s\n", dis_buf);
   6650                }
   6651                break;
   6652             }
   6653 
   6654             case 7: { /* FNSTSW m16 */
   6655                IRExpr* sw = get_FPU_sw();
   6656                vassert(typeOfIRExpr(irsb->tyenv, sw) == Ity_I16);
   6657                storeLE( mkexpr(addr), sw );
   6658                DIP("fnstsw %s\n", dis_buf);
   6659                break;
   6660             }
   6661 
   6662             default:
   6663                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   6664                vex_printf("first_opcode == 0xDD\n");
   6665                goto decode_fail;
   6666          }
   6667       } else {
   6668          delta++;
   6669          switch (modrm) {
   6670 
   6671             case 0xC0 ... 0xC7: /* FFREE %st(?) */
   6672                r_dst = (UInt)modrm - 0xC0;
   6673                DIP("ffree %%st(%u)\n", r_dst);
   6674                put_ST_TAG ( r_dst, mkU8(0) );
   6675                break;
   6676 
   6677             case 0xD0 ... 0xD7: /* FST %st(0),%st(?) */
   6678                r_dst = (UInt)modrm - 0xD0;
   6679                DIP("fst %%st(0),%%st(%u)\n", r_dst);
   6680                /* P4 manual says: "If the destination operand is a
   6681                   non-empty register, the invalid-operation exception
   6682                   is not generated.  Hence put_ST_UNCHECKED. */
   6683                put_ST_UNCHECKED(r_dst, get_ST(0));
   6684                break;
   6685 
   6686             case 0xD8 ... 0xDF: /* FSTP %st(0),%st(?) */
   6687                r_dst = (UInt)modrm - 0xD8;
   6688                DIP("fstp %%st(0),%%st(%u)\n", r_dst);
   6689                /* P4 manual says: "If the destination operand is a
   6690                   non-empty register, the invalid-operation exception
   6691                   is not generated.  Hence put_ST_UNCHECKED. */
   6692                put_ST_UNCHECKED(r_dst, get_ST(0));
   6693                fp_pop();
   6694                break;
   6695 
   6696             case 0xE0 ... 0xE7: /* FUCOM %st(0),%st(?) */
   6697                r_dst = (UInt)modrm - 0xE0;
   6698                DIP("fucom %%st(0),%%st(%u)\n", r_dst);
   6699                /* This forces C1 to zero, which isn't right. */
   6700                put_C3210(
   6701                    unop(Iop_32Uto64,
   6702                    binop( Iop_And32,
   6703                           binop(Iop_Shl32,
   6704                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   6705                                 mkU8(8)),
   6706                           mkU32(0x4500)
   6707                    )));
   6708                break;
   6709 
   6710             case 0xE8 ... 0xEF: /* FUCOMP %st(0),%st(?) */
   6711                r_dst = (UInt)modrm - 0xE8;
   6712                DIP("fucomp %%st(0),%%st(%u)\n", r_dst);
   6713                /* This forces C1 to zero, which isn't right. */
   6714                put_C3210(
   6715                    unop(Iop_32Uto64,
   6716                    binop( Iop_And32,
   6717                           binop(Iop_Shl32,
   6718                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   6719                                 mkU8(8)),
   6720                           mkU32(0x4500)
   6721                    )));
   6722                fp_pop();
   6723                break;
   6724 
   6725             default:
   6726                goto decode_fail;
   6727          }
   6728       }
   6729    }
   6730 
   6731    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDE opcodes +-+-+-+-+-+-+-+ */
   6732    else
   6733    if (first_opcode == 0xDE) {
   6734 
   6735       if (modrm < 0xC0) {
   6736 
   6737          /* bits 5,4,3 are an opcode extension, and the modRM also
   6738             specifies an address. */
   6739          IROp   fop;
   6740          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6741          delta += len;
   6742 
   6743          switch (gregLO3ofRM(modrm)) {
   6744 
   6745             case 0: /* FIADD m16int */ /* ST(0) += m16int */
   6746                DIP("fiaddw %s\n", dis_buf);
   6747                fop = Iop_AddF64;
   6748                goto do_fop_m16;
   6749 
   6750             case 1: /* FIMUL m16int */ /* ST(0) *= m16int */
   6751                DIP("fimulw %s\n", dis_buf);
   6752                fop = Iop_MulF64;
   6753                goto do_fop_m16;
   6754 
   6755             case 4: /* FISUB m16int */ /* ST(0) -= m16int */
   6756                DIP("fisubw %s\n", dis_buf);
   6757                fop = Iop_SubF64;
   6758                goto do_fop_m16;
   6759 
   6760             case 5: /* FISUBR m16int */ /* ST(0) = m16int - ST(0) */
   6761                DIP("fisubrw %s\n", dis_buf);
   6762                fop = Iop_SubF64;
   6763                goto do_foprev_m16;
   6764 
   6765             case 6: /* FIDIV m16int */ /* ST(0) /= m16int */
   6766                DIP("fisubw %s\n", dis_buf);
   6767                fop = Iop_DivF64;
   6768                goto do_fop_m16;
   6769 
   6770             case 7: /* FIDIVR m16int */ /* ST(0) = m16int / ST(0) */
   6771                DIP("fidivrw %s\n", dis_buf);
   6772                fop = Iop_DivF64;
   6773                goto do_foprev_m16;
   6774 
   6775             do_fop_m16:
   6776                put_ST_UNCHECKED(0,
   6777                   triop(fop,
   6778                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6779                         get_ST(0),
   6780                         unop(Iop_I32StoF64,
   6781                              unop(Iop_16Sto32,
   6782                                   loadLE(Ity_I16, mkexpr(addr))))));
   6783                break;
   6784 
   6785             do_foprev_m16:
   6786                put_ST_UNCHECKED(0,
   6787                   triop(fop,
   6788                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6789                         unop(Iop_I32StoF64,
   6790                              unop(Iop_16Sto32,
   6791                                   loadLE(Ity_I16, mkexpr(addr)))),
   6792                         get_ST(0)));
   6793                break;
   6794 
   6795             default:
   6796                vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
   6797                vex_printf("first_opcode == 0xDE\n");
   6798                goto decode_fail;
   6799          }
   6800 
   6801       } else {
   6802 
   6803          delta++;
   6804          switch (modrm) {
   6805 
   6806             case 0xC0 ... 0xC7: /* FADDP %st(0),%st(?) */
   6807                fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, True );
   6808                break;
   6809 
   6810             case 0xC8 ... 0xCF: /* FMULP %st(0),%st(?) */
   6811                fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, True );
   6812                break;
   6813 
   6814             case 0xD9: /* FCOMPP %st(0),%st(1) */
   6815                DIP("fcompp %%st(0),%%st(1)\n");
   6816                /* This forces C1 to zero, which isn't right. */
   6817                put_C3210(
   6818                    unop(Iop_32Uto64,
   6819                    binop( Iop_And32,
   6820                           binop(Iop_Shl32,
   6821                                 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
   6822                                 mkU8(8)),
   6823                           mkU32(0x4500)
   6824                    )));
   6825                fp_pop();
   6826                fp_pop();
   6827                break;
   6828 
   6829             case 0xE0 ... 0xE7: /* FSUBRP %st(0),%st(?) */
   6830                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0,  modrm - 0xE0, True );
   6831                break;
   6832 
   6833             case 0xE8 ... 0xEF: /* FSUBP %st(0),%st(?) */
   6834                fp_do_op_ST_ST ( "sub", Iop_SubF64, 0,  modrm - 0xE8, True );
   6835                break;
   6836 
   6837             case 0xF0 ... 0xF7: /* FDIVRP %st(0),%st(?) */
   6838                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, True );
   6839                break;
   6840 
   6841             case 0xF8 ... 0xFF: /* FDIVP %st(0),%st(?) */
   6842                fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, True );
   6843                break;
   6844 
   6845             default:
   6846                goto decode_fail;
   6847          }
   6848 
   6849       }
   6850    }
   6851 
   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDF opcodes +-+-+-+-+-+-+-+ */
   else
   if (first_opcode == 0xDF) {

      if (modrm < 0xC0) {

         /* bits 5,4,3 are an opcode extension, and the modRM also
            specifies an address. */
         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
         delta += len;

         switch (gregLO3ofRM(modrm)) {

            case 0: /* FILD m16int */
               DIP("fildw %s\n", dis_buf);
               fp_push();
               put_ST(0, unop(Iop_I32StoF64,
                              unop(Iop_16Sto32,
                                   loadLE(Ity_I16, mkexpr(addr)))));
               break;

            case 1: /* FISTTPS m16 (SSE3) */
               /* FISTTP truncates, hence the explicit Irrm_ZERO.  The
                  32->16 narrowing presumably saturates x87-style --
                  TODO confirm against x87ishly_qnarrow_32_to_16. */
               DIP("fisttps %s\n", dis_buf);
               storeLE( mkexpr(addr),
                        x87ishly_qnarrow_32_to_16(
                        binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) ));
               fp_pop();
               break;

            case 2: /* FIST m16 */
               DIP("fists %s\n", dis_buf);
               storeLE( mkexpr(addr),
                        x87ishly_qnarrow_32_to_16(
                        binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) ));
               break;

            case 3: /* FISTP m16 */
               DIP("fistps %s\n", dis_buf);
               storeLE( mkexpr(addr),
                        x87ishly_qnarrow_32_to_16(
                        binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) ));
               fp_pop();
               break;

            case 5: /* FILD m64 */
               DIP("fildll %s\n", dis_buf);
               fp_push();
               put_ST(0, binop(Iop_I64StoF64,
                               get_roundingmode(),
                               loadLE(Ity_I64, mkexpr(addr))));
               break;

            case 7: /* FISTP m64 */
               DIP("fistpll %s\n", dis_buf);
               storeLE( mkexpr(addr),
                        binop(Iop_F64toI64S, get_roundingmode(), get_ST(0)) );
               fp_pop();
               break;

            default:
               vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
               vex_printf("first_opcode == 0xDF\n");
               goto decode_fail;
         }

      } else {

         delta++;
         switch (modrm) {

            case 0xC0: /* FFREEP %st(0) */
               /* Note: only FFREEP %st(0) is handled; 0xC1..0xC7
                  (FFREEP %st(i), i > 0) fall through to decode_fail. */
               DIP("ffreep %%st(%d)\n", 0);
               put_ST_TAG ( 0, mkU8(0) );
               fp_pop();
               break;

            case 0xE0: /* FNSTSW %ax */
               DIP("fnstsw %%ax\n");
               /* Invent a plausible-looking FPU status word value and
                  dump it in %AX:
                     ((ftop & 7) << 11) | (c3210 & 0x4700)
               */
               putIRegRAX(
                  2,
                  unop(Iop_32to16,
                       binop(Iop_Or32,
                             binop(Iop_Shl32,
                                   binop(Iop_And32, get_ftop(), mkU32(7)),
                                   mkU8(11)),
                             binop(Iop_And32,
                                   unop(Iop_64to32, get_C3210()),
                                   mkU32(0x4700))
               )));
               break;

            case 0xE8 ... 0xEF: /* FUCOMIP %st(0),%st(?) */
               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, True );
               break;

            case 0xF0 ... 0xF7: /* FCOMIP %st(0),%st(?) */
               /* not really right since COMIP != UCOMIP */
               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, True );
               break;

            default:
               goto decode_fail;
         }
      }

   }
   6962 
   6963    else
   6964       goto decode_fail;
   6965 
   6966    *decode_ok = True;
   6967    return delta;
   6968 
   6969   decode_fail:
   6970    *decode_ok = False;
   6971    return delta;
   6972 }
   6973 
   6974 
   6975 /*------------------------------------------------------------*/
   6976 /*---                                                      ---*/
   6977 /*--- MMX INSTRUCTIONS                                     ---*/
   6978 /*---                                                      ---*/
   6979 /*------------------------------------------------------------*/
   6980 
   6981 /* Effect of MMX insns on x87 FPU state (table 11-2 of
   6982    IA32 arch manual, volume 3):
   6983 
   6984    Read from, or write to MMX register (viz, any insn except EMMS):
   6985    * All tags set to Valid (non-empty) -- FPTAGS[i] := nonzero
   6986    * FP stack pointer set to zero
   6987 
   6988    EMMS:
   6989    * All tags set to Invalid (empty) -- FPTAGS[i] := zero
   6990    * FP stack pointer set to zero
   6991 */
   6992 
   6993 static void do_MMX_preamble ( void )
   6994 {
   6995    Int         i;
   6996    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   6997    IRExpr*     zero  = mkU32(0);
   6998    IRExpr*     tag1  = mkU8(1);
   6999    put_ftop(zero);
   7000    for (i = 0; i < 8; i++)
   7001       stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag1) ) );
   7002 }
   7003 
   7004 static void do_EMMS_preamble ( void )
   7005 {
   7006    Int         i;
   7007    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   7008    IRExpr*     zero  = mkU32(0);
   7009    IRExpr*     tag0  = mkU8(0);
   7010    put_ftop(zero);
   7011    for (i = 0; i < 8; i++)
   7012       stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag0) ) );
   7013 }
   7014 
   7015 
   7016 static IRExpr* getMMXReg ( UInt archreg )
   7017 {
   7018    vassert(archreg < 8);
   7019    return IRExpr_Get( OFFB_FPREGS + 8 * archreg, Ity_I64 );
   7020 }
   7021 
   7022 
   7023 static void putMMXReg ( UInt archreg, IRExpr* e )
   7024 {
   7025    vassert(archreg < 8);
   7026    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   7027    stmt( IRStmt_Put( OFFB_FPREGS + 8 * archreg, e ) );
   7028 }
   7029 
   7030 
/* Helper for non-shift MMX insns of the form  op E, G  where E is an
   MMX register or 64-bit memory operand and G is the destination MMX
   register.  Decodes the modrm byte at |delta|, maps |opc| to either
   an IR binop or a clean-helper call, and writes the result to G.

   Returns the updated |delta| (guest code offset past the decoded
   bytes).  |name| is the mnemonic for disassembly printing;
   |show_granularity| appends the b/w/d/q granularity suffix derived
   from the low two opcode bits.

   Note this is incomplete in the sense that it does not first call
   do_MMX_preamble() -- that is the responsibility of its caller. */

static
ULong dis_MMXop_regmem_to_reg ( VexAbiInfo* vbi,
                                Prefix      pfx,
                                Long        delta,
                                UChar       opc,
                                const HChar* name,
                                Bool        show_granularity )
{
   HChar   dis_buf[50];
   UChar   modrm = getUChar(delta);
   Bool    isReg = epartIsReg(modrm);
   IRExpr* argL  = NULL;
   IRExpr* argR  = NULL;
   IRExpr* argG  = NULL;
   IRExpr* argE  = NULL;
   IRTemp  res   = newTemp(Ity_I64);

   /* Decode controls, set by the opcode switch below:
      invG  -- complement G before use (PANDN)
      op    -- IR binop to apply, or Iop_INVALID if a helper is used
      hAddr/hName -- clean-helper address/name when op == Iop_INVALID
      eLeft -- E becomes the left (first) operand instead of G */
   Bool    invG  = False;
   IROp    op    = Iop_INVALID;
   void*   hAddr = NULL;
   const HChar*  hName = NULL;
   Bool    eLeft = False;

#  define XXX(_name) do { hAddr = &_name; hName = #_name; } while (0)

   switch (opc) {
      /* Original MMX ones */
      case 0xFC: op = Iop_Add8x8; break;
      case 0xFD: op = Iop_Add16x4; break;
      case 0xFE: op = Iop_Add32x2; break;

      case 0xEC: op = Iop_QAdd8Sx8; break;
      case 0xED: op = Iop_QAdd16Sx4; break;

      case 0xDC: op = Iop_QAdd8Ux8; break;
      case 0xDD: op = Iop_QAdd16Ux4; break;

      case 0xF8: op = Iop_Sub8x8;  break;
      case 0xF9: op = Iop_Sub16x4; break;
      case 0xFA: op = Iop_Sub32x2; break;

      case 0xE8: op = Iop_QSub8Sx8; break;
      case 0xE9: op = Iop_QSub16Sx4; break;

      case 0xD8: op = Iop_QSub8Ux8; break;
      case 0xD9: op = Iop_QSub16Ux4; break;

      case 0xE5: op = Iop_MulHi16Sx4; break;
      case 0xD5: op = Iop_Mul16x4; break;
      case 0xF5: XXX(amd64g_calculate_mmx_pmaddwd); break;

      case 0x74: op = Iop_CmpEQ8x8; break;
      case 0x75: op = Iop_CmpEQ16x4; break;
      case 0x76: op = Iop_CmpEQ32x2; break;

      case 0x64: op = Iop_CmpGT8Sx8; break;
      case 0x65: op = Iop_CmpGT16Sx4; break;
      case 0x66: op = Iop_CmpGT32Sx2; break;

      /* Pack/unpack ops take E as the left operand. */
      case 0x6B: op = Iop_QNarrowBin32Sto16Sx4; eLeft = True; break;
      case 0x63: op = Iop_QNarrowBin16Sto8Sx8;  eLeft = True; break;
      case 0x67: op = Iop_QNarrowBin16Sto8Ux8;  eLeft = True; break;

      case 0x68: op = Iop_InterleaveHI8x8;  eLeft = True; break;
      case 0x69: op = Iop_InterleaveHI16x4; eLeft = True; break;
      case 0x6A: op = Iop_InterleaveHI32x2; eLeft = True; break;

      case 0x60: op = Iop_InterleaveLO8x8;  eLeft = True; break;
      case 0x61: op = Iop_InterleaveLO16x4; eLeft = True; break;
      case 0x62: op = Iop_InterleaveLO32x2; eLeft = True; break;

      case 0xDB: op = Iop_And64; break;
      case 0xDF: op = Iop_And64; invG = True; break;
      case 0xEB: op = Iop_Or64; break;
      case 0xEF: /* Possibly do better here if argL and argR are the
                    same reg */
                 op = Iop_Xor64; break;

      /* Introduced in SSE1 */
      case 0xE0: op = Iop_Avg8Ux8;    break;
      case 0xE3: op = Iop_Avg16Ux4;   break;
      case 0xEE: op = Iop_Max16Sx4;   break;
      case 0xDE: op = Iop_Max8Ux8;    break;
      case 0xEA: op = Iop_Min16Sx4;   break;
      case 0xDA: op = Iop_Min8Ux8;    break;
      case 0xE4: op = Iop_MulHi16Ux4; break;
      case 0xF6: XXX(amd64g_calculate_mmx_psadbw); break;

      /* Introduced in SSE2 */
      case 0xD4: op = Iop_Add64; break;
      case 0xFB: op = Iop_Sub64; break;

      default:
         vex_printf("\n0x%x\n", (Int)opc);
         vpanic("dis_MMXop_regmem_to_reg");
   }

#  undef XXX

   /* Fetch G, complementing it first for PANDN. */
   argG = getMMXReg(gregLO3ofRM(modrm));
   if (invG)
      argG = unop(Iop_Not64, argG);

   /* Fetch E: either a register read or a 64-bit load. */
   if (isReg) {
      delta++;
      argE = getMMXReg(eregLO3ofRM(modrm));
   } else {
      Int    len;
      IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
      delta += len;
      argE = loadLE(Ity_I64, mkexpr(addr));
   }

   /* Order the operands as required by the op. */
   if (eLeft) {
      argL = argE;
      argR = argG;
   } else {
      argL = argG;
      argR = argE;
   }

   if (op != Iop_INVALID) {
      vassert(hName == NULL);
      vassert(hAddr == NULL);
      assign(res, binop(op, argL, argR));
   } else {
      /* No IR-level op exists; call a clean helper instead. */
      vassert(hName != NULL);
      vassert(hAddr != NULL);
      assign( res,
              mkIRExprCCall(
                 Ity_I64,
                 0/*regparms*/, hName, hAddr,
                 mkIRExprVec_2( argL, argR )
              )
            );
   }

   putMMXReg( gregLO3ofRM(modrm), mkexpr(res) );

   DIP("%s%s %s, %s\n",
       name, show_granularity ? nameMMXGran(opc & 3) : "",
       ( isReg ? nameMMXReg(eregLO3ofRM(modrm)) : dis_buf ),
       nameMMXReg(gregLO3ofRM(modrm)) );

   return delta;
}
   7181 
   7182 
   7183 /* Vector by scalar shift of G by the amount specified at the bottom
   7184    of E.  This is a straight copy of dis_SSE_shiftG_byE. */
   7185 
   7186 static ULong dis_MMX_shiftG_byE ( VexAbiInfo* vbi,
   7187                                   Prefix pfx, Long delta,
   7188                                   const HChar* opname, IROp op )
   7189 {
   7190    HChar   dis_buf[50];
   7191    Int     alen, size;
   7192    IRTemp  addr;
   7193    Bool    shl, shr, sar;
   7194    UChar   rm   = getUChar(delta);
   7195    IRTemp  g0   = newTemp(Ity_I64);
   7196    IRTemp  g1   = newTemp(Ity_I64);
   7197    IRTemp  amt  = newTemp(Ity_I64);
   7198    IRTemp  amt8 = newTemp(Ity_I8);
   7199 
   7200    if (epartIsReg(rm)) {
   7201       assign( amt, getMMXReg(eregLO3ofRM(rm)) );
   7202       DIP("%s %s,%s\n", opname,
   7203                         nameMMXReg(eregLO3ofRM(rm)),
   7204                         nameMMXReg(gregLO3ofRM(rm)) );
   7205       delta++;
   7206    } else {
   7207       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   7208       assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
   7209       DIP("%s %s,%s\n", opname,
   7210                         dis_buf,
   7211                         nameMMXReg(gregLO3ofRM(rm)) );
   7212       delta += alen;
   7213    }
   7214    assign( g0,   getMMXReg(gregLO3ofRM(rm)) );
   7215    assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
   7216 
   7217    shl = shr = sar = False;
   7218    size = 0;
   7219    switch (op) {
   7220       case Iop_ShlN16x4: shl = True; size = 32; break;
   7221       case Iop_ShlN32x2: shl = True; size = 32; break;
   7222       case Iop_Shl64:    shl = True; size = 64; break;
   7223       case Iop_ShrN16x4: shr = True; size = 16; break;
   7224       case Iop_ShrN32x2: shr = True; size = 32; break;
   7225       case Iop_Shr64:    shr = True; size = 64; break;
   7226       case Iop_SarN16x4: sar = True; size = 16; break;
   7227       case Iop_SarN32x2: sar = True; size = 32; break;
   7228       default: vassert(0);
   7229    }
   7230 
   7231    if (shl || shr) {
   7232      assign(
   7233         g1,
   7234         IRExpr_ITE(
   7235            binop(Iop_CmpLT64U,mkexpr(amt),mkU64(size)),
   7236            binop(op, mkexpr(g0), mkexpr(amt8)),
   7237            mkU64(0)
   7238         )
   7239      );
   7240    } else
   7241    if (sar) {
   7242      assign(
   7243         g1,
   7244         IRExpr_ITE(
   7245            binop(Iop_CmpLT64U,mkexpr(amt),mkU64(size)),
   7246            binop(op, mkexpr(g0), mkexpr(amt8)),
   7247            binop(op, mkexpr(g0), mkU8(size-1))
   7248         )
   7249      );
   7250    } else {
   7251       vassert(0);
   7252    }
   7253 
   7254    putMMXReg( gregLO3ofRM(rm), mkexpr(g1) );
   7255    return delta;
   7256 }
   7257 
   7258 
   7259 /* Vector by scalar shift of E by an immediate byte.  This is a
   7260    straight copy of dis_SSE_shiftE_imm. */
   7261 
   7262 static
   7263 ULong dis_MMX_shiftE_imm ( Long delta, const HChar* opname, IROp op )
   7264 {
   7265    Bool    shl, shr, sar;
   7266    UChar   rm   = getUChar(delta);
   7267    IRTemp  e0   = newTemp(Ity_I64);
   7268    IRTemp  e1   = newTemp(Ity_I64);
   7269    UChar   amt, size;
   7270    vassert(epartIsReg(rm));
   7271    vassert(gregLO3ofRM(rm) == 2
   7272            || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
   7273    amt = getUChar(delta+1);
   7274    delta += 2;
   7275    DIP("%s $%d,%s\n", opname,
   7276                       (Int)amt,
   7277                       nameMMXReg(eregLO3ofRM(rm)) );
   7278 
   7279    assign( e0, getMMXReg(eregLO3ofRM(rm)) );
   7280 
   7281    shl = shr = sar = False;
   7282    size = 0;
   7283    switch (op) {
   7284       case Iop_ShlN16x4: shl = True; size = 16; break;
   7285       case Iop_ShlN32x2: shl = True; size = 32; break;
   7286       case Iop_Shl64:    shl = True; size = 64; break;
   7287       case Iop_SarN16x4: sar = True; size = 16; break;
   7288       case Iop_SarN32x2: sar = True; size = 32; break;
   7289       case Iop_ShrN16x4: shr = True; size = 16; break;
   7290       case Iop_ShrN32x2: shr = True; size = 32; break;
   7291       case Iop_Shr64:    shr = True; size = 64; break;
   7292       default: vassert(0);
   7293    }
   7294 
   7295    if (shl || shr) {
   7296      assign( e1, amt >= size
   7297                     ? mkU64(0)
   7298                     : binop(op, mkexpr(e0), mkU8(amt))
   7299      );
   7300    } else
   7301    if (sar) {
   7302      assign( e1, amt >= size
   7303                     ? binop(op, mkexpr(e0), mkU8(size-1))
   7304                     : binop(op, mkexpr(e0), mkU8(amt))
   7305      );
   7306    } else {
   7307       vassert(0);
   7308    }
   7309 
   7310    putMMXReg( eregLO3ofRM(rm), mkexpr(e1) );
   7311    return delta;
   7312 }
   7313 
   7314 
/* Completely handle all MMX instructions except emms.

   |opc| is read at |delta| and |delta| is advanced past the decoded
   instruction.  On success *decode_ok is set to True and the updated
   delta returned; on an undecodable opcode *decode_ok is set to False
   and the returned delta must be ignored by the caller.

   Emits do_MMX_preamble() unconditionally before decoding, since every
   insn handled here touches the MMX/x87 register file. */

static
ULong dis_MMX ( Bool* decode_ok,
                VexAbiInfo* vbi, Prefix pfx, Int sz, Long delta )
{
   Int   len;
   UChar modrm;
   HChar dis_buf[50];
   UChar opc = getUChar(delta);
   delta++;

   /* dis_MMX handles all insns except emms. */
   do_MMX_preamble();

   switch (opc) {

      case 0x6E:
         if (sz == 4) {
            /* MOVD (src)ireg32-or-mem32 (E), (dst)mmxreg (G)*/
            modrm = getUChar(delta);
            if (epartIsReg(modrm)) {
               delta++;
               putMMXReg(
                  gregLO3ofRM(modrm),
                  binop( Iop_32HLto64,
                         mkU32(0),
                         getIReg32(eregOfRexRM(pfx,modrm)) ) );
               DIP("movd %s, %s\n",
                   nameIReg32(eregOfRexRM(pfx,modrm)),
                   nameMMXReg(gregLO3ofRM(modrm)));
            } else {
               IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
               delta += len;
               putMMXReg(
                  gregLO3ofRM(modrm),
                  binop( Iop_32HLto64,
                         mkU32(0),
                         loadLE(Ity_I32, mkexpr(addr)) ) );
               DIP("movd %s, %s\n", dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
            }
         }
         else
         if (sz == 8) {
            /* MOVD (src)ireg64-or-mem64 (E), (dst)mmxreg (G)*/
            modrm = getUChar(delta);
            if (epartIsReg(modrm)) {
               delta++;
               putMMXReg( gregLO3ofRM(modrm),
                          getIReg64(eregOfRexRM(pfx,modrm)) );
               DIP("movd %s, %s\n",
                   nameIReg64(eregOfRexRM(pfx,modrm)),
                   nameMMXReg(gregLO3ofRM(modrm)));
            } else {
               IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
               delta += len;
               putMMXReg( gregLO3ofRM(modrm),
                          loadLE(Ity_I64, mkexpr(addr)) );
               DIP("movd{64} %s, %s\n", dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
            }
         }
         else {
            goto mmx_decode_failure;
         }
         break;

      case 0x7E:
         if (sz == 4) {
            /* MOVD (src)mmxreg (G), (dst)ireg32-or-mem32 (E) */
            modrm = getUChar(delta);
            if (epartIsReg(modrm)) {
               delta++;
               putIReg32( eregOfRexRM(pfx,modrm),
                          unop(Iop_64to32, getMMXReg(gregLO3ofRM(modrm)) ) );
               DIP("movd %s, %s\n",
                   nameMMXReg(gregLO3ofRM(modrm)),
                   nameIReg32(eregOfRexRM(pfx,modrm)));
            } else {
               IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
               delta += len;
               storeLE( mkexpr(addr),
                        unop(Iop_64to32, getMMXReg(gregLO3ofRM(modrm)) ) );
               DIP("movd %s, %s\n", nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
            }
         }
         else
         if (sz == 8) {
            /* MOVD (src)mmxreg (G), (dst)ireg64-or-mem64 (E) */
            modrm = getUChar(delta);
            if (epartIsReg(modrm)) {
               delta++;
               putIReg64( eregOfRexRM(pfx,modrm),
                          getMMXReg(gregLO3ofRM(modrm)) );
               DIP("movd %s, %s\n",
                   nameMMXReg(gregLO3ofRM(modrm)),
                   nameIReg64(eregOfRexRM(pfx,modrm)));
            } else {
               IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
               delta += len;
               storeLE( mkexpr(addr),
                       getMMXReg(gregLO3ofRM(modrm)) );
               DIP("movd{64} %s, %s\n", nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
            }
         } else {
            goto mmx_decode_failure;
         }
         break;

      case 0x6F:
         /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4
             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
            goto mmx_decode_failure;
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta++;
            putMMXReg( gregLO3ofRM(modrm), getMMXReg(eregLO3ofRM(modrm)) );
            DIP("movq %s, %s\n",
                nameMMXReg(eregLO3ofRM(modrm)),
                nameMMXReg(gregLO3ofRM(modrm)));
         } else {
            IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
            delta += len;
            putMMXReg( gregLO3ofRM(modrm), loadLE(Ity_I64, mkexpr(addr)) );
            DIP("movq %s, %s\n",
                dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
         }
         break;

      case 0x7F:
         /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
         if (sz != 4
             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
            goto mmx_decode_failure;
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta++;
            putMMXReg( eregLO3ofRM(modrm), getMMXReg(gregLO3ofRM(modrm)) );
            DIP("movq %s, %s\n",
                nameMMXReg(gregLO3ofRM(modrm)),
                nameMMXReg(eregLO3ofRM(modrm)));
         } else {
            IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
            delta += len;
            storeLE( mkexpr(addr), getMMXReg(gregLO3ofRM(modrm)) );
            DIP("mov(nt)q %s, %s\n",
                nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
         }
         break;

      case 0xFC:
      case 0xFD:
      case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "padd", True );
         break;

      case 0xEC:
      case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4
             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "padds", True );
         break;

      case 0xDC:
      case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "paddus", True );
         break;

      case 0xF8:
      case 0xF9:
      case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psub", True );
         break;

      case 0xE8:
      case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psubs", True );
         break;

      case 0xD8:
      case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psubus", True );
         break;

      case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmulhw", False );
         break;

      case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmullw", False );
         break;

      case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
         vassert(sz == 4);
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmaddwd", False );
         break;

      case 0x74:
      case 0x75:
      case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pcmpeq", True );
         break;

      case 0x64:
      case 0x65:
      case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pcmpgt", True );
         break;

      case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packssdw", False );
         break;

      case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packsswb", False );
         break;

      case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packuswb", False );
         break;

      case 0x68:
      case 0x69:
      case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4
             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "punpckh", True );
         break;

      case 0x60:
      case 0x61:
      case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4
             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "punpckl", True );
         break;

      case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pand", False );
         break;

      case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pandn", False );
         break;

      case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "por", False );
         break;

      case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pxor", False );
         break;

#     define SHIFT_BY_REG(_name,_op)                                     \
                delta = dis_MMX_shiftG_byE(vbi, pfx, delta, _name, _op); \
                break;

      /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
      case 0xF1: SHIFT_BY_REG("psllw", Iop_ShlN16x4);
      case 0xF2: SHIFT_BY_REG("pslld", Iop_ShlN32x2);
      case 0xF3: SHIFT_BY_REG("psllq", Iop_Shl64);

      /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
      case 0xD1: SHIFT_BY_REG("psrlw", Iop_ShrN16x4);
      case 0xD2: SHIFT_BY_REG("psrld", Iop_ShrN32x2);
      case 0xD3: SHIFT_BY_REG("psrlq", Iop_Shr64);

      /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
      case 0xE1: SHIFT_BY_REG("psraw", Iop_SarN16x4);
      case 0xE2: SHIFT_BY_REG("psrad", Iop_SarN32x2);

#     undef SHIFT_BY_REG

      case 0x71:
      case 0x72:
      case 0x73: {
         /* (sz==4): PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
         UChar byte2, subopc;
         if (sz != 4)
            goto mmx_decode_failure;
         byte2  = getUChar(delta);      /* amode / sub-opcode */
         subopc = toUChar( (byte2 >> 3) & 7 );

#        define SHIFT_BY_IMM(_name,_op)                        \
            do { delta = dis_MMX_shiftE_imm(delta,_name,_op);  \
            } while (0)

              if (subopc == 2 /*SRL*/ && opc == 0x71)
                  SHIFT_BY_IMM("psrlw", Iop_ShrN16x4);
         else if (subopc == 2 /*SRL*/ && opc == 0x72)
                 SHIFT_BY_IMM("psrld", Iop_ShrN32x2);
         else if (subopc == 2 /*SRL*/ && opc == 0x73)
                 SHIFT_BY_IMM("psrlq", Iop_Shr64);

         else if (subopc == 4 /*SAR*/ && opc == 0x71)
                 SHIFT_BY_IMM("psraw", Iop_SarN16x4);
         else if (subopc == 4 /*SAR*/ && opc == 0x72)
                 SHIFT_BY_IMM("psrad", Iop_SarN32x2);

         else if (subopc == 6 /*SHL*/ && opc == 0x71)
                 SHIFT_BY_IMM("psllw", Iop_ShlN16x4);
         else if (subopc == 6 /*SHL*/ && opc == 0x72)
                  SHIFT_BY_IMM("pslld", Iop_ShlN32x2);
         else if (subopc == 6 /*SHL*/ && opc == 0x73)
                 SHIFT_BY_IMM("psllq", Iop_Shl64);

         else goto mmx_decode_failure;

#        undef SHIFT_BY_IMM
         break;
      }

      case 0xF7: {
         /* MASKMOVQ: byte-masked store of G to [RDI].  The mask is
            the sign bit of each byte of E (SarN8x8 by 7 turns each
            byte into 0x00 or 0xFF); masked-off bytes keep the old
            memory contents.  Implemented as a read-modify-write of
            the whole 64-bit quantity. */
         IRTemp addr    = newTemp(Ity_I64);
         IRTemp regD    = newTemp(Ity_I64);
         IRTemp regM    = newTemp(Ity_I64);
         IRTemp mask    = newTemp(Ity_I64);
         IRTemp olddata = newTemp(Ity_I64);
         IRTemp newdata = newTemp(Ity_I64);

         modrm = getUChar(delta);
         if (sz != 4 || (!epartIsReg(modrm)))
            goto mmx_decode_failure;
         delta++;

         assign( addr, handleAddrOverrides( vbi, pfx, getIReg64(R_RDI) ));
         assign( regM, getMMXReg( eregLO3ofRM(modrm) ));
         assign( regD, getMMXReg( gregLO3ofRM(modrm) ));
         assign( mask, binop(Iop_SarN8x8, mkexpr(regM), mkU8(7)) );
         assign( olddata, loadLE( Ity_I64, mkexpr(addr) ));
         assign( newdata,
                 binop(Iop_Or64,
                       binop(Iop_And64,
                             mkexpr(regD),
                             mkexpr(mask) ),
                       binop(Iop_And64,
                             mkexpr(olddata),
                             unop(Iop_Not64, mkexpr(mask)))) );
         storeLE( mkexpr(addr), mkexpr(newdata) );
         DIP("maskmovq %s,%s\n", nameMMXReg( eregLO3ofRM(modrm) ),
                                 nameMMXReg( gregLO3ofRM(modrm) ) );
         break;
      }

      /* --- MMX decode failure --- */
      default:
      mmx_decode_failure:
         *decode_ok = False;
         return delta; /* ignored */

   }

   *decode_ok = True;
   return delta;
}
   7705 
   7706 
   7707 /*------------------------------------------------------------*/
   7708 /*--- More misc arithmetic and other obscure insns.        ---*/
   7709 /*------------------------------------------------------------*/
   7710 
   7711 /* Generate base << amt with vacated places filled with stuff
   7712    from xtra.  amt guaranteed in 0 .. 63. */
   7713 static
   7714 IRExpr* shiftL64_with_extras ( IRTemp base, IRTemp xtra, IRTemp amt )
   7715 {
   7716    /* if   amt == 0
   7717       then base
   7718       else (base << amt) | (xtra >>u (64-amt))
   7719    */
   7720    return
   7721       IRExpr_ITE(
   7722          binop(Iop_CmpNE8, mkexpr(amt), mkU8(0)),
   7723          binop(Iop_Or64,
   7724                binop(Iop_Shl64, mkexpr(base), mkexpr(amt)),
   7725                binop(Iop_Shr64, mkexpr(xtra),
   7726                                 binop(Iop_Sub8, mkU8(64), mkexpr(amt)))
   7727                ),
   7728          mkexpr(base)
   7729       );
   7730 }
   7731 
   7732 /* Generate base >>u amt with vacated places filled with stuff
   7733    from xtra.  amt guaranteed in 0 .. 63. */
   7734 static
   7735 IRExpr* shiftR64_with_extras ( IRTemp xtra, IRTemp base, IRTemp amt )
   7736 {
   7737    /* if   amt == 0
   7738       then base
   7739       else (base >>u amt) | (xtra << (64-amt))
   7740    */
   7741    return
   7742       IRExpr_ITE(
   7743          binop(Iop_CmpNE8, mkexpr(amt), mkU8(0)),
   7744          binop(Iop_Or64,
   7745                binop(Iop_Shr64, mkexpr(base), mkexpr(amt)),
   7746                binop(Iop_Shl64, mkexpr(xtra),
   7747                                 binop(Iop_Sub8, mkU8(64), mkexpr(amt)))
   7748                ),
   7749          mkexpr(base)
   7750       );
   7751 }
   7752 
   7753 /* Double length left and right shifts.  Apparently only required in
   7754    v-size (no b- variant). */
/* Decode and translate SHLD/SHRD Gv, Ev, <amt>.  Returns the updated
   delta, advanced past the modrm/amode bytes and (if amt_is_literal)
   the trailing imm8. */
static
ULong dis_SHLRD_Gv_Ev ( VexAbiInfo* vbi,
                        Prefix pfx,
                        Long delta, UChar modrm,
                        Int sz,
                        IRExpr* shift_amt,
                        Bool amt_is_literal,
                        const HChar* shift_amt_txt,
                        Bool left_shift )
{
   /* shift_amt :: Ity_I8 is the amount to shift.  shift_amt_txt is used
      for printing it.   And eip on entry points at the modrm byte. */
   Int len;
   HChar dis_buf[50];

   IRType ty     = szToITy(sz);
   IRTemp gsrc   = newTemp(ty);      /* G operand: bit source, unchanged */
   IRTemp esrc   = newTemp(ty);      /* E operand: destination, shifted */
   IRTemp addr   = IRTemp_INVALID;   /* E's address, if E is memory */
   IRTemp tmpSH  = newTemp(Ity_I8);  /* masked shift amount */
   IRTemp tmpSS  = newTemp(Ity_I8);  /* masked (shift amount - 1): "subshift" */
   IRTemp tmp64  = IRTemp_INVALID;   /* double-length word to be shifted */
   IRTemp res64  = IRTemp_INVALID;   /* result, shifted by tmpSH */
   IRTemp rss64  = IRTemp_INVALID;   /* result, shifted by tmpSS (for flags) */
   IRTemp resTy  = IRTemp_INVALID;
   IRTemp rssTy  = IRTemp_INVALID;
   /* Hardware masks the shift count to 6 bits in 64-bit mode, 5 bits
      otherwise. */
   Int    mask   = sz==8 ? 63 : 31;

   vassert(sz == 2 || sz == 4 || sz == 8);

   /* The E-part is the destination; this is shifted.  The G-part
      supplies bits to be shifted into the E-part, but is not
      changed.

      If shifting left, form a double-length word with E at the top
      and G at the bottom, and shift this left.  The result is then in
      the high part.

      If shifting right, form a double-length word with G at the top
      and E at the bottom, and shift this right.  The result is then
      at the bottom.  */

   /* Fetch the operands. */

   assign( gsrc, getIRegG(sz, pfx, modrm) );

   if (epartIsReg(modrm)) {
      delta++;
      assign( esrc, getIRegE(sz, pfx, modrm) );
      DIP("sh%cd%c %s, %s, %s\n",
          ( left_shift ? 'l' : 'r' ), nameISize(sz),
          shift_amt_txt,
          nameIRegG(sz, pfx, modrm), nameIRegE(sz, pfx, modrm));
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf,
                        /* # bytes following amode */
                        amt_is_literal ? 1 : 0 );
      delta += len;
      assign( esrc, loadLE(ty, mkexpr(addr)) );
      DIP("sh%cd%c %s, %s, %s\n",
          ( left_shift ? 'l' : 'r' ), nameISize(sz),
          shift_amt_txt,
          nameIRegG(sz, pfx, modrm), dis_buf);
   }

   /* Calculate the masked shift amount (tmpSH), the masked subshift
      amount (tmpSS), the shifted value (res64) and the subshifted
      value (rss64).  The subshifted value (shifted by one place less
      than tmpSH) is what the flags thunk needs in order to derive CF
      and OF later. */

   assign( tmpSH, binop(Iop_And8, shift_amt, mkU8(mask)) );
   assign( tmpSS, binop(Iop_And8,
                        binop(Iop_Sub8, mkexpr(tmpSH), mkU8(1) ),
                        mkU8(mask)));

   tmp64 = newTemp(Ity_I64);
   res64 = newTemp(Ity_I64);
   rss64 = newTemp(Ity_I64);

   if (sz == 2 || sz == 4) {

      /* G is xtra; E is data */
      /* what a freaking nightmare: */
      if (sz == 4 && left_shift) {
         /* [esrc'gsrc] shifted left; result lands in the top half,
            so shift it back down to bits 31:0. */
         assign( tmp64, binop(Iop_32HLto64, mkexpr(esrc), mkexpr(gsrc)) );
         assign( res64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)),
                       mkU8(32)) );
         assign( rss64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSS)),
                       mkU8(32)) );
      }
      else
      if (sz == 4 && !left_shift) {
         /* [gsrc'esrc] shifted right; result is already in bits 31:0. */
         assign( tmp64, binop(Iop_32HLto64, mkexpr(gsrc), mkexpr(esrc)) );
         assign( res64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSH)) );
         assign( rss64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSS)) );
      }
      else
      if (sz == 2 && left_shift) {
         /* Replicate gsrc so that shifts of more than 16 places pull
            in further copies of the G operand, matching hardware
            behaviour for 16-bit SHLD. */
         assign( tmp64,
                 binop(Iop_32HLto64,
                       binop(Iop_16HLto32, mkexpr(esrc), mkexpr(gsrc)),
                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc))
         ));
         /* result formed by shifting [esrc'gsrc'gsrc'gsrc] */
         assign( res64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)),
                       mkU8(48)) );
         /* subshift formed by shifting [esrc'0000'0000'0000] */
         assign( rss64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64,
                             binop(Iop_Shl64, unop(Iop_16Uto64, mkexpr(esrc)),
                                              mkU8(48)),
                             mkexpr(tmpSS)),
                       mkU8(48)) );
      }
      else
      if (sz == 2 && !left_shift) {
         /* Replicated gsrc again, mirrored for the right-shift case. */
         assign( tmp64,
                 binop(Iop_32HLto64,
                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc)),
                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(esrc))
         ));
         /* result formed by shifting [gsrc'gsrc'gsrc'esrc] */
         assign( res64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSH)) );
         /* subshift formed by shifting [0000'0000'0000'esrc] */
         assign( rss64, binop(Iop_Shr64,
                              unop(Iop_16Uto64, mkexpr(esrc)),
                              mkexpr(tmpSS)) );
      }

   } else {

      /* For the 64-bit case a double-length word cannot be formed
         directly, so use helpers that special-case a zero shift. */
      vassert(sz == 8);
      if (left_shift) {
         assign( res64, shiftL64_with_extras( esrc, gsrc, tmpSH ));
         assign( rss64, shiftL64_with_extras( esrc, gsrc, tmpSS ));
      } else {
         assign( res64, shiftR64_with_extras( gsrc, esrc, tmpSH ));
         assign( rss64, shiftR64_with_extras( gsrc, esrc, tmpSS ));
      }

   }

   /* Narrow the 64-bit results back to the operand size. */
   resTy = newTemp(ty);
   rssTy = newTemp(ty);
   assign( resTy, narrowTo(ty, mkexpr(res64)) );
   assign( rssTy, narrowTo(ty, mkexpr(rss64)) );

   /* Put result back and write the flags thunk.  Note Iop_Sar64 (not
      Shr64) is the tag used for the right-shift flag semantics. */
   setFlags_DEP1_DEP2_shift ( left_shift ? Iop_Shl64 : Iop_Sar64,
                              resTy, rssTy, ty, tmpSH );

   if (epartIsReg(modrm)) {
      putIRegE(sz, pfx, modrm, mkexpr(resTy));
   } else {
      storeLE( mkexpr(addr), mkexpr(resTy) );
   }

   /* Skip over the trailing imm8 shift amount, if there was one. */
   if (amt_is_literal) delta++;
   return delta;
}
   7921 
   7922 
   7923 /* Handle BT/BTS/BTR/BTC Gv, Ev.  Apparently b-size is not
   7924    required. */
   7925 
/* The four bit-test flavours: BT, BTS, BTR and BTC respectively. */
typedef enum { BtOpNone, BtOpSet, BtOpReset, BtOpComp } BtOp;
   7927 
   7928 static const HChar* nameBtOp ( BtOp op )
   7929 {
   7930    switch (op) {
   7931       case BtOpNone:  return "";
   7932       case BtOpSet:   return "s";
   7933       case BtOpReset: return "r";
   7934       case BtOpComp:  return "c";
   7935       default: vpanic("nameBtOp(amd64)");
   7936    }
   7937 }
   7938 
   7939 
/* Decode and translate BT/BTS/BTR/BTC G, E (op selects which).
   Returns the updated delta; sets *decode_OK to False on an invalid
   prefix combination, in which case the returned delta must be
   ignored. */
static
ULong dis_bt_G_E ( VexAbiInfo* vbi,
                   Prefix pfx, Int sz, Long delta, BtOp op,
                   /*OUT*/Bool* decode_OK )
{
   HChar  dis_buf[50];
   UChar  modrm;
   Int    len;
   IRTemp t_fetched, t_bitno0, t_bitno1, t_bitno2, t_addr0,
          t_addr1, t_rsp, t_mask, t_new;

   vassert(sz == 2 || sz == 4 || sz == 8);

   t_fetched = t_bitno0 = t_bitno1 = t_bitno2
             = t_addr0 = t_addr1 = t_rsp
             = t_mask = t_new = IRTemp_INVALID;

   t_fetched = newTemp(Ity_I8);   /* the byte containing the selected bit */
   t_new     = newTemp(Ity_I8);   /* that byte after the BTS/BTR/BTC update */
   t_bitno0  = newTemp(Ity_I64);  /* raw bit number from the G register */
   t_bitno1  = newTemp(Ity_I64);  /* bit number, masked if E is a register */
   t_bitno2  = newTemp(Ity_I8);   /* bit offset within the addressed byte */
   t_addr1   = newTemp(Ity_I64);
   modrm     = getUChar(delta);

   *decode_OK = True;
   if (epartIsReg(modrm)) {
      /* F2 and F3 are never acceptable. */
      if (haveF2orF3(pfx)) {
         *decode_OK = False;
         return delta;
      }
   } else {
      /* F2 or F3 (but not both) are allowed, provided LOCK is also
         present, and only for the BTC/BTS/BTR cases (not BT). */
      if (haveF2orF3(pfx)) {
         if (haveF2andF3(pfx) || !haveLOCK(pfx) || op == BtOpNone) {
            *decode_OK = False;
            return delta;
         }
      }
   }

   assign( t_bitno0, widenSto64(getIRegG(sz, pfx, modrm)) );

   if (epartIsReg(modrm)) {
      delta++;
      /* Get it onto the client's stack.  Oh, this is a horrible
         kludge.  See https://bugs.kde.org/show_bug.cgi?id=245925.
         Because of the ELF ABI stack redzone, there may be live data
         up to 128 bytes below %RSP.  So we can't just push it on the
         stack, else we may wind up trashing live data, and causing
         impossible-to-find simulation errors.  (Yes, this did
         happen.)  So we need to drop RSP before at least 128 before
         pushing it.  That unfortunately means hitting Memcheck's
         fast-case painting code.  Ideally we should drop more than
         128, to reduce the chances of breaking buggy programs that
         have live data below -128(%RSP).  Memcheck fast-cases moves
         of 288 bytes due to the need to handle ppc64-linux quickly,
         so let's use 288.  Of course the real fix is to get rid of
         this kludge entirely.  */
      t_rsp = newTemp(Ity_I64);
      t_addr0 = newTemp(Ity_I64);

      vassert(vbi->guest_stack_redzone_size == 128);
      assign( t_rsp, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(288)) );
      putIReg64(R_RSP, mkexpr(t_rsp));

      storeLE( mkexpr(t_rsp), getIRegE(sz, pfx, modrm) );

      /* Make t_addr0 point at it. */
      assign( t_addr0, mkexpr(t_rsp) );

      /* Mask out upper bits of the shift amount, since we're doing a
         reg. */
      assign( t_bitno1, binop(Iop_And64,
                              mkexpr(t_bitno0),
                              mkU64(sz == 8 ? 63 : sz == 4 ? 31 : 15)) );

   } else {
      t_addr0 = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
      delta += len;
      assign( t_bitno1, mkexpr(t_bitno0) );
   }

   /* At this point: t_addr0 is the address being operated on.  If it
      was a reg, we will have pushed it onto the client's stack.
      t_bitno1 is the bit number, suitably masked in the case of a
      reg.  */

   /* Now the main sequence.  Note the arithmetic (signed) shift:
      for the memory case t_bitno1 is unmasked and sign-widened, so a
      negative bit number addresses bytes below the base address, as
      the architecture requires. */
   assign( t_addr1,
           binop(Iop_Add64,
                 mkexpr(t_addr0),
                 binop(Iop_Sar64, mkexpr(t_bitno1), mkU8(3))) );

   /* t_addr1 now holds effective address */

   assign( t_bitno2,
           unop(Iop_64to8,
                binop(Iop_And64, mkexpr(t_bitno1), mkU64(7))) );

   /* t_bitno2 contains offset of bit within byte */

   /* Only the modifying forms (BTS/BTR/BTC) need a mask. */
   if (op != BtOpNone) {
      t_mask = newTemp(Ity_I8);
      assign( t_mask, binop(Iop_Shl8, mkU8(1), mkexpr(t_bitno2)) );
   }

   /* t_mask is now a suitable byte mask */

   assign( t_fetched, loadLE(Ity_I8, mkexpr(t_addr1)) );

   if (op != BtOpNone) {
      switch (op) {
         case BtOpSet:
            assign( t_new,
                    binop(Iop_Or8, mkexpr(t_fetched), mkexpr(t_mask)) );
            break;
         case BtOpComp:
            assign( t_new,
                    binop(Iop_Xor8, mkexpr(t_fetched), mkexpr(t_mask)) );
            break;
         case BtOpReset:
            assign( t_new,
                    binop(Iop_And8, mkexpr(t_fetched),
                                    unop(Iop_Not8, mkexpr(t_mask))) );
            break;
         default:
            vpanic("dis_bt_G_E(amd64)");
      }
      /* LOCKed memory form gets an atomic compare-and-swap; all
         other forms get a plain store. */
      if ((haveLOCK(pfx)) && !epartIsReg(modrm)) {
         casLE( mkexpr(t_addr1), mkexpr(t_fetched)/*expd*/,
                                 mkexpr(t_new)/*new*/,
                                 guest_RIP_curr_instr );
      } else {
         storeLE( mkexpr(t_addr1), mkexpr(t_new) );
      }
   }

   /* Side effect done; now get selected bit into Carry flag */
   /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop(Iop_And64,
                  binop(Iop_Shr64,
                        unop(Iop_8Uto64, mkexpr(t_fetched)),
                        mkexpr(t_bitno2)),
                  mkU64(1)))
       );
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

   /* Move reg operand from stack back to reg */
   if (epartIsReg(modrm)) {
      /* t_rsp still points at it. */
      /* only write the reg if actually modifying it; doing otherwise
         zeroes the top half erroneously when doing btl due to
         standard zero-extend rule */
      if (op != BtOpNone)
         putIRegE(sz, pfx, modrm, loadLE(szToITy(sz), mkexpr(t_rsp)) );
      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t_rsp), mkU64(288)) );
   }

   DIP("bt%s%c %s, %s\n",
       nameBtOp(op), nameISize(sz), nameIRegG(sz, pfx, modrm),
       ( epartIsReg(modrm) ? nameIRegE(sz, pfx, modrm) : dis_buf ) );

   return delta;
}
   8113 
   8114 
   8115 
   8116 /* Handle BSF/BSR.  Only v-size seems necessary. */
/* Decode and translate BSF (fwds==True) / BSR (fwds==False) E, G.
   Returns the updated delta.  If the source is zero the destination
   register is left unchanged, matching observed (though officially
   undefined) AMD64 behaviour. */
static
ULong dis_bs_E_G ( VexAbiInfo* vbi,
                   Prefix pfx, Int sz, Long delta, Bool fwds )
{
   Bool   isReg;
   UChar  modrm;
   HChar  dis_buf[50];

   IRType ty    = szToITy(sz);
   IRTemp src   = newTemp(ty);       /* E operand at its natural size */
   IRTemp dst   = newTemp(ty);       /* result at its natural size */
   IRTemp src64 = newTemp(Ity_I64);  /* E operand zero-widened to 64 bits */
   IRTemp dst64 = newTemp(Ity_I64);  /* 64-bit result before narrowing */
   IRTemp srcB  = newTemp(Ity_I1);   /* True iff source is nonzero */

   vassert(sz == 8 || sz == 4 || sz == 2);

   modrm = getUChar(delta);
   isReg = epartIsReg(modrm);
   if (isReg) {
      delta++;
      assign( src, getIRegE(sz, pfx, modrm) );
   } else {
      Int    len;
      IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
      delta += len;
      assign( src, loadLE(ty, mkexpr(addr)) );
   }

   DIP("bs%c%c %s, %s\n",
       fwds ? 'f' : 'r', nameISize(sz),
       ( isReg ? nameIRegE(sz, pfx, modrm) : dis_buf ),
       nameIRegG(sz, pfx, modrm));

   /* First, widen src to 64 bits if it is not already. */
   assign( src64, widenUto64(mkexpr(src)) );

   /* Generate a bool expression which is zero iff the original is
      zero, and nonzero otherwise.  Ask for a CmpNE version which, if
      instrumented by Memcheck, is instrumented expensively, since
      this may be used on the output of a preceding movmskb insn,
      which has been known to be partially defined, and in need of
      careful handling. */
   assign( srcB, binop(Iop_ExpCmpNE64, mkexpr(src64), mkU64(0)) );

   /* Flags: Z is 1 iff source value is zero.  All others
      are undefined -- we force them to zero. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            IRExpr_ITE( mkexpr(srcB),
                        /* src!=0 */
                        mkU64(0),
                        /* src==0 */
                        mkU64(AMD64G_CC_MASK_Z)
                        )
       ));
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

   /* Result: iff source value is zero, we can't use
      Iop_Clz64/Iop_Ctz64 as they have no defined result in that case.
      But anyway, amd64 semantics say the result is undefined in
      such situations.  Hence handle the zero case specially. */

   /* Bleh.  What we compute:

          bsf64:  if src == 0 then {dst is unchanged}
                              else Ctz64(src)

          bsr64:  if src == 0 then {dst is unchanged}
                              else 63 - Clz64(src)

          bsf32:  if src == 0 then {dst is unchanged}
                              else Ctz64(32Uto64(src))

          bsr32:  if src == 0 then {dst is unchanged}
                              else 63 - Clz64(32Uto64(src))

          bsf16:  if src == 0 then {dst is unchanged}
                              else Ctz64(32Uto64(16Uto32(src)))

          bsr16:  if src == 0 then {dst is unchanged}
                              else 63 - Clz64(32Uto64(16Uto32(src)))
   */

   /* The main computation, guarding against zero. */
   assign( dst64,
           IRExpr_ITE(
              mkexpr(srcB),
              /* src != 0 */
              fwds ? unop(Iop_Ctz64, mkexpr(src64))
                   : binop(Iop_Sub64,
                           mkU64(63),
                           unop(Iop_Clz64, mkexpr(src64))),
              /* src == 0 -- leave dst unchanged */
              widenUto64( getIRegG( sz, pfx, modrm ) )
           )
         );

   if (sz == 2)
      assign( dst, unop(Iop_64to16, mkexpr(dst64)) );
   else
   if (sz == 4)
      assign( dst, unop(Iop_64to32, mkexpr(dst64)) );
   else
      assign( dst, mkexpr(dst64) );

   /* dump result back */
   putIRegG( sz, pfx, modrm, mkexpr(dst) );

   return delta;
}
   8232 
   8233 
   8234 /* swap rAX with the reg specified by reg and REX.B */
   8235 static
   8236 void codegen_xchg_rAX_Reg ( Prefix pfx, Int sz, UInt regLo3 )
   8237 {
   8238    IRType ty = szToITy(sz);
   8239    IRTemp t1 = newTemp(ty);
   8240    IRTemp t2 = newTemp(ty);
   8241    vassert(sz == 2 || sz == 4 || sz == 8);
   8242    vassert(regLo3 < 8);
   8243    if (sz == 8) {
   8244       assign( t1, getIReg64(R_RAX) );
   8245       assign( t2, getIRegRexB(8, pfx, regLo3) );
   8246       putIReg64( R_RAX, mkexpr(t2) );
   8247       putIRegRexB(8, pfx, regLo3, mkexpr(t1) );
   8248    } else if (sz == 4) {
   8249       assign( t1, getIReg32(R_RAX) );
   8250       assign( t2, getIRegRexB(4, pfx, regLo3) );
   8251       putIReg32( R_RAX, mkexpr(t2) );
   8252       putIRegRexB(4, pfx, regLo3, mkexpr(t1) );
   8253    } else {
   8254       assign( t1, getIReg16(R_RAX) );
   8255       assign( t2, getIRegRexB(2, pfx, regLo3) );
   8256       putIReg16( R_RAX, mkexpr(t2) );
   8257       putIRegRexB(2, pfx, regLo3, mkexpr(t1) );
   8258    }
   8259    DIP("xchg%c %s, %s\n",
   8260        nameISize(sz), nameIRegRAX(sz),
   8261                       nameIRegRexB(sz,pfx, regLo3));
   8262 }
   8263 
   8264 
   8265 static
   8266 void codegen_SAHF ( void )
   8267 {
   8268    /* Set the flags to:
   8269       (amd64g_calculate_flags_all() & AMD64G_CC_MASK_O)
   8270                                     -- retain the old O flag
   8271       | (%AH & (AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
   8272                 |AMD64G_CC_MASK_P|AMD64G_CC_MASK_C)
   8273    */
   8274    ULong  mask_SZACP = AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
   8275                        |AMD64G_CC_MASK_C|AMD64G_CC_MASK_P;
   8276    IRTemp oldflags   = newTemp(Ity_I64);
   8277    assign( oldflags, mk_amd64g_calculate_rflags_all() );
   8278    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   8279    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   8280    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   8281    stmt( IRStmt_Put( OFFB_CC_DEP1,
   8282          binop(Iop_Or64,
   8283                binop(Iop_And64, mkexpr(oldflags), mkU64(AMD64G_CC_MASK_O)),
   8284                binop(Iop_And64,
   8285                      binop(Iop_Shr64, getIReg64(R_RAX), mkU8(8)),
   8286                      mkU64(mask_SZACP))
   8287               )
   8288    ));
   8289 }
   8290 
   8291 
   8292 static
   8293 void codegen_LAHF ( void  )
   8294 {
   8295    /* AH <- EFLAGS(SF:ZF:0:AF:0:PF:1:CF) */
   8296    IRExpr* rax_with_hole;
   8297    IRExpr* new_byte;
   8298    IRExpr* new_rax;
   8299    ULong   mask_SZACP = AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
   8300                         |AMD64G_CC_MASK_C|AMD64G_CC_MASK_P;
   8301 
   8302    IRTemp  flags = newTemp(Ity_I64);
   8303    assign( flags, mk_amd64g_calculate_rflags_all() );
   8304 
   8305    rax_with_hole
   8306       = binop(Iop_And64, getIReg64(R_RAX), mkU64(~0xFF00ULL));
   8307    new_byte
   8308       = binop(Iop_Or64, binop(Iop_And64, mkexpr(flags), mkU64(mask_SZACP)),
   8309                         mkU64(1<<1));
   8310    new_rax
   8311       = binop(Iop_Or64, rax_with_hole,
   8312                         binop(Iop_Shl64, new_byte, mkU8(8)));
   8313    putIReg64(R_RAX, new_rax);
   8314 }
   8315 
   8316 
/* Decode and translate CMPXCHG G, E.  Returns the updated delta; sets
   *ok to False on an invalid prefix combination, in which case the
   returned delta must be ignored. */
static
ULong dis_cmpxchg_G_E ( /*OUT*/Bool* ok,
                        VexAbiInfo*  vbi,
                        Prefix       pfx,
                        Int          size,
                        Long         delta0 )
{
   HChar dis_buf[50];
   Int   len;

   IRType ty    = szToITy(size);
   IRTemp acc   = newTemp(ty);       /* rAX: the expected value */
   IRTemp src   = newTemp(ty);       /* G: the replacement value */
   IRTemp dest  = newTemp(ty);       /* E: the old value at the target */
   IRTemp dest2 = newTemp(ty);       /* new value to write to E */
   IRTemp acc2  = newTemp(ty);       /* new value for rAX */
   IRTemp cond  = newTemp(Ity_I1);   /* True iff acc == dest */
   IRTemp addr  = IRTemp_INVALID;
   UChar  rm    = getUChar(delta0);

   /* There are 3 cases to consider:

      reg-reg: ignore any lock prefix, generate sequence based
               on ITE

      reg-mem, not locked: ignore any lock prefix, generate sequence
                           based on ITE

      reg-mem, locked: use IRCAS
   */

   /* Decide whether F2 or F3 are acceptable.  Never for register
      case, but for the memory case, one or the other is OK provided
      LOCK is also present. */
   if (epartIsReg(rm)) {
      if (haveF2orF3(pfx)) {
         *ok = False;
         return delta0;
      }
   } else {
      if (haveF2orF3(pfx)) {
         if (haveF2andF3(pfx) || !haveLOCK(pfx)) {
            *ok = False;
            return delta0;
         }
      }
   }

   if (epartIsReg(rm)) {
      /* case 1 */
      assign( dest, getIRegE(size, pfx, rm) );
      delta0++;
      assign( src, getIRegG(size, pfx, rm) );
      assign( acc, getIRegRAX(size) );
      /* Flags are those of the comparison (acc - dest), as for CMP. */
      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
      assign( cond, mk_amd64g_calculate_condition(AMD64CondZ) );
      /* On success E gets src; on failure rAX gets the old E value. */
      assign( dest2, IRExpr_ITE(mkexpr(cond), mkexpr(src), mkexpr(dest)) );
      assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
      putIRegRAX(size, mkexpr(acc2));
      putIRegE(size, pfx, rm, mkexpr(dest2));
      DIP("cmpxchg%c %s,%s\n", nameISize(size),
                               nameIRegG(size,pfx,rm),
                               nameIRegE(size,pfx,rm) );
   }
   else if (!epartIsReg(rm) && !haveLOCK(pfx)) {
      /* case 2 */
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign( dest, loadLE(ty, mkexpr(addr)) );
      delta0 += len;
      assign( src, getIRegG(size, pfx, rm) );
      assign( acc, getIRegRAX(size) );
      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
      assign( cond, mk_amd64g_calculate_condition(AMD64CondZ) );
      assign( dest2, IRExpr_ITE(mkexpr(cond), mkexpr(src), mkexpr(dest)) );
      assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
      putIRegRAX(size, mkexpr(acc2));
      storeLE( mkexpr(addr), mkexpr(dest2) );
      DIP("cmpxchg%c %s,%s\n", nameISize(size),
                               nameIRegG(size,pfx,rm), dis_buf);
   }
   else if (!epartIsReg(rm) && haveLOCK(pfx)) {
      /* case 3 */
      /* src is new value.  acc is expected value.  dest is old value.
         Compute success from the output of the IRCAS, and steer the
         new value for RAX accordingly: in case of success, RAX is
         unchanged. */
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      delta0 += len;
      assign( src, getIRegG(size, pfx, rm) );
      assign( acc, getIRegRAX(size) );
      /* The CAS both performs the atomic update and yields the old
         memory value in 'dest'. */
      stmt( IRStmt_CAS(
         mkIRCAS( IRTemp_INVALID, dest, Iend_LE, mkexpr(addr),
                  NULL, mkexpr(acc), NULL, mkexpr(src) )
      ));
      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
      assign( cond, mk_amd64g_calculate_condition(AMD64CondZ) );
      assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
      putIRegRAX(size, mkexpr(acc2));
      DIP("cmpxchg%c %s,%s\n", nameISize(size),
                               nameIRegG(size,pfx,rm), dis_buf);
   }
   else vassert(0);

   *ok = True;
   return delta0;
}
   8423 
   8424 
   8425 /* Handle conditional move instructions of the form
   8426       cmovcc E(reg-or-mem), G(reg)
   8427 
   8428    E(src) is reg-or-mem
   8429    G(dst) is reg.
   8430 
   8431    If E is reg, -->    GET %E, tmps
   8432                        GET %G, tmpd
   8433                        CMOVcc tmps, tmpd
   8434                        PUT tmpd, %G
   8435 
   8436    If E is mem  -->    (getAddr E) -> tmpa
   8437                        LD (tmpa), tmps
   8438                        GET %G, tmpd
   8439                        CMOVcc tmps, tmpd
   8440                        PUT tmpd, %G
   8441 */
   8442 static
   8443 ULong dis_cmov_E_G ( VexAbiInfo* vbi,
   8444                      Prefix        pfx,
   8445                      Int           sz,
   8446                      AMD64Condcode cond,
   8447                      Long          delta0 )
   8448 {
   8449    UChar rm  = getUChar(delta0);
   8450    HChar dis_buf[50];
   8451    Int   len;
   8452 
   8453    IRType ty   = szToITy(sz);
   8454    IRTemp tmps = newTemp(ty);
   8455    IRTemp tmpd = newTemp(ty);
   8456 
   8457    if (epartIsReg(rm)) {
   8458       assign( tmps, getIRegE(sz, pfx, rm) );
   8459       assign( tmpd, getIRegG(sz, pfx, rm) );
   8460 
   8461       putIRegG( sz, pfx, rm,
   8462                 IRExpr_ITE( mk_amd64g_calculate_condition(cond),
   8463                             mkexpr(tmps),
   8464                             mkexpr(tmpd) )
   8465               );
   8466       DIP("cmov%s %s,%s\n", name_AMD64Condcode(cond),
   8467                             nameIRegE(sz,pfx,rm),
   8468                             nameIRegG(sz,pfx,rm));
   8469       return 1+delta0;
   8470    }
   8471 
   8472    /* E refers to memory */
   8473    {
   8474       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   8475       assign( tmps, loadLE(ty, mkexpr(addr)) );
   8476       assign( tmpd, getIRegG(sz, pfx, rm) );
   8477 
   8478       putIRegG( sz, pfx, rm,
   8479                 IRExpr_ITE( mk_amd64g_calculate_condition(cond),
   8480                             mkexpr(tmps),
   8481                             mkexpr(tmpd) )
   8482               );
   8483 
   8484       DIP("cmov%s %s,%s\n", name_AMD64Condcode(cond),
   8485                             dis_buf,
   8486                             nameIRegG(sz,pfx,rm));
   8487       return len+delta0;
   8488    }
   8489 }
   8490 
   8491 
/* Disassemble XADD G,E.  The destination E (reg or mem) receives
   E+G, G receives the old value of E, and the flags are set as for
   an ADD.  On success sets *decode_ok to True and returns the delta
   just past the decoded instruction. */
static
ULong dis_xadd_G_E ( /*OUT*/Bool* decode_ok,
                     VexAbiInfo* vbi,
                     Prefix pfx, Int sz, Long delta0 )
{
   Int   len;
   UChar rm = getUChar(delta0);
   HChar dis_buf[50];

   IRType ty    = szToITy(sz);
   IRTemp tmpd  = newTemp(ty);   /* old value of the destination E */
   IRTemp tmpt0 = newTemp(ty);   /* value of G */
   IRTemp tmpt1 = newTemp(ty);   /* the sum tmpd + tmpt0 */

   /* There are 3 cases to consider:

      reg-reg: ignore any lock prefix,
               generate 'naive' (non-atomic) sequence

      reg-mem, not locked: ignore any lock prefix, generate 'naive'
                           (non-atomic) sequence

      reg-mem, locked: use IRCAS
   */

   if (epartIsReg(rm)) {
      /* case 1 */
      assign( tmpd, getIRegE(sz, pfx, rm) );
      assign( tmpt0, getIRegG(sz, pfx, rm) );
      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
                           mkexpr(tmpd), mkexpr(tmpt0)) );
      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
      /* Write G first, then E: if G and E are the same register the
         final value is the sum, as required. */
      putIRegG(sz, pfx, rm, mkexpr(tmpd));
      putIRegE(sz, pfx, rm, mkexpr(tmpt1));
      DIP("xadd%c %s, %s\n",
          nameISize(sz), nameIRegG(sz,pfx,rm), nameIRegE(sz,pfx,rm));
      *decode_ok = True;
      return 1+delta0;
   }
   else if (!epartIsReg(rm) && !haveLOCK(pfx)) {
      /* case 2: plain load/add/store, no atomicity required */
      IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign( tmpd,  loadLE(ty, mkexpr(addr)) );
      assign( tmpt0, getIRegG(sz, pfx, rm) );
      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
                           mkexpr(tmpd), mkexpr(tmpt0)) );
      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
      storeLE( mkexpr(addr), mkexpr(tmpt1) );
      putIRegG(sz, pfx, rm, mkexpr(tmpd));
      DIP("xadd%c %s, %s\n",
          nameISize(sz), nameIRegG(sz,pfx,rm), dis_buf);
      *decode_ok = True;
      return len+delta0;
   }
   else if (!epartIsReg(rm) && haveLOCK(pfx)) {
      /* case 3: locked - the memory update goes through a
         compare-and-swap so it is atomic. */
      IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign( tmpd,  loadLE(ty, mkexpr(addr)) );
      assign( tmpt0, getIRegG(sz, pfx, rm) );
      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
                           mkexpr(tmpd), mkexpr(tmpt0)) );
      casLE( mkexpr(addr), mkexpr(tmpd)/*expVal*/,
                           mkexpr(tmpt1)/*newVal*/, guest_RIP_curr_instr );
      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
      putIRegG(sz, pfx, rm, mkexpr(tmpd));
      DIP("xadd%c %s, %s\n",
          nameISize(sz), nameIRegG(sz,pfx,rm), dis_buf);
      *decode_ok = True;
      return len+delta0;
   }
   /*UNREACHED*/
   vassert(0);
}
   8565 
   8566 //.. /* Move 16 bits from Ew (ireg or mem) to G (a segment register). */
   8567 //..
   8568 //.. static
   8569 //.. UInt dis_mov_Ew_Sw ( UChar sorb, Long delta0 )
   8570 //.. {
   8571 //..    Int    len;
   8572 //..    IRTemp addr;
   8573 //..    UChar  rm  = getUChar(delta0);
   8574 //..    HChar  dis_buf[50];
   8575 //..
   8576 //..    if (epartIsReg(rm)) {
   8577 //..       putSReg( gregOfRM(rm), getIReg(2, eregOfRM(rm)) );
   8578 //..       DIP("movw %s,%s\n", nameIReg(2,eregOfRM(rm)), nameSReg(gregOfRM(rm)));
   8579 //..       return 1+delta0;
   8580 //..    } else {
   8581 //..       addr = disAMode ( &len, sorb, delta0, dis_buf );
   8582 //..       putSReg( gregOfRM(rm), loadLE(Ity_I16, mkexpr(addr)) );
   8583 //..       DIP("movw %s,%s\n", dis_buf, nameSReg(gregOfRM(rm)));
   8584 //..       return len+delta0;
   8585 //..    }
   8586 //.. }
   8587 //..
   8588 //.. /* Move 16 bits from G (a segment register) to Ew (ireg or mem).  If
   8589 //..    dst is ireg and sz==4, zero out top half of it.  */
   8590 //..
   8591 //.. static
   8592 //.. UInt dis_mov_Sw_Ew ( UChar sorb,
   8593 //..                      Int   sz,
   8594 //..                      UInt  delta0 )
   8595 //.. {
   8596 //..    Int    len;
   8597 //..    IRTemp addr;
   8598 //..    UChar  rm  = getUChar(delta0);
   8599 //..    HChar  dis_buf[50];
   8600 //..
   8601 //..    vassert(sz == 2 || sz == 4);
   8602 //..
   8603 //..    if (epartIsReg(rm)) {
   8604 //..       if (sz == 4)
   8605 //..          putIReg(4, eregOfRM(rm), unop(Iop_16Uto32, getSReg(gregOfRM(rm))));
   8606 //..       else
   8607 //..          putIReg(2, eregOfRM(rm), getSReg(gregOfRM(rm)));
   8608 //..
   8609 //..       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), nameIReg(sz,eregOfRM(rm)));
   8610 //..       return 1+delta0;
   8611 //..    } else {
   8612 //..       addr = disAMode ( &len, sorb, delta0, dis_buf );
   8613 //..       storeLE( mkexpr(addr), getSReg(gregOfRM(rm)) );
   8614 //..       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), dis_buf);
   8615 //..       return len+delta0;
   8616 //..    }
   8617 //.. }
   8618 //..
   8619 //..
   8620 //.. static
   8621 //.. void dis_push_segreg ( UInt sreg, Int sz )
   8622 //.. {
   8623 //..     IRTemp t1 = newTemp(Ity_I16);
   8624 //..     IRTemp ta = newTemp(Ity_I32);
   8625 //..     vassert(sz == 2 || sz == 4);
   8626 //..
   8627 //..     assign( t1, getSReg(sreg) );
   8628 //..     assign( ta, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)) );
   8629 //..     putIReg(4, R_ESP, mkexpr(ta));
   8630 //..     storeLE( mkexpr(ta), mkexpr(t1) );
   8631 //..
   8632 //..     DIP("pushw %s\n", nameSReg(sreg));
   8633 //.. }
   8634 //..
   8635 //.. static
   8636 //.. void dis_pop_segreg ( UInt sreg, Int sz )
   8637 //.. {
   8638 //..     IRTemp t1 = newTemp(Ity_I16);
   8639 //..     IRTemp ta = newTemp(Ity_I32);
   8640 //..     vassert(sz == 2 || sz == 4);
   8641 //..
   8642 //..     assign( ta, getIReg(4, R_ESP) );
   8643 //..     assign( t1, loadLE(Ity_I16, mkexpr(ta)) );
   8644 //..
   8645 //..     putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(ta), mkU32(sz)) );
   8646 //..     putSReg( sreg, mkexpr(t1) );
   8647 //..     DIP("pop %s\n", nameSReg(sreg));
   8648 //.. }
   8649 
   8650 static
   8651 void dis_ret ( /*MOD*/DisResult* dres, VexAbiInfo* vbi, ULong d64 )
   8652 {
   8653    IRTemp t1 = newTemp(Ity_I64);
   8654    IRTemp t2 = newTemp(Ity_I64);
   8655    IRTemp t3 = newTemp(Ity_I64);
   8656    assign(t1, getIReg64(R_RSP));
   8657    assign(t2, loadLE(Ity_I64,mkexpr(t1)));
   8658    assign(t3, binop(Iop_Add64, mkexpr(t1), mkU64(8+d64)));
   8659    putIReg64(R_RSP, mkexpr(t3));
   8660    make_redzone_AbiHint(vbi, t3, t2/*nia*/, "ret");
   8661    jmp_treg(dres, Ijk_Ret, t2);
   8662    vassert(dres->whatNext == Dis_StopHere);
   8663 }
   8664 
   8665 
   8666 /*------------------------------------------------------------*/
   8667 /*--- SSE/SSE2/SSE3 helpers                                ---*/
   8668 /*------------------------------------------------------------*/
   8669 
   8670 /* Indicates whether the op requires a rounding-mode argument.  Note
   8671    that this covers only vector floating point arithmetic ops, and
   8672    omits the scalar ones that need rounding modes.  Note also that
   8673    inconsistencies here will get picked up later by the IR sanity
   8674    checker, so this isn't correctness-critical. */
   8675 static Bool requiresRMode ( IROp op )
   8676 {
   8677    switch (op) {
   8678       /* 128 bit ops */
   8679       case Iop_Add32Fx4: case Iop_Sub32Fx4:
   8680       case Iop_Mul32Fx4: case Iop_Div32Fx4:
   8681       case Iop_Add64Fx2: case Iop_Sub64Fx2:
   8682       case Iop_Mul64Fx2: case Iop_Div64Fx2:
   8683       /* 256 bit ops */
   8684       case Iop_Add32Fx8: case Iop_Sub32Fx8:
   8685       case Iop_Mul32Fx8: case Iop_Div32Fx8:
   8686       case Iop_Add64Fx4: case Iop_Sub64Fx4:
   8687       case Iop_Mul64Fx4: case Iop_Div64Fx4:
   8688          return True;
   8689       default:
   8690          break;
   8691    }
   8692    return False;
   8693 }
   8694 
   8695 
   8696 /* Worker function; do not call directly.
   8697    Handles full width G = G `op` E   and   G = (not G) `op` E.
   8698 */
   8699 
static ULong dis_SSE_E_to_G_all_wrk (
                VexAbiInfo* vbi,
                Prefix pfx, Long delta,
                const HChar* opname, IROp op,
                Bool   invertG
             )
{
   HChar   dis_buf[50];
   Int     alen;
   IRTemp  addr;
   UChar   rm = getUChar(delta);
   /* Some vector FP ops additionally take a rounding-mode operand;
      see requiresRMode. */
   Bool    needsRMode = requiresRMode(op);
   /* The G operand, optionally complemented (for the ANDN-style
      "(not G) `op` E" forms). */
   IRExpr* gpart
      = invertG ? unop(Iop_NotV128, getXMMReg(gregOfRexRM(pfx,rm)))
                : getXMMReg(gregOfRexRM(pfx,rm));
   if (epartIsReg(rm)) {
      /* E is an XMM register. */
      putXMMReg(
         gregOfRexRM(pfx,rm),
         needsRMode
            ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        gpart,
                        getXMMReg(eregOfRexRM(pfx,rm)))
            : binop(op, gpart,
                        getXMMReg(eregOfRexRM(pfx,rm)))
      );
      DIP("%s %s,%s\n", opname,
                        nameXMMReg(eregOfRexRM(pfx,rm)),
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+1;
   } else {
      /* E is memory: a full 128-bit load. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      putXMMReg(
         gregOfRexRM(pfx,rm),
         needsRMode
            ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        gpart,
                        loadLE(Ity_V128, mkexpr(addr)))
            : binop(op, gpart,
                        loadLE(Ity_V128, mkexpr(addr)))
      );
      DIP("%s %s,%s\n", opname,
                        dis_buf,
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+alen;
   }
}
   8746 
   8747 
   8748 /* All lanes SSE binary operation, G = G `op` E. */
   8749 
   8750 static
   8751 ULong dis_SSE_E_to_G_all ( VexAbiInfo* vbi,
   8752                            Prefix pfx, Long delta,
   8753                            const HChar* opname, IROp op )
   8754 {
   8755    return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, False );
   8756 }
   8757 
   8758 /* All lanes SSE binary operation, G = (not G) `op` E. */
   8759 
   8760 static
   8761 ULong dis_SSE_E_to_G_all_invG ( VexAbiInfo* vbi,
   8762                                 Prefix pfx, Long delta,
   8763                                 const HChar* opname, IROp op )
   8764 {
   8765    return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, True );
   8766 }
   8767 
   8768 
   8769 /* Lowest 32-bit lane only SSE binary operation, G = G `op` E. */
   8770 
static ULong dis_SSE_E_to_G_lo32 ( VexAbiInfo* vbi,
                                   Prefix pfx, Long delta,
                                   const HChar* opname, IROp op )
{
   HChar   dis_buf[50];
   Int     alen;
   IRTemp  addr;
   UChar   rm = getUChar(delta);
   /* G is both the left operand and the destination. */
   IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
   if (epartIsReg(rm)) {
      /* E is an XMM register: the op itself only touches lane 0, so
         the whole register can be passed through. */
      putXMMReg( gregOfRexRM(pfx,rm),
                 binop(op, gpart,
                           getXMMReg(eregOfRexRM(pfx,rm))) );
      DIP("%s %s,%s\n", opname,
                        nameXMMReg(eregOfRexRM(pfx,rm)),
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+1;
   } else {
      /* We can only do a 32-bit memory read, so the upper 3/4 of the
         E operand needs to be made simply of zeroes. */
      IRTemp epart = newTemp(Ity_V128);
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( epart, unop( Iop_32UtoV128,
                           loadLE(Ity_I32, mkexpr(addr))) );
      putXMMReg( gregOfRexRM(pfx,rm),
                 binop(op, gpart, mkexpr(epart)) );
      DIP("%s %s,%s\n", opname,
                        dis_buf,
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+alen;
   }
}
   8803 
   8804 
   8805 /* Lower 64-bit lane only SSE binary operation, G = G `op` E. */
   8806 
   8807 static ULong dis_SSE_E_to_G_lo64 ( VexAbiInfo* vbi,
   8808                                    Prefix pfx, Long delta,
   8809                                    const HChar* opname, IROp op )
   8810 {
   8811    HChar   dis_buf[50];
   8812    Int     alen;
   8813    IRTemp  addr;
   8814    UChar   rm = getUChar(delta);
   8815    IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
   8816    if (epartIsReg(rm)) {
   8817       putXMMReg( gregOfRexRM(pfx,rm),
   8818                  binop(op, gpart,
   8819                            getXMMReg(eregOfRexRM(pfx,rm))) );
   8820       DIP("%s %s,%s\n", opname,
   8821                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8822                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8823       return delta+1;
   8824    } else {
   8825       /* We can only do a 64-bit memory read, so the upper half of the
   8826          E operand needs to be made simply of zeroes. */
   8827       IRTemp epart = newTemp(Ity_V128);
   8828       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8829       assign( epart, unop( Iop_64UtoV128,
   8830                            loadLE(Ity_I64, mkexpr(addr))) );
   8831       putXMMReg( gregOfRexRM(pfx,rm),
   8832                  binop(op, gpart, mkexpr(epart)) );
   8833       DIP("%s %s,%s\n", opname,
   8834                         dis_buf,
   8835                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8836       return delta+alen;
   8837    }
   8838 }
   8839 
   8840 
   8841 /* All lanes unary SSE operation, G = op(E). */
   8842 
   8843 static ULong dis_SSE_E_to_G_unary_all (
   8844                 VexAbiInfo* vbi,
   8845                 Prefix pfx, Long delta,
   8846                 const HChar* opname, IROp op
   8847              )
   8848 {
   8849    HChar   dis_buf[50];
   8850    Int     alen;
   8851    IRTemp  addr;
   8852    UChar   rm = getUChar(delta);
   8853    if (epartIsReg(rm)) {
   8854       putXMMReg( gregOfRexRM(pfx,rm),
   8855                  unop(op, getXMMReg(eregOfRexRM(pfx,rm))) );
   8856       DIP("%s %s,%s\n", opname,
   8857                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8858                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8859       return delta+1;
   8860    } else {
   8861       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8862       putXMMReg( gregOfRexRM(pfx,rm),
   8863                  unop(op, loadLE(Ity_V128, mkexpr(addr))) );
   8864       DIP("%s %s,%s\n", opname,
   8865                         dis_buf,
   8866                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8867       return delta+alen;
   8868    }
   8869 }
   8870 
   8871 
   8872 /* Lowest 32-bit lane only unary SSE operation, G = op(E). */
   8873 
static ULong dis_SSE_E_to_G_unary_lo32 (
                VexAbiInfo* vbi,
                Prefix pfx, Long delta,
                const HChar* opname, IROp op
             )
{
   /* First we need to get the old G value and patch the low 32 bits
      of the E operand into it.  Then apply op and write back to G. */
   HChar   dis_buf[50];
   Int     alen;
   IRTemp  addr;
   UChar   rm = getUChar(delta);
   IRTemp  oldG0 = newTemp(Ity_V128);   /* G as it currently stands */
   IRTemp  oldG1 = newTemp(Ity_V128);   /* G with E's lane 0 spliced in */

   assign( oldG0, getXMMReg(gregOfRexRM(pfx,rm)) );

   if (epartIsReg(rm)) {
      /* E is an XMM register: take its lane 0. */
      assign( oldG1,
              binop( Iop_SetV128lo32,
                     mkexpr(oldG0),
                     getXMMRegLane32(eregOfRexRM(pfx,rm), 0)) );
      putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
      DIP("%s %s,%s\n", opname,
                        nameXMMReg(eregOfRexRM(pfx,rm)),
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+1;
   } else {
      /* E is memory: a 32-bit load supplies lane 0. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( oldG1,
              binop( Iop_SetV128lo32,
                     mkexpr(oldG0),
                     loadLE(Ity_I32, mkexpr(addr)) ));
      putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
      DIP("%s %s,%s\n", opname,
                        dis_buf,
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+alen;
   }
}
   8914 
   8915 
   8916 /* Lowest 64-bit lane only unary SSE operation, G = op(E). */
   8917 
static ULong dis_SSE_E_to_G_unary_lo64 (
                VexAbiInfo* vbi,
                Prefix pfx, Long delta,
                const HChar* opname, IROp op
             )
{
   /* First we need to get the old G value and patch the low 64 bits
      of the E operand into it.  Then apply op and write back to G. */
   HChar   dis_buf[50];
   Int     alen;
   IRTemp  addr;
   UChar   rm = getUChar(delta);
   IRTemp  oldG0 = newTemp(Ity_V128);   /* G as it currently stands */
   IRTemp  oldG1 = newTemp(Ity_V128);   /* G with E's lane 0 spliced in */

   assign( oldG0, getXMMReg(gregOfRexRM(pfx,rm)) );

   if (epartIsReg(rm)) {
      /* E is an XMM register: take its lane 0. */
      assign( oldG1,
              binop( Iop_SetV128lo64,
                     mkexpr(oldG0),
                     getXMMRegLane64(eregOfRexRM(pfx,rm), 0)) );
      putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
      DIP("%s %s,%s\n", opname,
                        nameXMMReg(eregOfRexRM(pfx,rm)),
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+1;
   } else {
      /* E is memory: a 64-bit load supplies lane 0. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( oldG1,
              binop( Iop_SetV128lo64,
                     mkexpr(oldG0),
                     loadLE(Ity_I64, mkexpr(addr)) ));
      putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
      DIP("%s %s,%s\n", opname,
                        dis_buf,
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+alen;
   }
}
   8958 
   8959 
   8960 /* SSE integer binary operation:
   8961       G = G `op` E   (eLeft == False)
   8962       G = E `op` G   (eLeft == True)
   8963 */
static ULong dis_SSEint_E_to_G(
                VexAbiInfo* vbi,
                Prefix pfx, Long delta,
                const HChar* opname, IROp op,
                Bool   eLeft
             )
{
   HChar   dis_buf[50];
   Int     alen;
   IRTemp  addr;
   UChar   rm = getUChar(delta);
   IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
   IRExpr* epart = NULL;
   if (epartIsReg(rm)) {
      /* E is an XMM register. */
      epart = getXMMReg(eregOfRexRM(pfx,rm));
      DIP("%s %s,%s\n", opname,
                        nameXMMReg(eregOfRexRM(pfx,rm)),
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      delta += 1;
   } else {
      /* E is memory: a full 128-bit load. */
      addr  = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      epart = loadLE(Ity_V128, mkexpr(addr));
      DIP("%s %s,%s\n", opname,
                        dis_buf,
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      delta += alen;
   }
   /* eLeft selects the operand order, which matters for
      non-commutative ops. */
   putXMMReg( gregOfRexRM(pfx,rm),
              eLeft ? binop(op, epart, gpart)
                    : binop(op, gpart, epart) );
   return delta;
}
   8996 
   8997 
   8998 /* Helper for doing SSE FP comparisons.  False return ==> unhandled.
   8999    This is all a bit of a kludge in that it ignores the subtleties of
   9000    ordered-vs-unordered and signalling-vs-nonsignalling in the Intel
   9001    spec. */
/* Map a CMPPS/CMPPD/CMPSS/CMPSD-style imm8 predicate to an IR
   comparison plan: whether to swap the operands first (*preSwapP),
   which IR compare op to use (*opP), and whether to complement the
   result afterwards (*postNotP).  all_lanes and sz select the lane
   count/width variant of the op.  Returns False for predicates that
   are not handled. */
static Bool findSSECmpOp ( /*OUT*/Bool* preSwapP,
                           /*OUT*/IROp* opP,
                           /*OUT*/Bool* postNotP,
                           UInt imm8, Bool all_lanes, Int sz )
{
   if (imm8 >= 32) return False;

   /* First, compute a (preSwap, op, postNot) triple from
      the supplied imm8. */
   Bool pre = False;
   IROp op  = Iop_INVALID;
   Bool not = False;

#  define XXX(_pre, _op, _not) { pre = _pre; op = _op; not = _not; }
   // If you add a case here, add a corresponding test for both VCMPSD_128
   // and VCMPSS_128 in avx-1.c.
   switch (imm8) {
      // "O" = ordered, "U" = unordered
      // "Q" = non-signalling (quiet), "S" = signalling
      //
      //             swap operands?
      //             |
      //             |      cmp op          invert after?
      //             |      |               |
      //             v      v               v
      case 0x0:  XXX(False, Iop_CmpEQ32Fx4, False); break; // EQ_OQ
      case 0x1:  XXX(False, Iop_CmpLT32Fx4, False); break; // LT_OS
      case 0x2:  XXX(False, Iop_CmpLE32Fx4, False); break; // LE_OS
      case 0x3:  XXX(False, Iop_CmpUN32Fx4, False); break; // UNORD_Q
      case 0x4:  XXX(False, Iop_CmpEQ32Fx4, True);  break; // NEQ_UQ
      case 0x5:  XXX(False, Iop_CmpLT32Fx4, True);  break; // NLT_US
      case 0x6:  XXX(False, Iop_CmpLE32Fx4, True);  break; // NLE_US
      case 0x7:  XXX(False, Iop_CmpUN32Fx4, True);  break; // ORD_Q
      case 0x8:  XXX(False, Iop_CmpEQ32Fx4, False); break; // EQ_UQ
      case 0x9:  XXX(True,  Iop_CmpLE32Fx4, True);  break; // NGE_US
      /* "Enhanced Comparison Predicate[s] for VEX-Encoded [insns] */
      case 0xA:  XXX(True,  Iop_CmpLT32Fx4, True);  break; // NGT_US
      // 0xB  FALSE_OQ
      // 0xC: this isn't really right because it returns all-1s when
      // either operand is a NaN, and it should return all-0s.
      case 0xC:  XXX(False, Iop_CmpEQ32Fx4, True);  break; // NEQ_OQ
      case 0xD:  XXX(True,  Iop_CmpLE32Fx4, False); break; // GE_OS
      case 0xE:  XXX(True,  Iop_CmpLT32Fx4, False); break; // GT_OS
      // 0xF  TRUE_UQ
      // 0x10  EQ_OS
      case 0x11: XXX(False, Iop_CmpLT32Fx4, False); break; // LT_OQ
      case 0x12: XXX(False, Iop_CmpLE32Fx4, False); break; // LE_OQ
      // 0x13  UNORD_S
      // 0x14  NEQ_US
      // 0x15  NLT_UQ
      case 0x16: XXX(False, Iop_CmpLE32Fx4, True);  break; // NLE_UQ
      // 0x17  ORD_S
      // 0x18  EQ_US
      // 0x19  NGE_UQ
      // 0x1A  NGT_UQ
      // 0x1B  FALSE_OS
      // 0x1C  NEQ_OS
      // 0x1D  GE_OQ
      case 0x1E: XXX(True,  Iop_CmpLT32Fx4, False); break; // GT_OQ
      // 0x1F  TRUE_US
      /* Don't forget to add test cases to VCMPSS_128_<imm8> in
         avx-1.c if new cases turn up. */
      default: break;
   }
#  undef XXX
   if (op == Iop_INVALID) return False;

   /* Now convert the op into one with the same arithmetic but that is
      correct for the width and laneage requirements. */

   /* The table above is written in terms of the 32Fx4 ops; this first
      case is therefore an identity mapping. */
   /**/ if (sz == 4 && all_lanes) {
      switch (op) {
         case Iop_CmpEQ32Fx4: op = Iop_CmpEQ32Fx4; break;
         case Iop_CmpLT32Fx4: op = Iop_CmpLT32Fx4; break;
         case Iop_CmpLE32Fx4: op = Iop_CmpLE32Fx4; break;
         case Iop_CmpUN32Fx4: op = Iop_CmpUN32Fx4; break;
         default: vassert(0);
      }
   }
   else if (sz == 4 && !all_lanes) {
      switch (op) {
         case Iop_CmpEQ32Fx4: op = Iop_CmpEQ32F0x4; break;
         case Iop_CmpLT32Fx4: op = Iop_CmpLT32F0x4; break;
         case Iop_CmpLE32Fx4: op = Iop_CmpLE32F0x4; break;
         case Iop_CmpUN32Fx4: op = Iop_CmpUN32F0x4; break;
         default: vassert(0);
      }
   }
   else if (sz == 8 && all_lanes) {
      switch (op) {
         case Iop_CmpEQ32Fx4: op = Iop_CmpEQ64Fx2; break;
         case Iop_CmpLT32Fx4: op = Iop_CmpLT64Fx2; break;
         case Iop_CmpLE32Fx4: op = Iop_CmpLE64Fx2; break;
         case Iop_CmpUN32Fx4: op = Iop_CmpUN64Fx2; break;
         default: vassert(0);
      }
   }
   else if (sz == 8 && !all_lanes) {
      switch (op) {
         case Iop_CmpEQ32Fx4: op = Iop_CmpEQ64F0x2; break;
         case Iop_CmpLT32Fx4: op = Iop_CmpLT64F0x2; break;
         case Iop_CmpLE32Fx4: op = Iop_CmpLE64F0x2; break;
         case Iop_CmpUN32Fx4: op = Iop_CmpUN64F0x2; break;
         default: vassert(0);
      }
   }
   else {
      vpanic("findSSECmpOp(amd64,guest)");
   }

   *preSwapP = pre; *opP = op; *postNotP = not;
   return True;
}
   9115 
   9116 
   9117 /* Handles SSE 32F/64F comparisons.  It can fail, in which case it
   9118    returns the original delta to indicate failure. */
   9119 
static Long dis_SSE_cmp_E_to_G ( VexAbiInfo* vbi,
                                 Prefix pfx, Long delta,
                                 const HChar* opname, Bool all_lanes, Int sz )
{
   Long    delta0 = delta;   /* remembered so failure can be signalled */
   HChar   dis_buf[50];
   Int     alen;
   UInt    imm8;
   IRTemp  addr;
   Bool    preSwap = False;
   IROp    op      = Iop_INVALID;
   Bool    postNot = False;
   IRTemp  plain   = newTemp(Ity_V128);  /* result before any postNot */
   UChar   rm      = getUChar(delta);
   UShort  mask    = 0;
   vassert(sz == 4 || sz == 8);
   if (epartIsReg(rm)) {
      imm8 = getUChar(delta+1);
      /* Only the 8 baseline (non-AVX) predicates are accepted here. */
      if (imm8 >= 8) return delta0; /* FAIL */
      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
      if (!ok) return delta0; /* FAIL */
      vassert(!preSwap); /* never needed for imm8 < 8 */
      assign( plain, binop(op, getXMMReg(gregOfRexRM(pfx,rm)),
                               getXMMReg(eregOfRexRM(pfx,rm))) );
      delta += 2;
      DIP("%s $%d,%s,%s\n", opname,
                            (Int)imm8,
                            nameXMMReg(eregOfRexRM(pfx,rm)),
                            nameXMMReg(gregOfRexRM(pfx,rm)) );
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
      imm8 = getUChar(delta+alen);
      if (imm8 >= 8) return delta0; /* FAIL */
      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
      if (!ok) return delta0; /* FAIL */
      vassert(!preSwap); /* never needed for imm8 < 8 */
      /* Load as much of E as the variant needs: the whole vector for
         all-lanes forms, else just the low 64 or 32 bits, zero
         extended. */
      assign( plain,
              binop(
                 op,
                 getXMMReg(gregOfRexRM(pfx,rm)),
                   all_lanes
                      ? loadLE(Ity_V128, mkexpr(addr))
                   : sz == 8
                      ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
                   : /*sz==4*/
                      unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)))
              )
      );
      delta += alen+1;
      DIP("%s $%d,%s,%s\n", opname,
                            (Int)imm8,
                            dis_buf,
                            nameXMMReg(gregOfRexRM(pfx,rm)) );
   }

   if (postNot && all_lanes) {
      /* Complement every lane. */
      putXMMReg( gregOfRexRM(pfx,rm),
                 unop(Iop_NotV128, mkexpr(plain)) );
   }
   else
   if (postNot && !all_lanes) {
      /* Complement only the low lane; the xor mask covers just the
         low 4 (sz==4) or 8 (sz==8) bytes. */
      mask = toUShort(sz==4 ? 0x000F : 0x00FF);
      putXMMReg( gregOfRexRM(pfx,rm),
                 binop(Iop_XorV128, mkexpr(plain), mkV128(mask)) );
   }
   else {
      putXMMReg( gregOfRexRM(pfx,rm), mkexpr(plain) );
   }

   return delta;
}
   9191 
   9192 
   9193 /* Vector by scalar shift of G by the amount specified at the bottom
   9194    of E. */
   9195 
   9196 static ULong dis_SSE_shiftG_byE ( VexAbiInfo* vbi,
   9197                                   Prefix pfx, Long delta,
   9198                                   const HChar* opname, IROp op )
   9199 {
   9200    HChar   dis_buf[50];
   9201    Int     alen, size;
   9202    IRTemp  addr;
   9203    Bool    shl, shr, sar;
   9204    UChar   rm   = getUChar(delta);
   9205    IRTemp  g0   = newTemp(Ity_V128);
   9206    IRTemp  g1   = newTemp(Ity_V128);
   9207    IRTemp  amt  = newTemp(Ity_I64);
   9208    IRTemp  amt8 = newTemp(Ity_I8);
   9209    if (epartIsReg(rm)) {
   9210       assign( amt, getXMMRegLane64(eregOfRexRM(pfx,rm), 0) );
   9211       DIP("%s %s,%s\n", opname,
   9212                         nameXMMReg(eregOfRexRM(pfx,rm)),
   9213                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   9214       delta++;
   9215    } else {
   9216       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   9217       assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
   9218       DIP("%s %s,%s\n", opname,
   9219                         dis_buf,
   9220                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   9221       delta += alen;
   9222    }
   9223    assign( g0,   getXMMReg(gregOfRexRM(pfx,rm)) );
   9224    assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
   9225 
   9226    shl = shr = sar = False;
   9227    size = 0;
   9228    switch (op) {
   9229       case Iop_ShlN16x8: shl = True; size = 32; break;
   9230       case Iop_ShlN32x4: shl = True; size = 32; break;
   9231       case Iop_ShlN64x2: shl = True; size = 64; break;
   9232       case Iop_SarN16x8: sar = True; size = 16; break;
   9233       case Iop_SarN32x4: sar = True; size = 32; break;
   9234       case Iop_ShrN16x8: shr = True; size = 16; break;
   9235       case Iop_ShrN32x4: shr = True; size = 32; break;
   9236       case Iop_ShrN64x2: shr = True; size = 64; break;
   9237       default: vassert(0);
   9238    }
   9239 
   9240    if (shl || shr) {
   9241      assign(
   9242         g1,
   9243         IRExpr_ITE(
   9244            binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
   9245            binop(op, mkexpr(g0), mkexpr(amt8)),
   9246            mkV128(0x0000)
   9247         )
   9248      );
   9249    } else
   9250    if (sar) {
   9251      assign(
   9252         g1,
   9253         IRExpr_ITE(
   9254            binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
   9255            binop(op, mkexpr(g0), mkexpr(amt8)),
   9256            binop(op, mkexpr(g0), mkU8(size-1))
   9257         )
   9258      );
   9259    } else {
   9260       vassert(0);
   9261    }
   9262 
   9263    putXMMReg( gregOfRexRM(pfx,rm), mkexpr(g1) );
   9264    return delta;
   9265 }
   9266 
   9267 
   9268 /* Vector by scalar shift of E by an immediate byte. */
   9269 
static
ULong dis_SSE_shiftE_imm ( Prefix pfx,
                           Long delta, const HChar* opname, IROp op )
{
   /* Vector-by-immediate shift of the E (register) operand: decode
      the modrm + imm8 at 'delta', emit IR applying 'op' with the
      immediate count to the selected XMM register in place, and
      return the updated 'delta'. */
   Bool    shl, shr, sar;
   UChar   rm   = getUChar(delta);
   IRTemp  e0   = newTemp(Ity_V128);
   IRTemp  e1   = newTemp(Ity_V128);
   UChar   amt, size;
   /* These encodings only take a register operand; the reg field
      selects the shift group (/2 = shr, /4 = sar, /6 = shl). */
   vassert(epartIsReg(rm));
   vassert(gregLO3ofRM(rm) == 2
           || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
   amt = getUChar(delta+1);
   delta += 2;                   /* consume modrm byte + imm8 */
   DIP("%s $%d,%s\n", opname,
                      (Int)amt,
                      nameXMMReg(eregOfRexRM(pfx,rm)) );
   assign( e0, getXMMReg(eregOfRexRM(pfx,rm)) );

   /* Classify 'op' and record the lane width in bits, so that
      out-of-range shift counts can be handled per the x86 rules. */
   shl = shr = sar = False;
   size = 0;
   switch (op) {
      case Iop_ShlN16x8: shl = True; size = 16; break;
      case Iop_ShlN32x4: shl = True; size = 32; break;
      case Iop_ShlN64x2: shl = True; size = 64; break;
      case Iop_SarN16x8: sar = True; size = 16; break;
      case Iop_SarN32x4: sar = True; size = 32; break;
      case Iop_ShrN16x8: shr = True; size = 16; break;
      case Iop_ShrN32x4: shr = True; size = 32; break;
      case Iop_ShrN64x2: shr = True; size = 64; break;
      default: vassert(0);
   }

   /* Logical shifts by >= lane width give zero; arithmetic shifts
      clamp the count to size-1 (replicates the sign bit). */
   if (shl || shr) {
     assign( e1, amt >= size
                    ? mkV128(0x0000)
                    : binop(op, mkexpr(e0), mkU8(amt))
     );
   } else
   if (sar) {
     assign( e1, amt >= size
                    ? binop(op, mkexpr(e0), mkU8(size-1))
                    : binop(op, mkexpr(e0), mkU8(amt))
     );
   } else {
      vassert(0);
   }

   putXMMReg( eregOfRexRM(pfx,rm), mkexpr(e1) );
   return delta;
}
   9321 
   9322 
   9323 /* Get the current SSE rounding mode. */
   9324 
   9325 static IRExpr* /* :: Ity_I32 */ get_sse_roundingmode ( void )
   9326 {
   9327    return
   9328       unop( Iop_64to32,
   9329             binop( Iop_And64,
   9330                    IRExpr_Get( OFFB_SSEROUND, Ity_I64 ),
   9331                    mkU64(3) ));
   9332 }
   9333 
   9334 static void put_sse_roundingmode ( IRExpr* sseround )
   9335 {
   9336    vassert(typeOfIRExpr(irsb->tyenv, sseround) == Ity_I32);
   9337    stmt( IRStmt_Put( OFFB_SSEROUND,
   9338                      unop(Iop_32Uto64,sseround) ) );
   9339 }
   9340 
   9341 /* Break a V128-bit value up into four 32-bit ints. */
   9342 
   9343 static void breakupV128to32s ( IRTemp t128,
   9344                                /*OUTs*/
   9345                                IRTemp* t3, IRTemp* t2,
   9346                                IRTemp* t1, IRTemp* t0 )
   9347 {
   9348    IRTemp hi64 = newTemp(Ity_I64);
   9349    IRTemp lo64 = newTemp(Ity_I64);
   9350    assign( hi64, unop(Iop_V128HIto64, mkexpr(t128)) );
   9351    assign( lo64, unop(Iop_V128to64,   mkexpr(t128)) );
   9352 
   9353    vassert(t0 && *t0 == IRTemp_INVALID);
   9354    vassert(t1 && *t1 == IRTemp_INVALID);
   9355    vassert(t2 && *t2 == IRTemp_INVALID);
   9356    vassert(t3 && *t3 == IRTemp_INVALID);
   9357 
   9358    *t0 = newTemp(Ity_I32);
   9359    *t1 = newTemp(Ity_I32);
   9360    *t2 = newTemp(Ity_I32);
   9361    *t3 = newTemp(Ity_I32);
   9362    assign( *t0, unop(Iop_64to32,   mkexpr(lo64)) );
   9363    assign( *t1, unop(Iop_64HIto32, mkexpr(lo64)) );
   9364    assign( *t2, unop(Iop_64to32,   mkexpr(hi64)) );
   9365    assign( *t3, unop(Iop_64HIto32, mkexpr(hi64)) );
   9366 }
   9367 
   9368 /* Construct a V128-bit value from four 32-bit ints. */
   9369 
   9370 static IRExpr* mkV128from32s ( IRTemp t3, IRTemp t2,
   9371                                IRTemp t1, IRTemp t0 )
   9372 {
   9373    return
   9374       binop( Iop_64HLtoV128,
   9375              binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
   9376              binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0))
   9377    );
   9378 }
   9379 
   9380 /* Break a 64-bit value up into four 16-bit ints. */
   9381 
   9382 static void breakup64to16s ( IRTemp t64,
   9383                              /*OUTs*/
   9384                              IRTemp* t3, IRTemp* t2,
   9385                              IRTemp* t1, IRTemp* t0 )
   9386 {
   9387    IRTemp hi32 = newTemp(Ity_I32);
   9388    IRTemp lo32 = newTemp(Ity_I32);
   9389    assign( hi32, unop(Iop_64HIto32, mkexpr(t64)) );
   9390    assign( lo32, unop(Iop_64to32,   mkexpr(t64)) );
   9391 
   9392    vassert(t0 && *t0 == IRTemp_INVALID);
   9393    vassert(t1 && *t1 == IRTemp_INVALID);
   9394    vassert(t2 && *t2 == IRTemp_INVALID);
   9395    vassert(t3 && *t3 == IRTemp_INVALID);
   9396 
   9397    *t0 = newTemp(Ity_I16);
   9398    *t1 = newTemp(Ity_I16);
   9399    *t2 = newTemp(Ity_I16);
   9400    *t3 = newTemp(Ity_I16);
   9401    assign( *t0, unop(Iop_32to16,   mkexpr(lo32)) );
   9402    assign( *t1, unop(Iop_32HIto16, mkexpr(lo32)) );
   9403    assign( *t2, unop(Iop_32to16,   mkexpr(hi32)) );
   9404    assign( *t3, unop(Iop_32HIto16, mkexpr(hi32)) );
   9405 }
   9406 
   9407 /* Construct a 64-bit value from four 16-bit ints. */
   9408 
   9409 static IRExpr* mk64from16s ( IRTemp t3, IRTemp t2,
   9410                              IRTemp t1, IRTemp t0 )
   9411 {
   9412    return
   9413       binop( Iop_32HLto64,
   9414              binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)),
   9415              binop(Iop_16HLto32, mkexpr(t1), mkexpr(t0))
   9416    );
   9417 }
   9418 
   9419 /* Break a V256-bit value up into four 64-bit ints. */
   9420 
   9421 static void breakupV256to64s ( IRTemp t256,
   9422                                /*OUTs*/
   9423                                IRTemp* t3, IRTemp* t2,
   9424                                IRTemp* t1, IRTemp* t0 )
   9425 {
   9426    vassert(t0 && *t0 == IRTemp_INVALID);
   9427    vassert(t1 && *t1 == IRTemp_INVALID);
   9428    vassert(t2 && *t2 == IRTemp_INVALID);
   9429    vassert(t3 && *t3 == IRTemp_INVALID);
   9430    *t0 = newTemp(Ity_I64);
   9431    *t1 = newTemp(Ity_I64);
   9432    *t2 = newTemp(Ity_I64);
   9433    *t3 = newTemp(Ity_I64);
   9434    assign( *t0, unop(Iop_V256to64_0, mkexpr(t256)) );
   9435    assign( *t1, unop(Iop_V256to64_1, mkexpr(t256)) );
   9436    assign( *t2, unop(Iop_V256to64_2, mkexpr(t256)) );
   9437    assign( *t3, unop(Iop_V256to64_3, mkexpr(t256)) );
   9438 }
   9439 
   9440 /* Break a V256-bit value up into two V128s. */
   9441 
static void breakupV256toV128s ( IRTemp t256,
                                 /*OUTs*/
                                 IRTemp* t1, IRTemp* t0 )
{
   /* Split t256 into its high (t1) and low (t0) V128 halves.
      Both OUT temps must arrive as IRTemp_INVALID. */
   vassert(t0 && *t0 == IRTemp_INVALID);
   vassert(t1 && *t1 == IRTemp_INVALID);
   *t0 = newTemp(Ity_V128);
   *t1 = newTemp(Ity_V128);
   assign(*t1, unop(Iop_V256toV128_1, mkexpr(t256)));
   assign(*t0, unop(Iop_V256toV128_0, mkexpr(t256)));
}
   9453 
   9454 /* Break a V256-bit value up into eight 32-bit ints.  */
   9455 
   9456 static void breakupV256to32s ( IRTemp t256,
   9457                                /*OUTs*/
   9458                                IRTemp* t7, IRTemp* t6,
   9459                                IRTemp* t5, IRTemp* t4,
   9460                                IRTemp* t3, IRTemp* t2,
   9461                                IRTemp* t1, IRTemp* t0 )
   9462 {
   9463    IRTemp t128_1 = IRTemp_INVALID;
   9464    IRTemp t128_0 = IRTemp_INVALID;
   9465    breakupV256toV128s( t256, &t128_1, &t128_0 );
   9466    breakupV128to32s( t128_1, t7, t6, t5, t4 );
   9467    breakupV128to32s( t128_0, t3, t2, t1, t0 );
   9468 }
   9469 
   9470 /* Break a V128-bit value up into two 64-bit ints. */
   9471 
static void breakupV128to64s ( IRTemp t128,
                               /*OUTs*/
                               IRTemp* t1, IRTemp* t0 )
{
   /* Split t128 into its high (t1) and low (t0) I64 halves.
      Both OUT temps must arrive as IRTemp_INVALID. */
   vassert(t0 && *t0 == IRTemp_INVALID);
   vassert(t1 && *t1 == IRTemp_INVALID);
   *t0 = newTemp(Ity_I64);
   *t1 = newTemp(Ity_I64);
   assign( *t0, unop(Iop_V128to64,   mkexpr(t128)) );
   assign( *t1, unop(Iop_V128HIto64, mkexpr(t128)) );
}
   9483 
   9484 /* Construct a V256-bit value from eight 32-bit ints. */
   9485 
   9486 static IRExpr* mkV256from32s ( IRTemp t7, IRTemp t6,
   9487                                IRTemp t5, IRTemp t4,
   9488                                IRTemp t3, IRTemp t2,
   9489                                IRTemp t1, IRTemp t0 )
   9490 {
   9491    return
   9492       binop( Iop_V128HLtoV256,
   9493              binop( Iop_64HLtoV128,
   9494                     binop(Iop_32HLto64, mkexpr(t7), mkexpr(t6)),
   9495                     binop(Iop_32HLto64, mkexpr(t5), mkexpr(t4)) ),
   9496              binop( Iop_64HLtoV128,
   9497                     binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
   9498                     binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0)) )
   9499    );
   9500 }
   9501 
   9502 /* Construct a V256-bit value from four 64-bit ints. */
   9503 
   9504 static IRExpr* mkV256from64s ( IRTemp t3, IRTemp t2,
   9505                                IRTemp t1, IRTemp t0 )
   9506 {
   9507    return
   9508       binop( Iop_V128HLtoV256,
   9509              binop(Iop_64HLtoV128, mkexpr(t3), mkexpr(t2)),
   9510              binop(Iop_64HLtoV128, mkexpr(t1), mkexpr(t0))
   9511    );
   9512 }
   9513 
   9514 /* Helper for the SSSE3 (not SSE3) PMULHRSW insns.  Given two 64-bit
   9515    values (aa,bb), computes, for each of the 4 16-bit lanes:
   9516 
   9517    (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1
   9518 */
static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx )
{
   /* Per 16-bit lane of (aax,bbx), compute
         (((aa *s32 bb) >>u 14) + 1) >>u 1
      i.e. the SSSE3 PMULHRSW "round and scale" multiply. */
   IRTemp aa      = newTemp(Ity_I64);
   IRTemp bb      = newTemp(Ity_I64);
   IRTemp aahi32s = newTemp(Ity_I64);
   IRTemp aalo32s = newTemp(Ity_I64);
   IRTemp bbhi32s = newTemp(Ity_I64);
   IRTemp bblo32s = newTemp(Ity_I64);
   IRTemp rHi     = newTemp(Ity_I64);
   IRTemp rLo     = newTemp(Ity_I64);
   IRTemp one32x2 = newTemp(Ity_I64);
   assign(aa, aax);
   assign(bb, bbx);
   /* Sign-extend each 16-bit lane to 32 bits: interleave a value
      with itself, then arithmetically shift each 32-bit lane right
      by 16. */
   assign( aahi32s,
           binop(Iop_SarN32x2,
                 binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)),
                 mkU8(16) ));
   assign( aalo32s,
           binop(Iop_SarN32x2,
                 binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)),
                 mkU8(16) ));
   assign( bbhi32s,
           binop(Iop_SarN32x2,
                 binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)),
                 mkU8(16) ));
   assign( bblo32s,
           binop(Iop_SarN32x2,
                 binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)),
                 mkU8(16) ));
   /* Constant 1 in each of the two 32-bit lanes, for the rounding
      add below. */
   assign(one32x2, mkU64( (1ULL << 32) + 1 ));
   /* rHi/rLo: 32-bit products, shifted right 14, rounded by +1,
      then halved -- leaving the result in the low 16 bits of each
      32-bit lane. */
   assign(
      rHi,
      binop(
         Iop_ShrN32x2,
         binop(
            Iop_Add32x2,
            binop(
               Iop_ShrN32x2,
               binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)),
               mkU8(14)
            ),
            mkexpr(one32x2)
         ),
         mkU8(1)
      )
   );
   assign(
      rLo,
      binop(
         Iop_ShrN32x2,
         binop(
            Iop_Add32x2,
            binop(
               Iop_ShrN32x2,
               binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)),
               mkU8(14)
            ),
            mkexpr(one32x2)
         ),
         mkU8(1)
      )
   );
   /* Gather the low 16 bits of each 32-bit result lane back into a
      single I64 of four 16-bit lanes. */
   return
      binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo));
}
   9584 
   9585 /* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns.  Given two 64-bit
   9586    values (aa,bb), computes, for each lane:
   9587 
   9588           if aa_lane < 0 then - bb_lane
   9589      else if aa_lane > 0 then bb_lane
   9590      else 0
   9591 */
   9592 static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB )
   9593 {
   9594    IRTemp aa       = newTemp(Ity_I64);
   9595    IRTemp bb       = newTemp(Ity_I64);
   9596    IRTemp zero     = newTemp(Ity_I64);
   9597    IRTemp bbNeg    = newTemp(Ity_I64);
   9598    IRTemp negMask  = newTemp(Ity_I64);
   9599    IRTemp posMask  = newTemp(Ity_I64);
   9600    IROp   opSub    = Iop_INVALID;
   9601    IROp   opCmpGTS = Iop_INVALID;
   9602 
   9603    switch (laneszB) {
   9604       case 1: opSub = Iop_Sub8x8;  opCmpGTS = Iop_CmpGT8Sx8;  break;
   9605       case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break;
   9606       case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break;
   9607       default: vassert(0);
   9608    }
   9609 
   9610    assign( aa,      aax );
   9611    assign( bb,      bbx );
   9612    assign( zero,    mkU64(0) );
   9613    assign( bbNeg,   binop(opSub,    mkexpr(zero), mkexpr(bb)) );
   9614    assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) );
   9615    assign( posMask, binop(opCmpGTS, mkexpr(aa),   mkexpr(zero)) );
   9616 
   9617    return
   9618       binop(Iop_Or64,
   9619             binop(Iop_And64, mkexpr(bb),    mkexpr(posMask)),
   9620             binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) );
   9621 
   9622 }
   9623 
   9624 
   9625 /* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns.  Given a 64-bit
   9626    value aa, computes, for each lane
   9627 
   9628    if aa < 0 then -aa else aa
   9629 
   9630    Note that the result is interpreted as unsigned, so that the
   9631    absolute value of the most negative signed input can be
   9632    represented.
   9633 */
   9634 static IRTemp math_PABS_MMX ( IRTemp aa, Int laneszB )
   9635 {
   9636    IRTemp res     = newTemp(Ity_I64);
   9637    IRTemp zero    = newTemp(Ity_I64);
   9638    IRTemp aaNeg   = newTemp(Ity_I64);
   9639    IRTemp negMask = newTemp(Ity_I64);
   9640    IRTemp posMask = newTemp(Ity_I64);
   9641    IROp   opSub   = Iop_INVALID;
   9642    IROp   opSarN  = Iop_INVALID;
   9643 
   9644    switch (laneszB) {
   9645       case 1: opSub = Iop_Sub8x8;  opSarN = Iop_SarN8x8;  break;
   9646       case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break;
   9647       case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break;
   9648       default: vassert(0);
   9649    }
   9650 
   9651    assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) );
   9652    assign( posMask, unop(Iop_Not64, mkexpr(negMask)) );
   9653    assign( zero,    mkU64(0) );
   9654    assign( aaNeg,   binop(opSub, mkexpr(zero), mkexpr(aa)) );
   9655    assign( res,
   9656            binop(Iop_Or64,
   9657                  binop(Iop_And64, mkexpr(aa),    mkexpr(posMask)),
   9658                  binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) ));
   9659    return res;
   9660 }
   9661 
   9662 /* XMM version of math_PABS_MMX. */
   9663 static IRTemp math_PABS_XMM ( IRTemp aa, Int laneszB )
   9664 {
   9665    IRTemp res  = newTemp(Ity_V128);
   9666    IRTemp aaHi = newTemp(Ity_I64);
   9667    IRTemp aaLo = newTemp(Ity_I64);
   9668    assign(aaHi, unop(Iop_V128HIto64, mkexpr(aa)));
   9669    assign(aaLo, unop(Iop_V128to64, mkexpr(aa)));
   9670    assign(res, binop(Iop_64HLtoV128,
   9671                      mkexpr(math_PABS_MMX(aaHi, laneszB)),
   9672                      mkexpr(math_PABS_MMX(aaLo, laneszB))));
   9673    return res;
   9674 }
   9675 
   9676 /* Specialisations of math_PABS_XMM, since there's no easy way to do
   9677    partial applications in C :-( */
static IRTemp math_PABS_XMM_pap4 ( IRTemp aa ) {
   /* PABSD: 32-bit lanes. */
   return math_PABS_XMM(aa, 4);
}
   9681 
static IRTemp math_PABS_XMM_pap2 ( IRTemp aa ) {
   /* PABSW: 16-bit lanes. */
   return math_PABS_XMM(aa, 2);
}
   9685 
static IRTemp math_PABS_XMM_pap1 ( IRTemp aa ) {
   /* PABSB: 8-bit lanes. */
   return math_PABS_XMM(aa, 1);
}
   9689 
   9690 /* YMM version of math_PABS_XMM. */
   9691 static IRTemp math_PABS_YMM ( IRTemp aa, Int laneszB )
   9692 {
   9693    IRTemp res  = newTemp(Ity_V256);
   9694    IRTemp aaHi = IRTemp_INVALID;
   9695    IRTemp aaLo = IRTemp_INVALID;
   9696    breakupV256toV128s(aa, &aaHi, &aaLo);
   9697    assign(res, binop(Iop_V128HLtoV256,
   9698                      mkexpr(math_PABS_XMM(aaHi, laneszB)),
   9699                      mkexpr(math_PABS_XMM(aaLo, laneszB))));
   9700    return res;
   9701 }
   9702 
static IRTemp math_PABS_YMM_pap4 ( IRTemp aa ) {
   /* VPABSD: 32-bit lanes. */
   return math_PABS_YMM(aa, 4);
}
   9706 
static IRTemp math_PABS_YMM_pap2 ( IRTemp aa ) {
   /* VPABSW: 16-bit lanes. */
   return math_PABS_YMM(aa, 2);
}
   9710 
static IRTemp math_PABS_YMM_pap1 ( IRTemp aa ) {
   /* VPABSB: 8-bit lanes. */
   return math_PABS_YMM(aa, 1);
}
   9714 
   9715 static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64,
   9716                                         IRTemp lo64, Long byteShift )
   9717 {
   9718    vassert(byteShift >= 1 && byteShift <= 7);
   9719    return
   9720       binop(Iop_Or64,
   9721             binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))),
   9722             binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift))
   9723       );
   9724 }
   9725 
static IRTemp math_PALIGNR_XMM ( IRTemp sV, IRTemp dV, UInt imm8 )
{
   /* PALIGNR: take the 64-bit result temps (rHi:rLo) as the 128
      bits starting imm8 bytes up from the bottom of the 256-bit
      concatenation dV:sV, shifting in zeroes at the top.  Each
      branch below handles one band of imm8 values, working on
      the four 64-bit quarters dHi:dLo:sHi:sLo. */
   IRTemp res = newTemp(Ity_V128);
   IRTemp sHi = newTemp(Ity_I64);
   IRTemp sLo = newTemp(Ity_I64);
   IRTemp dHi = newTemp(Ity_I64);
   IRTemp dLo = newTemp(Ity_I64);
   IRTemp rHi = newTemp(Ity_I64);
   IRTemp rLo = newTemp(Ity_I64);

   assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

   if (imm8 == 0) {
      /* No shift: result is sV unchanged. */
      assign( rHi, mkexpr(sHi) );
      assign( rLo, mkexpr(sLo) );
   }
   else if (imm8 >= 1 && imm8 <= 7) {
      /* Shift within sV, with dLo bytes entering at the top. */
      assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, imm8) );
      assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, imm8) );
   }
   else if (imm8 == 8) {
      /* Exactly one quadword: result is dLo:sHi. */
      assign( rHi, mkexpr(dLo) );
      assign( rLo, mkexpr(sHi) );
   }
   else if (imm8 >= 9 && imm8 <= 15) {
      assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, imm8-8) );
      assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, imm8-8) );
   }
   else if (imm8 == 16) {
      /* Exactly two quadwords: result is dV. */
      assign( rHi, mkexpr(dHi) );
      assign( rLo, mkexpr(dLo) );
   }
   else if (imm8 >= 17 && imm8 <= 23) {
      /* Only dHi bytes remain; zeroes shift in above them. */
      assign( rHi, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(imm8-16))) );
      assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, imm8-16) );
   }
   else if (imm8 == 24) {
      assign( rHi, mkU64(0) );
      assign( rLo, mkexpr(dHi) );
   }
   else if (imm8 >= 25 && imm8 <= 31) {
      assign( rHi, mkU64(0) );
      assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(imm8-24))) );
   }
   else if (imm8 >= 32 && imm8 <= 255) {
      /* Shifted entirely past the end: all zeroes. */
      assign( rHi, mkU64(0) );
      assign( rLo, mkU64(0) );
   }
   else
      vassert(0);

   assign( res, binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo)));
   return res;
}
   9783 
   9784 
   9785 /* Generate a SIGSEGV followed by a restart of the current instruction
   9786    if effective_addr is not 16-aligned.  This is required behaviour
   9787    for some SSE3 instructions and all 128-bit SSSE3 instructions.
   9788    This assumes that guest_RIP_curr_instr is set correctly! */
   9789 static
   9790 void gen_SEGV_if_not_XX_aligned ( IRTemp effective_addr, ULong mask )
   9791 {
   9792    stmt(
   9793       IRStmt_Exit(
   9794          binop(Iop_CmpNE64,
   9795                binop(Iop_And64,mkexpr(effective_addr),mkU64(mask)),
   9796                mkU64(0)),
   9797          Ijk_SigSEGV,
   9798          IRConst_U64(guest_RIP_curr_instr),
   9799          OFFB_RIP
   9800       )
   9801    );
   9802 }
   9803 
static void gen_SEGV_if_not_16_aligned ( IRTemp effective_addr ) {
   /* 16-byte alignment check, e.g. for 128-bit SSE memory ops. */
   gen_SEGV_if_not_XX_aligned(effective_addr, 16-1);
}
   9807 
static void gen_SEGV_if_not_32_aligned ( IRTemp effective_addr ) {
   /* 32-byte alignment check, e.g. for 256-bit AVX memory ops. */
   gen_SEGV_if_not_XX_aligned(effective_addr, 32-1);
}
   9811 
   9812 /* Helper for deciding whether a given insn (starting at the opcode
   9813    byte) may validly be used with a LOCK prefix.  The following insns
   9814    may be used with LOCK when their destination operand is in memory.
   9815    AFAICS this is exactly the same for both 32-bit and 64-bit mode.
   9816 
   9817    ADD        80 /0,  81 /0,  82 /0,  83 /0,  00,  01
   9818    OR         80 /1,  81 /1,  82 /x,  83 /1,  08,  09
   9819    ADC        80 /2,  81 /2,  82 /2,  83 /2,  10,  11
   SBB        80 /3,  81 /3,  82 /x,  83 /3,  18,  19
   9821    AND        80 /4,  81 /4,  82 /x,  83 /4,  20,  21
   9822    SUB        80 /5,  81 /5,  82 /x,  83 /5,  28,  29
   9823    XOR        80 /6,  81 /6,  82 /x,  83 /6,  30,  31
   9824 
   9825    DEC        FE /1,  FF /1
   9826    INC        FE /0,  FF /0
   9827 
   9828    NEG        F6 /3,  F7 /3
   9829    NOT        F6 /2,  F7 /2
   9830 
   9831    XCHG       86, 87
   9832 
   9833    BTC        0F BB,  0F BA /7
   9834    BTR        0F B3,  0F BA /6
   9835    BTS        0F AB,  0F BA /5
   9836 
   9837    CMPXCHG    0F B0,  0F B1
   9838    CMPXCHG8B  0F C7 /1
   9839 
   9840    XADD       0F C0,  0F C1
   9841 
   9842    ------------------------------
   9843 
   9844    80 /0  =  addb $imm8,  rm8
   9845    81 /0  =  addl $imm32, rm32  and  addw $imm16, rm16
   9846    82 /0  =  addb $imm8,  rm8
   9847    83 /0  =  addl $simm8, rm32  and  addw $simm8, rm16
   9848 
   9849    00     =  addb r8,  rm8
   9850    01     =  addl r32, rm32  and  addw r16, rm16
   9851 
   9852    Same for ADD OR ADC SBB AND SUB XOR
   9853 
   9854    FE /1  = dec rm8
   9855    FF /1  = dec rm32  and  dec rm16
   9856 
   9857    FE /0  = inc rm8
   9858    FF /0  = inc rm32  and  inc rm16
   9859 
   9860    F6 /3  = neg rm8
   9861    F7 /3  = neg rm32  and  neg rm16
   9862 
   9863    F6 /2  = not rm8
   9864    F7 /2  = not rm32  and  not rm16
   9865 
   9866    0F BB     = btcw r16, rm16    and  btcl r32, rm32
   0F BA /7  = btcw $imm8, rm16  and  btcl $imm8, rm32
   9868 
   9869    Same for BTS, BTR
   9870 */
   9871 static Bool can_be_used_with_LOCK_prefix ( UChar* opc )
   9872 {
   9873    switch (opc[0]) {
   9874       case 0x00: case 0x01: case 0x08: case 0x09:
   9875       case 0x10: case 0x11: case 0x18: case 0x19:
   9876       case 0x20: case 0x21: case 0x28: case 0x29:
   9877       case 0x30: case 0x31:
   9878          if (!epartIsReg(opc[1]))
   9879             return True;
   9880          break;
   9881 
   9882       case 0x80: case 0x81: case 0x82: case 0x83:
   9883          if (gregLO3ofRM(opc[1]) >= 0 && gregLO3ofRM(opc[1]) <= 6
   9884              && !epartIsReg(opc[1]))
   9885             return True;
   9886          break;
   9887 
   9888       case 0xFE: case 0xFF:
   9889          if (gregLO3ofRM(opc[1]) >= 0 && gregLO3ofRM(opc[1]) <= 1
   9890              && !epartIsReg(opc[1]))
   9891             return True;
   9892          break;
   9893 
   9894       case 0xF6: case 0xF7:
   9895          if (gregLO3ofRM(opc[1]) >= 2 && gregLO3ofRM(opc[1]) <= 3
   9896              && !epartIsReg(opc[1]))
   9897             return True;
   9898          break;
   9899 
   9900       case 0x86: case 0x87:
   9901          if (!epartIsReg(opc[1]))
   9902             return True;
   9903          break;
   9904 
   9905       case 0x0F: {
   9906          switch (opc[1]) {
   9907             case 0xBB: case 0xB3: case 0xAB:
   9908                if (!epartIsReg(opc[2]))
   9909                   return True;
   9910                break;
   9911             case 0xBA:
   9912                if (gregLO3ofRM(opc[2]) >= 5 && gregLO3ofRM(opc[2]) <= 7
   9913                    && !epartIsReg(opc[2]))
   9914                   return True;
   9915                break;
   9916             case 0xB0: case 0xB1:
   9917                if (!epartIsReg(opc[2]))
   9918                   return True;
   9919                break;
   9920             case 0xC7:
   9921                if (gregLO3ofRM(opc[2]) == 1 && !epartIsReg(opc[2]) )
   9922                   return True;
   9923                break;
   9924             case 0xC0: case 0xC1:
   9925                if (!epartIsReg(opc[2]))
   9926                   return True;
   9927                break;
   9928             default:
   9929                break;
   9930          } /* switch (opc[1]) */
   9931          break;
   9932       }
   9933 
   9934       default:
   9935          break;
   9936    } /* switch (opc[0]) */
   9937 
   9938    return False;
   9939 }
   9940 
   9941 
   9942 /*------------------------------------------------------------*/
   9943 /*---                                                      ---*/
   9944 /*--- Top-level SSE/SSE2: dis_ESC_0F__SSE2                 ---*/
   9945 /*---                                                      ---*/
   9946 /*------------------------------------------------------------*/
   9947 
static Long dis_COMISD ( VexAbiInfo* vbi, Prefix pfx,
                         Long delta, Bool isAvx, UChar opc )
{
   /* (U)COMISD: compare the low F64 lanes of G (reg) and E
      (reg/mem), writing the comparison result into the guest
      condition-code state.  Returns the updated delta. */
   vassert(opc == 0x2F/*COMISD*/ || opc == 0x2E/*UCOMISD*/);
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp argL  = newTemp(Ity_F64);
   IRTemp argR  = newTemp(Ity_F64);
   UChar  modrm = getUChar(delta);
   IRTemp addr  = IRTemp_INVALID;
   if (epartIsReg(modrm)) {
      assign( argR, getXMMRegLane64F( eregOfRexRM(pfx,modrm),
                                      0/*lowest lane*/ ) );
      delta += 1;
      DIP("%s%scomisd %s,%s\n", isAvx ? "v" : "",
                                opc==0x2E ? "u" : "",
                                nameXMMReg(eregOfRexRM(pfx,modrm)),
                                nameXMMReg(gregOfRexRM(pfx,modrm)) );
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argR, loadLE(Ity_F64, mkexpr(addr)) );
      delta += alen;
      DIP("%s%scomisd %s,%s\n", isAvx ? "v" : "",
                                opc==0x2E ? "u" : "",
                                dis_buf,
                                nameXMMReg(gregOfRexRM(pfx,modrm)) );
   }
   assign( argL, getXMMRegLane64F( gregOfRexRM(pfx,modrm),
                                   0/*lowest lane*/ ) );

   /* Flags are set via the OP_COPY scheme: DEP1 holds the flag
      word directly.  The CmpF64 result is masked with 0x45 to keep
      only bits 0, 2 and 6 (the CF, PF and ZF positions). */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop( Iop_And64,
                   unop( Iop_32Uto64,
                         binop(Iop_CmpF64, mkexpr(argL), mkexpr(argR)) ),
                   mkU64(0x45)
       )));
   return delta;
}
   9989 
   9990 
static Long dis_COMISS ( VexAbiInfo* vbi, Prefix pfx,
                         Long delta, Bool isAvx, UChar opc )
{
   /* (U)COMISS: compare the low F32 lanes of G (reg) and E
      (reg/mem), writing the comparison result into the guest
      condition-code state.  Returns the updated delta. */
   vassert(opc == 0x2F/*COMISS*/ || opc == 0x2E/*UCOMISS*/);
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp argL  = newTemp(Ity_F32);
   IRTemp argR  = newTemp(Ity_F32);
   UChar  modrm = getUChar(delta);
   IRTemp addr  = IRTemp_INVALID;
   if (epartIsReg(modrm)) {
      assign( argR, getXMMRegLane32F( eregOfRexRM(pfx,modrm),
                                      0/*lowest lane*/ ) );
      delta += 1;
      DIP("%s%scomiss %s,%s\n", isAvx ? "v" : "",
                                opc==0x2E ? "u" : "",
                                nameXMMReg(eregOfRexRM(pfx,modrm)),
                                nameXMMReg(gregOfRexRM(pfx,modrm)) );
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argR, loadLE(Ity_F32, mkexpr(addr)) );
      delta += alen;
      DIP("%s%scomiss %s,%s\n", isAvx ? "v" : "",
                                opc==0x2E ? "u" : "",
                                dis_buf,
                                nameXMMReg(gregOfRexRM(pfx,modrm)) );
   }
   assign( argL, getXMMRegLane32F( gregOfRexRM(pfx,modrm),
                                   0/*lowest lane*/ ) );

   /* Compare by widening both operands to F64.  Flags are set via
      the OP_COPY scheme; 0x45 keeps only bits 0, 2 and 6 (the CF,
      PF and ZF positions). */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop( Iop_And64,
                   unop( Iop_32Uto64,
                         binop(Iop_CmpF64,
                               unop(Iop_F32toF64,mkexpr(argL)),
                               unop(Iop_F32toF64,mkexpr(argR)))),
                   mkU64(0x45)
       )));
   return delta;
}
   10034 
   10035 
static Long dis_PSHUFD_32x4 ( VexAbiInfo* vbi, Prefix pfx,
                              Long delta, Bool writesYmm )
{
   /* (V)PSHUFD xmm: permute the four 32-bit lanes of E (reg/mem)
      according to the imm8 'order' byte and write the result to G.
      If writesYmm, the result also zeroes the upper YMM half.
      Returns the updated delta. */
   Int    order;
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp sV    = newTemp(Ity_V128);
   UChar  modrm = getUChar(delta);
   const HChar* strV  = writesYmm ? "v" : "";
   IRTemp addr  = IRTemp_INVALID;
   if (epartIsReg(modrm)) {
      assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
      order = (Int)getUChar(delta+1);
      delta += 1+1;
      DIP("%spshufd $%d,%s,%s\n", strV, order,
                                  nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
                        1/*byte after the amode*/ );
      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
      order = (Int)getUChar(delta+alen);
      delta += alen+1;
      DIP("%spshufd $%d,%s,%s\n", strV, order,
                                 dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
   }

   IRTemp s3, s2, s1, s0;
   s3 = s2 = s1 = s0 = IRTemp_INVALID;
   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );

   /* Each 2-bit field of 'order' selects the source lane for one
      destination lane. */
#  define SEL(n)  ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   IRTemp dV = newTemp(Ity_V128);
   assign(dV,
          mkV128from32s( SEL((order>>6)&3), SEL((order>>4)&3),
                         SEL((order>>2)&3), SEL((order>>0)&3) )
   );
#  undef SEL

   (writesYmm ? putYMMRegLoAndZU : putXMMReg)
      (gregOfRexRM(pfx,modrm), mkexpr(dV));
   return delta;
}
   10080 
   10081 
/* Handles 256-bit VPSHUFD: the same imm8-controlled 32-bit lane
   shuffle as the 128-bit form, applied independently to each 128-bit
   half of a YMM register or 256-bit memory operand.  Returns the
   updated delta. */
static Long dis_PSHUFD_32x8 ( VexAbiInfo* vbi, Prefix pfx, Long delta )
{
   Int    order;
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp sV    = newTemp(Ity_V256);
   UChar  modrm = getUChar(delta);
   IRTemp addr  = IRTemp_INVALID;
   UInt   rG    = gregOfRexRM(pfx,modrm);
   if (epartIsReg(modrm)) {
      /* Register source: the imm8 selector immediately follows the
         modrm byte. */
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( sV, getYMMReg(rE) );
      order = (Int)getUChar(delta+1);
      delta += 1+1;
      DIP("vpshufd $%d,%s,%s\n", order, nameYMMReg(rE), nameYMMReg(rG));
   } else {
      /* Memory source: the imm8 selector follows the amode bytes. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
                        1/*byte after the amode*/ );
      assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
      order = (Int)getUChar(delta+alen);
      delta += alen+1;
      DIP("vpshufd $%d,%s,%s\n", order,  dis_buf, nameYMMReg(rG));
   }

   IRTemp s[8];
   s[7] = s[6] = s[5] = s[4] = s[3] = s[2] = s[1] = s[0] = IRTemp_INVALID;
   breakupV256to32s( sV, &s[7], &s[6], &s[5], &s[4],
                         &s[3], &s[2], &s[1], &s[0] );

   /* The same imm8 steers both halves: destination lanes 4..7 select
      from source lanes 4..7, lanes 0..3 from source lanes 0..3. */
   putYMMReg( rG, mkV256from32s( s[4 + ((order>>6)&3)],
                                 s[4 + ((order>>4)&3)],
                                 s[4 + ((order>>2)&3)],
                                 s[4 + ((order>>0)&3)],
                                 s[0 + ((order>>6)&3)],
                                 s[0 + ((order>>4)&3)],
                                 s[0 + ((order>>2)&3)],
                                 s[0 + ((order>>0)&3)] ) );
   return delta;
}
   10121 
   10122 
   10123 static IRTemp math_PSRLDQ ( IRTemp sV, Int imm )
   10124 {
   10125    IRTemp dV    = newTemp(Ity_V128);
   10126    IRTemp hi64  = newTemp(Ity_I64);
   10127    IRTemp lo64  = newTemp(Ity_I64);
   10128    IRTemp hi64r = newTemp(Ity_I64);
   10129    IRTemp lo64r = newTemp(Ity_I64);
   10130 
   10131    vassert(imm >= 0 && imm <= 255);
   10132    if (imm >= 16) {
   10133       assign(dV, mkV128(0x0000));
   10134       return dV;
   10135    }
   10136 
   10137    assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
   10138    assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
   10139 
   10140    if (imm == 0) {
   10141       assign( lo64r, mkexpr(lo64) );
   10142       assign( hi64r, mkexpr(hi64) );
   10143    }
   10144    else
   10145    if (imm == 8) {
   10146       assign( hi64r, mkU64(0) );
   10147       assign( lo64r, mkexpr(hi64) );
   10148    }
   10149    else
   10150    if (imm > 8) {
   10151       assign( hi64r, mkU64(0) );
   10152       assign( lo64r, binop( Iop_Shr64, mkexpr(hi64), mkU8( 8*(imm-8) ) ));
   10153    } else {
   10154       assign( hi64r, binop( Iop_Shr64, mkexpr(hi64), mkU8(8 * imm) ));
   10155       assign( lo64r,
   10156               binop( Iop_Or64,
   10157                      binop(Iop_Shr64, mkexpr(lo64),
   10158                            mkU8(8 * imm)),
   10159                      binop(Iop_Shl64, mkexpr(hi64),
   10160                            mkU8(8 * (8 - imm)) )
   10161                      )
   10162               );
   10163    }
   10164 
   10165    assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
   10166    return dV;
   10167 }
   10168 
   10169 
   10170 static IRTemp math_PSLLDQ ( IRTemp sV, Int imm )
   10171 {
   10172    IRTemp       dV    = newTemp(Ity_V128);
   10173    IRTemp       hi64  = newTemp(Ity_I64);
   10174    IRTemp       lo64  = newTemp(Ity_I64);
   10175    IRTemp       hi64r = newTemp(Ity_I64);
   10176    IRTemp       lo64r = newTemp(Ity_I64);
   10177 
   10178    vassert(imm >= 0 && imm <= 255);
   10179    if (imm >= 16) {
   10180       assign(dV, mkV128(0x0000));
   10181       return dV;
   10182    }
   10183 
   10184    assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
   10185    assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
   10186 
   10187    if (imm == 0) {
   10188       assign( lo64r, mkexpr(lo64) );
   10189       assign( hi64r, mkexpr(hi64) );
   10190    }
   10191    else
   10192    if (imm == 8) {
   10193       assign( lo64r, mkU64(0) );
   10194       assign( hi64r, mkexpr(lo64) );
   10195    }
   10196    else
   10197    if (imm > 8) {
   10198       assign( lo64r, mkU64(0) );
   10199       assign( hi64r, binop( Iop_Shl64, mkexpr(lo64), mkU8( 8*(imm-8) ) ));
   10200    } else {
   10201       assign( lo64r, binop( Iop_Shl64, mkexpr(lo64), mkU8(8 * imm) ));
   10202       assign( hi64r,
   10203               binop( Iop_Or64,
   10204                      binop(Iop_Shl64, mkexpr(hi64),
   10205                            mkU8(8 * imm)),
   10206                      binop(Iop_Shr64, mkexpr(lo64),
   10207                            mkU8(8 * (8 - imm)) )
   10208                      )
   10209               );
   10210    }
   10211 
   10212    assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
   10213    return dV;
   10214 }
   10215 
   10216 
/* Handles CVTSD2SI (opc 0x2D) and CVTTSD2SI (opc 0x2C), SSE2 and AVX
   forms: convert the low F64 lane of an XMM register or a 64-bit
   memory operand to a signed 32- or 64-bit integer in a GPR ('sz' is
   4 or 8).  The "T" (0x2C) variant truncates (rounds towards zero);
   otherwise the current SSE rounding mode is used.  Returns the
   updated delta. */
static Long dis_CVTxSD2SI ( VexAbiInfo* vbi, Prefix pfx,
                            Long delta, Bool isAvx, UChar opc, Int sz )
{
   vassert(opc == 0x2D/*CVTSD2SI*/ || opc == 0x2C/*CVTTSD2SI*/);
   HChar  dis_buf[50];
   Int    alen   = 0;
   UChar  modrm  = getUChar(delta);
   IRTemp addr   = IRTemp_INVALID;
   IRTemp rmode  = newTemp(Ity_I32);
   IRTemp f64lo  = newTemp(Ity_F64);
   Bool   r2zero = toBool(opc == 0x2C);

   if (epartIsReg(modrm)) {
      delta += 1;
      assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
      DIP("%scvt%ssd2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
                                  nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameIReg(sz, gregOfRexRM(pfx,modrm),
                                           False));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
      delta += alen;
      DIP("%scvt%ssd2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
                                  dis_buf,
                                  nameIReg(sz, gregOfRexRM(pfx,modrm),
                                           False));
   }

   /* Truncating variant forces round-towards-zero; otherwise use the
      rounding mode currently selected in MXCSR. */
   if (r2zero) {
      assign( rmode, mkU32((UInt)Irrm_ZERO) );
   } else {
      assign( rmode, get_sse_roundingmode() );
   }

   if (sz == 4) {
      putIReg32( gregOfRexRM(pfx,modrm),
                 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo)) );
   } else {
      vassert(sz == 8);
      putIReg64( gregOfRexRM(pfx,modrm),
                 binop( Iop_F64toI64S, mkexpr(rmode), mkexpr(f64lo)) );
   }

   return delta;
}
   10263 
   10264 
/* Handles CVTSS2SI (opc 0x2D) and CVTTSS2SI (opc 0x2C), SSE and AVX
   forms: convert the low F32 lane of an XMM register or a 32-bit
   memory operand to a signed 32- or 64-bit integer in a GPR ('sz' is
   4 or 8).  The value is first widened exactly to F64 and then
   converted.  The "T" (0x2C) variant truncates (rounds towards zero);
   otherwise the current SSE rounding mode is used.  Returns the
   updated delta. */
static Long dis_CVTxSS2SI ( VexAbiInfo* vbi, Prefix pfx,
                            Long delta, Bool isAvx, UChar opc, Int sz )
{
   vassert(opc == 0x2D/*CVTSS2SI*/ || opc == 0x2C/*CVTTSS2SI*/);
   HChar  dis_buf[50];
   Int    alen   = 0;
   UChar  modrm  = getUChar(delta);
   IRTemp addr   = IRTemp_INVALID;
   IRTemp rmode  = newTemp(Ity_I32);
   IRTemp f32lo  = newTemp(Ity_F32);
   Bool   r2zero = toBool(opc == 0x2C);

   if (epartIsReg(modrm)) {
      delta += 1;
      assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
      DIP("%scvt%sss2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
                                  nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameIReg(sz, gregOfRexRM(pfx,modrm),
                                           False));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
      delta += alen;
      DIP("%scvt%sss2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
                                  dis_buf,
                                  nameIReg(sz, gregOfRexRM(pfx,modrm),
                                           False));
   }

   /* Truncating variant forces round-towards-zero; otherwise use the
      rounding mode currently selected in MXCSR. */
   if (r2zero) {
      assign( rmode, mkU32((UInt)Irrm_ZERO) );
   } else {
      assign( rmode, get_sse_roundingmode() );
   }

   /* Widen F32 -> F64 (exact), then do the F64 -> integer
      conversion with the chosen rounding mode. */
   if (sz == 4) {
      putIReg32( gregOfRexRM(pfx,modrm),
                 binop( Iop_F64toI32S,
                        mkexpr(rmode),
                        unop(Iop_F32toF64, mkexpr(f32lo))) );
   } else {
      vassert(sz == 8);
      putIReg64( gregOfRexRM(pfx,modrm),
                 binop( Iop_F64toI64S,
                        mkexpr(rmode),
                        unop(Iop_F32toF64, mkexpr(f32lo))) );
   }

   return delta;
}
   10315 
   10316 
/* Handles 128-bit CVTPS2PD/VCVTPS2PD: widen the two low F32 lanes of
   an XMM register or 64-bit memory operand to two F64 lanes in the
   destination XMM register.  Widening is exact, so no rounding mode
   is involved.  The AVX form additionally zeroes the upper YMM lane.
   Returns the updated delta. */
static Long dis_CVTPS2PD_128 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp f32lo = newTemp(Ity_F32);
   IRTemp f32hi = newTemp(Ity_F32);
   UChar  modrm = getUChar(delta);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( f32lo, getXMMRegLane32F(rE, 0) );
      assign( f32hi, getXMMRegLane32F(rE, 1) );
      delta += 1;
      DIP("%scvtps2pd %s,%s\n",
          isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
   } else {
      /* Memory operand: only 64 bits (two F32s) are read. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( f32lo, loadLE(Ity_F32, mkexpr(addr)) );
      assign( f32hi, loadLE(Ity_F32,
                            binop(Iop_Add64,mkexpr(addr),mkU64(4))) );
      delta += alen;
      DIP("%scvtps2pd %s,%s\n",
          isAvx ? "v" : "", dis_buf, nameXMMReg(rG));
   }

   putXMMRegLane64F( rG, 1, unop(Iop_F32toF64, mkexpr(f32hi)) );
   putXMMRegLane64F( rG, 0, unop(Iop_F32toF64, mkexpr(f32lo)) );
   if (isAvx)
      putYMMRegLane128( rG, 1, mkV128(0));
   return delta;
}
   10350 
   10351 
/* Handles 256-bit VCVTPS2PD: widen the four F32 lanes of an XMM
   register or 128-bit memory operand to four F64 lanes in the
   destination YMM register.  Widening is exact, so no rounding mode
   is involved.  Returns the updated delta. */
static Long dis_CVTPS2PD_256 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp f32_0 = newTemp(Ity_F32);
   IRTemp f32_1 = newTemp(Ity_F32);
   IRTemp f32_2 = newTemp(Ity_F32);
   IRTemp f32_3 = newTemp(Ity_F32);
   UChar  modrm = getUChar(delta);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( f32_0, getXMMRegLane32F(rE, 0) );
      assign( f32_1, getXMMRegLane32F(rE, 1) );
      assign( f32_2, getXMMRegLane32F(rE, 2) );
      assign( f32_3, getXMMRegLane32F(rE, 3) );
      delta += 1;
      DIP("vcvtps2pd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
   } else {
      /* Memory operand: 128 bits (four F32s) are read. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( f32_0, loadLE(Ity_F32, mkexpr(addr)) );
      assign( f32_1, loadLE(Ity_F32,
                            binop(Iop_Add64,mkexpr(addr),mkU64(4))) );
      assign( f32_2, loadLE(Ity_F32,
                            binop(Iop_Add64,mkexpr(addr),mkU64(8))) );
      assign( f32_3, loadLE(Ity_F32,
                            binop(Iop_Add64,mkexpr(addr),mkU64(12))) );
      delta += alen;
      DIP("vcvtps2pd %s,%s\n", dis_buf, nameYMMReg(rG));
   }

   putYMMRegLane64F( rG, 3, unop(Iop_F32toF64, mkexpr(f32_3)) );
   putYMMRegLane64F( rG, 2, unop(Iop_F32toF64, mkexpr(f32_2)) );
   putYMMRegLane64F( rG, 1, unop(Iop_F32toF64, mkexpr(f32_1)) );
   putYMMRegLane64F( rG, 0, unop(Iop_F32toF64, mkexpr(f32_0)) );
   return delta;
}
   10391 
   10392 
/* Handles 128-bit CVTPD2PS/VCVTPD2PS: narrow the two F64 lanes of an
   XMM register or 128-bit memory operand to two F32 lanes in the low
   half of the destination XMM register, using the current SSE
   rounding mode.  The upper two 32-bit lanes of the destination are
   zeroed; the AVX form also zeroes the upper YMM lane.  Returns the
   updated delta. */
static Long dis_CVTPD2PS_128 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp argV  = newTemp(Ity_V128);
   IRTemp rmode = newTemp(Ity_I32);
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getXMMReg(rE) );
      delta += 1;
      DIP("%scvtpd2ps %s,%s\n", isAvx ? "v" : "",
          nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      DIP("%scvtpd2ps %s,%s\n", isAvx ? "v" : "",
          dis_buf, nameXMMReg(rG) );
   }

   assign( rmode, get_sse_roundingmode() );
   /* Reinterpret the two 64-bit halves of the source as F64s. */
   IRTemp t0 = newTemp(Ity_F64);
   IRTemp t1 = newTemp(Ity_F64);
   assign( t0, unop(Iop_ReinterpI64asF64,
                    unop(Iop_V128to64, mkexpr(argV))) );
   assign( t1, unop(Iop_ReinterpI64asF64,
                    unop(Iop_V128HIto64, mkexpr(argV))) );

#  define CVT(_t)  binop( Iop_F64toF32, mkexpr(rmode), mkexpr(_t) )
   putXMMRegLane32(  rG, 3, mkU32(0) );
   putXMMRegLane32(  rG, 2, mkU32(0) );
   putXMMRegLane32F( rG, 1, CVT(t1) );
   putXMMRegLane32F( rG, 0, CVT(t0) );
#  undef CVT
   if (isAvx)
      putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}
   10436 
   10437 
/* Handles 128-bit CVTPS2DQ and CVTTPS2DQ (and their AVX forms):
   convert the four F32 lanes of an XMM register or 128-bit memory
   operand to four signed I32 lanes in the destination XMM register.
   'r2zero' selects the truncating ("T") variant, which rounds towards
   zero; otherwise the current SSE rounding mode is used.  The AVX
   form zeroes the upper YMM lane.  Returns the updated delta. */
static Long dis_CVTxPS2DQ_128 ( VexAbiInfo* vbi, Prefix pfx,
                                Long delta, Bool isAvx, Bool r2zero )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp argV  = newTemp(Ity_V128);
   IRTemp rmode = newTemp(Ity_I32);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1, t2, t3;

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getXMMReg(rE) );
      delta += 1;
      DIP("%scvt%sps2dq %s,%s\n",
          isAvx ? "v" : "", r2zero ? "t" : "", nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      DIP("%scvt%sps2dq %s,%s\n",
          isAvx ? "v" : "", r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
   }

   assign( rmode, r2zero ? mkU32((UInt)Irrm_ZERO)
                         : get_sse_roundingmode() );
   t0 = t1 = t2 = t3 = IRTemp_INVALID;
   breakupV128to32s( argV, &t3, &t2, &t1, &t0 );
   /* This is less than ideal.  If it turns out to be a performance
      bottleneck it can be improved. */
   /* Each lane: reinterpret the I32 bits as F32, widen exactly to
      F64, then convert to I32 with the chosen rounding mode. */
#  define CVT(_t)                             \
      binop( Iop_F64toI32S,                   \
             mkexpr(rmode),                   \
             unop( Iop_F32toF64,              \
                   unop( Iop_ReinterpI32asF32, mkexpr(_t))) )

   putXMMRegLane32( rG, 3, CVT(t3) );
   putXMMRegLane32( rG, 2, CVT(t2) );
   putXMMRegLane32( rG, 1, CVT(t1) );
   putXMMRegLane32( rG, 0, CVT(t0) );
#  undef CVT
   if (isAvx)
      putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}
   10486 
   10487 
/* Handles 256-bit VCVTPS2DQ and VCVTTPS2DQ: convert the eight F32
   lanes of a YMM register or 256-bit memory operand to eight signed
   I32 lanes in the destination YMM register.  'r2zero' selects the
   truncating ("T") variant, which rounds towards zero; otherwise the
   current SSE rounding mode is used.  Returns the updated delta. */
static Long dis_CVTxPS2DQ_256 ( VexAbiInfo* vbi, Prefix pfx,
                                Long delta, Bool r2zero )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp argV  = newTemp(Ity_V256);
   IRTemp rmode = newTemp(Ity_I32);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1, t2, t3, t4, t5, t6, t7;

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getYMMReg(rE) );
      delta += 1;
      DIP("vcvt%sps2dq %s,%s\n",
          r2zero ? "t" : "", nameYMMReg(rE), nameYMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
      delta += alen;
      DIP("vcvt%sps2dq %s,%s\n",
          r2zero ? "t" : "", dis_buf, nameYMMReg(rG) );
   }

   assign( rmode, r2zero ? mkU32((UInt)Irrm_ZERO)
                         : get_sse_roundingmode() );
   t0 = t1 = t2 = t3 = t4 = t5 = t6 = t7 = IRTemp_INVALID;
   breakupV256to32s( argV, &t7, &t6, &t5, &t4, &t3, &t2, &t1, &t0 );
   /* This is less than ideal.  If it turns out to be a performance
      bottleneck it can be improved. */
   /* Each lane: reinterpret the I32 bits as F32, widen exactly to
      F64, then convert to I32 with the chosen rounding mode. */
#  define CVT(_t)                             \
      binop( Iop_F64toI32S,                   \
             mkexpr(rmode),                   \
             unop( Iop_F32toF64,              \
                   unop( Iop_ReinterpI32asF32, mkexpr(_t))) )

   putYMMRegLane32( rG, 7, CVT(t7) );
   putYMMRegLane32( rG, 6, CVT(t6) );
   putYMMRegLane32( rG, 5, CVT(t5) );
   putYMMRegLane32( rG, 4, CVT(t4) );
   putYMMRegLane32( rG, 3, CVT(t3) );
   putYMMRegLane32( rG, 2, CVT(t2) );
   putYMMRegLane32( rG, 1, CVT(t1) );
   putYMMRegLane32( rG, 0, CVT(t0) );
#  undef CVT

   return delta;
}
   10538 
   10539 
/* Handles 128-bit CVTPD2DQ and CVTTPD2DQ (and their AVX forms):
   convert the two F64 lanes of an XMM register or 128-bit memory
   operand to two signed I32 lanes in the low half of the destination
   XMM register; the upper two lanes are zeroed.  'r2zero' selects the
   truncating ("T") variant, which rounds towards zero; otherwise the
   current SSE rounding mode is used.  The memory form is printed with
   an 'x' mnemonic suffix to make the operand size unambiguous.  The
   AVX form zeroes the upper YMM lane.  Returns the updated delta. */
static Long dis_CVTxPD2DQ_128 ( VexAbiInfo* vbi, Prefix pfx,
                                Long delta, Bool isAvx, Bool r2zero )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp argV  = newTemp(Ity_V128);
   IRTemp rmode = newTemp(Ity_I32);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1;

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getXMMReg(rE) );
      delta += 1;
      DIP("%scvt%spd2dq %s,%s\n",
          isAvx ? "v" : "", r2zero ? "t" : "", nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      DIP("%scvt%spd2dqx %s,%s\n",
          isAvx ? "v" : "", r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
   }

   if (r2zero) {
      assign(rmode, mkU32((UInt)Irrm_ZERO) );
   } else {
      assign( rmode, get_sse_roundingmode() );
   }

   /* Reinterpret the two 64-bit halves of the source as F64s. */
   t0 = newTemp(Ity_F64);
   t1 = newTemp(Ity_F64);
   assign( t0, unop(Iop_ReinterpI64asF64,
                    unop(Iop_V128to64, mkexpr(argV))) );
   assign( t1, unop(Iop_ReinterpI64asF64,
                    unop(Iop_V128HIto64, mkexpr(argV))) );

#  define CVT(_t)  binop( Iop_F64toI32S,                   \
                          mkexpr(rmode),                   \
                          mkexpr(_t) )

   putXMMRegLane32( rG, 3, mkU32(0) );
   putXMMRegLane32( rG, 2, mkU32(0) );
   putXMMRegLane32( rG, 1, CVT(t1) );
   putXMMRegLane32( rG, 0, CVT(t0) );
#  undef CVT
   if (isAvx)
      putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}
   10593 
   10594 
/* Handles 256-bit VCVTPD2DQ and VCVTTPD2DQ: convert the four F64
   lanes of a YMM register or 256-bit memory operand to four signed
   I32 lanes in the low half of the destination register; the upper
   YMM lane is zeroed.  'r2zero' selects the truncating ("T") variant,
   which rounds towards zero; otherwise the current SSE rounding mode
   is used.  The memory form is printed with a 'y' mnemonic suffix to
   make the operand size unambiguous.  Returns the updated delta. */
static Long dis_CVTxPD2DQ_256 ( VexAbiInfo* vbi, Prefix pfx,
                                Long delta, Bool r2zero )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp argV  = newTemp(Ity_V256);
   IRTemp rmode = newTemp(Ity_I32);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1, t2, t3;

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getYMMReg(rE) );
      delta += 1;
      DIP("vcvt%spd2dq %s,%s\n",
          r2zero ? "t" : "", nameYMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
      delta += alen;
      DIP("vcvt%spd2dqy %s,%s\n",
          r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
   }

   if (r2zero) {
      assign(rmode, mkU32((UInt)Irrm_ZERO) );
   } else {
      assign( rmode, get_sse_roundingmode() );
   }

   t0 = IRTemp_INVALID;
   t1 = IRTemp_INVALID;
   t2 = IRTemp_INVALID;
   t3 = IRTemp_INVALID;
   breakupV256to64s( argV, &t3, &t2, &t1, &t0 );

   /* Each lane: reinterpret the I64 bits as F64, then convert to I32
      with the chosen rounding mode. */
#  define CVT(_t)  binop( Iop_F64toI32S,                   \
                          mkexpr(rmode),                   \
                          unop( Iop_ReinterpI64asF64,      \
                                mkexpr(_t) ) )

   putXMMRegLane32( rG, 3, CVT(t3) );
   putXMMRegLane32( rG, 2, CVT(t2) );
   putXMMRegLane32( rG, 1, CVT(t1) );
   putXMMRegLane32( rG, 0, CVT(t0) );
#  undef CVT
   putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}
   10647 
   10648 
/* Handles 128-bit CVTDQ2PS/VCVTDQ2PS: convert the four signed I32
   lanes of an XMM register or 128-bit memory operand to four F32
   lanes in the destination XMM register, using the current SSE
   rounding mode.  The AVX form zeroes the upper YMM lane.  Returns
   the updated delta. */
static Long dis_CVTDQ2PS_128 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp argV  = newTemp(Ity_V128);
   IRTemp rmode = newTemp(Ity_I32);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1, t2, t3;

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getXMMReg(rE) );
      delta += 1;
      DIP("%scvtdq2ps %s,%s\n",
          isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      DIP("%scvtdq2ps %s,%s\n",
          isAvx ? "v" : "", dis_buf, nameXMMReg(rG) );
   }

   assign( rmode, get_sse_roundingmode() );
   t0 = IRTemp_INVALID;
   t1 = IRTemp_INVALID;
   t2 = IRTemp_INVALID;
   t3 = IRTemp_INVALID;
   breakupV128to32s( argV, &t3, &t2, &t1, &t0 );

   /* Each lane: widen I32 -> F64 (exact), then narrow to F32 with
      the current rounding mode. */
#  define CVT(_t)  binop( Iop_F64toF32,                    \
                          mkexpr(rmode),                   \
                          unop(Iop_I32StoF64,mkexpr(_t)))

   putXMMRegLane32F( rG, 3, CVT(t3) );
   putXMMRegLane32F( rG, 2, CVT(t2) );
   putXMMRegLane32F( rG, 1, CVT(t1) );
   putXMMRegLane32F( rG, 0, CVT(t0) );
#  undef CVT
   if (isAvx)
      putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}
   10696 
/* Handles 256-bit VCVTDQ2PS: convert the eight signed I32 lanes of a
   YMM register or 256-bit memory operand to eight F32 lanes in the
   destination YMM register, using the current SSE rounding mode.
   Returns the updated delta. */
static Long dis_CVTDQ2PS_256 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   UChar  modrm  = getUChar(delta);
   IRTemp argV   = newTemp(Ity_V256);
   IRTemp rmode  = newTemp(Ity_I32);
   UInt   rG     = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1, t2, t3, t4, t5, t6, t7;

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getYMMReg(rE) );
      delta += 1;
      DIP("vcvtdq2ps %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
      delta += alen;
      DIP("vcvtdq2ps %s,%s\n", dis_buf, nameYMMReg(rG) );
   }

   assign( rmode, get_sse_roundingmode() );
   t0 = IRTemp_INVALID;
   t1 = IRTemp_INVALID;
   t2 = IRTemp_INVALID;
   t3 = IRTemp_INVALID;
   t4 = IRTemp_INVALID;
   t5 = IRTemp_INVALID;
   t6 = IRTemp_INVALID;
   t7 = IRTemp_INVALID;
   breakupV256to32s( argV, &t7, &t6, &t5, &t4, &t3, &t2, &t1, &t0 );

   /* Each lane: widen I32 -> F64 (exact), then narrow to F32 with
      the current rounding mode. */
#  define CVT(_t)  binop( Iop_F64toF32,                    \
                          mkexpr(rmode),                   \
                          unop(Iop_I32StoF64,mkexpr(_t)))

   putYMMRegLane32F( rG, 7, CVT(t7) );
   putYMMRegLane32F( rG, 6, CVT(t6) );
   putYMMRegLane32F( rG, 5, CVT(t5) );
   putYMMRegLane32F( rG, 4, CVT(t4) );
   putYMMRegLane32F( rG, 3, CVT(t3) );
   putYMMRegLane32F( rG, 2, CVT(t2) );
   putYMMRegLane32F( rG, 1, CVT(t1) );
   putYMMRegLane32F( rG, 0, CVT(t0) );
#  undef CVT

   return delta;
}
   10748 
   10749 
   10750 static Long dis_PMOVMSKB_128 ( VexAbiInfo* vbi, Prefix pfx,
   10751                                Long delta, Bool isAvx )
   10752 {
   10753    UChar modrm = getUChar(delta);
   10754    vassert(epartIsReg(modrm)); /* ensured by caller */
   10755    UInt   rE = eregOfRexRM(pfx,modrm);
   10756    UInt   rG = gregOfRexRM(pfx,modrm);
   10757    IRTemp t0 = newTemp(Ity_V128);
   10758    IRTemp t1 = newTemp(Ity_I32);
   10759    assign(t0, getXMMReg(rE));
   10760    assign(t1, unop(Iop_16Uto32, unop(Iop_GetMSBs8x16, mkexpr(t0))));
   10761    putIReg32(rG, mkexpr(t1));
   10762    DIP("%spmovmskb %s,%s\n", isAvx ? "v" : "", nameXMMReg(rE),
   10763        nameIReg32(rG));
   10764    delta += 1;
   10765    return delta;
   10766 }
   10767 
   10768 
   10769 static Long dis_PMOVMSKB_256 ( VexAbiInfo* vbi, Prefix pfx,
   10770                                Long delta  )
   10771 {
   10772    UChar modrm = getUChar(delta);
   10773    vassert(epartIsReg(modrm)); /* ensured by caller */
   10774    UInt   rE = eregOfRexRM(pfx,modrm);
   10775    UInt   rG = gregOfRexRM(pfx,modrm);
   10776    IRTemp t0 = newTemp(Ity_V128);
   10777    IRTemp t1 = newTemp(Ity_V128);
   10778    IRTemp t2 = newTemp(Ity_I16);
   10779    IRTemp t3 = newTemp(Ity_I16);
   10780    assign(t0, getYMMRegLane128(rE, 0));
   10781    assign(t1, getYMMRegLane128(rE, 1));
   10782    assign(t2, unop(Iop_GetMSBs8x16, mkexpr(t0)));
   10783    assign(t3, unop(Iop_GetMSBs8x16, mkexpr(t1)));
   10784    putIReg32(rG, binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)));
   10785    DIP("vpmovmskb %s,%s\n", nameYMMReg(rE), nameIReg32(rG));
   10786    delta += 1;
   10787    return delta;
   10788 }
   10789 
   10790 
   10791 /* FIXME: why not just use InterleaveLO / InterleaveHI?  I think the
   10792    relevant ops are "xIsH ? InterleaveHI32x4 : InterleaveLO32x4". */
   10793 /* Does the maths for 128 bit versions of UNPCKLPS and UNPCKHPS */
   10794 static IRTemp math_UNPCKxPS_128 ( IRTemp sV, IRTemp dV, Bool xIsH )
   10795 {
   10796    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   10797    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   10798    breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   10799    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   10800    IRTemp res = newTemp(Ity_V128);
   10801    assign(res,  xIsH ? mkV128from32s( s3, d3, s2, d2 )
   10802                      : mkV128from32s( s1, d1, s0, d0 ));
   10803    return res;
   10804 }
   10805 
   10806 
   10807 /* FIXME: why not just use InterleaveLO / InterleaveHI ?? */
   10808 /* Does the maths for 128 bit versions of UNPCKLPD and UNPCKHPD */
   10809 static IRTemp math_UNPCKxPD_128 ( IRTemp sV, IRTemp dV, Bool xIsH )
   10810 {
   10811    IRTemp s1 = newTemp(Ity_I64);
   10812    IRTemp s0 = newTemp(Ity_I64);
   10813    IRTemp d1 = newTemp(Ity_I64);
   10814    IRTemp d0 = newTemp(Ity_I64);
   10815    assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
   10816    assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
   10817    assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
   10818    assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
   10819    IRTemp res = newTemp(Ity_V128);
   10820    assign(res, xIsH ? binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1))
   10821                     : binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)));
   10822    return res;
   10823 }
   10824 
   10825 
   10826 /* Does the maths for 256 bit versions of UNPCKLPD and UNPCKHPD.
   10827    Doesn't seem like this fits in either of the Iop_Interleave{LO,HI}
   10828    or the Iop_Cat{Odd,Even}Lanes idioms, hence just do it the stupid
   10829    way. */
   10830 static IRTemp math_UNPCKxPD_256 ( IRTemp sV, IRTemp dV, Bool xIsH )
   10831 {
   10832    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   10833    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   10834    breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
   10835    breakupV256to64s( sV, &s3, &s2, &s1, &s0 );
   10836    IRTemp res = newTemp(Ity_V256);
   10837    assign(res, xIsH
   10838                ? IRExpr_Qop(Iop_64x4toV256, mkexpr(s3), mkexpr(d3),
   10839                                             mkexpr(s1), mkexpr(d1))
   10840                : IRExpr_Qop(Iop_64x4toV256, mkexpr(s2), mkexpr(d2),
   10841                                             mkexpr(s0), mkexpr(d0)));
   10842    return res;
   10843 }
   10844 
   10845 
   10846 /* FIXME: this is really bad.  Surely can do something better here?
   10847    One observation is that the steering in the upper and lower 128 bit
   10848    halves is the same as with math_UNPCKxPS_128, so we simply split
   10849    into two halves, and use that.  Consequently any improvement in
   10850    math_UNPCKxPS_128 (probably, to use interleave-style primops)
   10851    benefits this too. */
   10852 static IRTemp math_UNPCKxPS_256 ( IRTemp sV, IRTemp dV, Bool xIsH )
   10853 {
   10854    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   10855    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   10856    breakupV256toV128s( sV, &sVhi, &sVlo );
   10857    breakupV256toV128s( dV, &dVhi, &dVlo );
   10858    IRTemp rVhi = math_UNPCKxPS_128(sVhi, dVhi, xIsH);
   10859    IRTemp rVlo = math_UNPCKxPS_128(sVlo, dVlo, xIsH);
   10860    IRTemp rV   = newTemp(Ity_V256);
   10861    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   10862    return rV;
   10863 }
   10864 
   10865 
/* Does the maths for the 128 bit version of SHUFPS.  The result's
   two low 32-bit lanes are selected from dV and the two high lanes
   from sV, each selection steered by a 2-bit field of imm8. */
static IRTemp math_SHUFPS_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
{
   IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   vassert(imm8 < 256);

   breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );

   /* SELD/SELS pick one 32-bit lane of dV resp. sV by index 0..3. */
#  define SELD(n) ((n)==0 ? d0 : ((n)==1 ? d1 : ((n)==2 ? d2 : d3)))
#  define SELS(n) ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   IRTemp res = newTemp(Ity_V128);
   assign(res,
          mkV128from32s( SELS((imm8>>6)&3), SELS((imm8>>4)&3),
                         SELD((imm8>>2)&3), SELD((imm8>>0)&3) ) );
#  undef SELD
#  undef SELS
   return res;
}
   10885 
   10886 
   10887 /* 256-bit SHUFPS appears to steer each of the 128-bit halves
   10888    identically.  Hence do the clueless thing and use math_SHUFPS_128
   10889    twice. */
   10890 static IRTemp math_SHUFPS_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
   10891 {
   10892    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   10893    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   10894    breakupV256toV128s( sV, &sVhi, &sVlo );
   10895    breakupV256toV128s( dV, &dVhi, &dVlo );
   10896    IRTemp rVhi = math_SHUFPS_128(sVhi, dVhi, imm8);
   10897    IRTemp rVlo = math_SHUFPS_128(sVlo, dVlo, imm8);
   10898    IRTemp rV   = newTemp(Ity_V256);
   10899    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   10900    return rV;
   10901 }
   10902 
   10903 
/* Does the maths for the 128 bit version of SHUFPD.  The result's
   low 64-bit lane is dV's lane selected by imm8 bit 0, and the high
   lane is sV's lane selected by imm8 bit 1. */
static IRTemp math_SHUFPD_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
{
   IRTemp s1 = newTemp(Ity_I64);
   IRTemp s0 = newTemp(Ity_I64);
   IRTemp d1 = newTemp(Ity_I64);
   IRTemp d0 = newTemp(Ity_I64);

   assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
   assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
   assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
   assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );

   /* SELD/SELS pick one 64-bit lane of dV resp. sV by index 0..1. */
#  define SELD(n) mkexpr((n)==0 ? d0 : d1)
#  define SELS(n) mkexpr((n)==0 ? s0 : s1)

   IRTemp res = newTemp(Ity_V128);
   assign(res, binop( Iop_64HLtoV128,
                      SELS((imm8>>1)&1), SELD((imm8>>0)&1) ) );

#  undef SELD
#  undef SELS
   return res;
}
   10927 
   10928 
   10929 static IRTemp math_SHUFPD_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
   10930 {
   10931    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   10932    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   10933    breakupV256toV128s( sV, &sVhi, &sVlo );
   10934    breakupV256toV128s( dV, &dVhi, &dVlo );
   10935    IRTemp rVhi = math_SHUFPD_128(sVhi, dVhi, (imm8 >> 2) & 3);
   10936    IRTemp rVlo = math_SHUFPD_128(sVlo, dVlo, imm8 & 3);
   10937    IRTemp rV   = newTemp(Ity_V256);
   10938    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   10939    return rV;
   10940 }
   10941 
   10942 
   10943 static IRTemp math_BLENDPD_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   10944 {
   10945    UShort imm8_mask_16;
   10946    IRTemp imm8_mask = newTemp(Ity_V128);
   10947 
   10948    switch( imm8 & 3 ) {
   10949       case 0:  imm8_mask_16 = 0x0000; break;
   10950       case 1:  imm8_mask_16 = 0x00FF; break;
   10951       case 2:  imm8_mask_16 = 0xFF00; break;
   10952       case 3:  imm8_mask_16 = 0xFFFF; break;
   10953       default: vassert(0);            break;
   10954    }
   10955    assign( imm8_mask, mkV128( imm8_mask_16 ) );
   10956 
   10957    IRTemp res = newTemp(Ity_V128);
   10958    assign ( res, binop( Iop_OrV128,
   10959                         binop( Iop_AndV128, mkexpr(sV),
   10960                                             mkexpr(imm8_mask) ),
   10961                         binop( Iop_AndV128, mkexpr(dV),
   10962                                unop( Iop_NotV128, mkexpr(imm8_mask) ) ) ) );
   10963    return res;
   10964 }
   10965 
   10966 
   10967 static IRTemp math_BLENDPD_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
   10968 {
   10969    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   10970    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   10971    breakupV256toV128s( sV, &sVhi, &sVlo );
   10972    breakupV256toV128s( dV, &dVhi, &dVlo );
   10973    IRTemp rVhi = math_BLENDPD_128(sVhi, dVhi, (imm8 >> 2) & 3);
   10974    IRTemp rVlo = math_BLENDPD_128(sVlo, dVlo, imm8 & 3);
   10975    IRTemp rV   = newTemp(Ity_V256);
   10976    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   10977    return rV;
   10978 }
   10979 
   10980 
   10981 static IRTemp math_BLENDPS_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   10982 {
   10983    UShort imm8_perms[16] = { 0x0000, 0x000F, 0x00F0, 0x00FF, 0x0F00,
   10984                              0x0F0F, 0x0FF0, 0x0FFF, 0xF000, 0xF00F,
   10985                              0xF0F0, 0xF0FF, 0xFF00, 0xFF0F, 0xFFF0,
   10986                              0xFFFF };
   10987    IRTemp imm8_mask = newTemp(Ity_V128);
   10988    assign( imm8_mask, mkV128( imm8_perms[ (imm8 & 15) ] ) );
   10989 
   10990    IRTemp res = newTemp(Ity_V128);
   10991    assign ( res, binop( Iop_OrV128,
   10992                         binop( Iop_AndV128, mkexpr(sV),
   10993                                             mkexpr(imm8_mask) ),
   10994                         binop( Iop_AndV128, mkexpr(dV),
   10995                                unop( Iop_NotV128, mkexpr(imm8_mask) ) ) ) );
   10996    return res;
   10997 }
   10998 
   10999 
   11000 static IRTemp math_BLENDPS_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11001 {
   11002    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   11003    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   11004    breakupV256toV128s( sV, &sVhi, &sVlo );
   11005    breakupV256toV128s( dV, &dVhi, &dVlo );
   11006    IRTemp rVhi = math_BLENDPS_128(sVhi, dVhi, (imm8 >> 4) & 15);
   11007    IRTemp rVlo = math_BLENDPS_128(sVlo, dVlo, imm8 & 15);
   11008    IRTemp rV   = newTemp(Ity_V256);
   11009    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   11010    return rV;
   11011 }
   11012 
   11013 
   11014 static IRTemp math_PBLENDW_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11015 {
   11016    /* Make w be a 16-bit version of imm8, formed by duplicating each
   11017       bit in imm8. */
   11018    Int i;
   11019    UShort imm16 = 0;
   11020    for (i = 0; i < 8; i++) {
   11021       if (imm8 & (1 << i))
   11022          imm16 |= (3 << (2*i));
   11023    }
   11024    IRTemp imm16_mask = newTemp(Ity_V128);
   11025    assign( imm16_mask, mkV128( imm16 ));
   11026 
   11027    IRTemp res = newTemp(Ity_V128);
   11028    assign ( res, binop( Iop_OrV128,
   11029                         binop( Iop_AndV128, mkexpr(sV),
   11030                                             mkexpr(imm16_mask) ),
   11031                         binop( Iop_AndV128, mkexpr(dV),
   11032                                unop( Iop_NotV128, mkexpr(imm16_mask) ) ) ) );
   11033    return res;
   11034 }
   11035 
   11036 
/* Does the maths for 128 bit PMULUDQ: unsigned widening multiply of
   the even-numbered (0th and 2nd) 32-bit lanes of dV and sV, giving
   two 64-bit products.  The odd-lane temps exist only to satisfy
   breakupV128to32s and are otherwise unused. */
static IRTemp math_PMULUDQ_128 ( IRTemp sV, IRTemp dV )
{
   /* This is a really poor translation -- could be improved if
      performance critical */
   IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   IRTemp res = newTemp(Ity_V128);
   assign(res, binop(Iop_64HLtoV128,
                     binop( Iop_MullU32, mkexpr(d2), mkexpr(s2)),
                     binop( Iop_MullU32, mkexpr(d0), mkexpr(s0)) ));
   return res;
}
   11051 
   11052 
   11053 static IRTemp math_PMULUDQ_256 ( IRTemp sV, IRTemp dV )
   11054 {
   11055    /* This is a really poor translation -- could be improved if
   11056       performance critical */
   11057    IRTemp sHi, sLo, dHi, dLo;
   11058    sHi = sLo = dHi = dLo = IRTemp_INVALID;
   11059    breakupV256toV128s( dV, &dHi, &dLo);
   11060    breakupV256toV128s( sV, &sHi, &sLo);
   11061    IRTemp res = newTemp(Ity_V256);
   11062    assign(res, binop(Iop_V128HLtoV256,
   11063                      mkexpr(math_PMULUDQ_128(sHi, dHi)),
   11064                      mkexpr(math_PMULUDQ_128(sLo, dLo))));
   11065    return res;
   11066 }
   11067 
   11068 
/* Does the maths for 128 bit PMULDQ: signed widening multiply of the
   even-numbered (0th and 2nd) 32-bit lanes of dV and sV, giving two
   64-bit products.  NB: parameter order is (dV, sV), the reverse of
   math_PMULUDQ_128.  The odd-lane temps exist only to satisfy
   breakupV128to32s and are otherwise unused. */
static IRTemp math_PMULDQ_128 ( IRTemp dV, IRTemp sV )
{
   /* This is a really poor translation -- could be improved if
      performance critical */
   IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   IRTemp res = newTemp(Ity_V128);
   assign(res, binop(Iop_64HLtoV128,
                     binop( Iop_MullS32, mkexpr(d2), mkexpr(s2)),
                     binop( Iop_MullS32, mkexpr(d0), mkexpr(s0)) ));
   return res;
}
   11083 
   11084 
   11085 static IRTemp math_PMULDQ_256 ( IRTemp sV, IRTemp dV )
   11086 {
   11087    /* This is a really poor translation -- could be improved if
   11088       performance critical */
   11089    IRTemp sHi, sLo, dHi, dLo;
   11090    sHi = sLo = dHi = dLo = IRTemp_INVALID;
   11091    breakupV256toV128s( dV, &dHi, &dLo);
   11092    breakupV256toV128s( sV, &sHi, &sLo);
   11093    IRTemp res = newTemp(Ity_V256);
   11094    assign(res, binop(Iop_V128HLtoV256,
   11095                      mkexpr(math_PMULDQ_128(sHi, dHi)),
   11096                      mkexpr(math_PMULDQ_128(sLo, dLo))));
   11097    return res;
   11098 }
   11099 
   11100 
/* Does the maths for 128 bit PMADDWD, by applying the clean helper
   amd64g_calculate_mmx_pmaddwd to each 64-bit half of the two
   operands and recombining the two 64-bit results. */
static IRTemp math_PMADDWD_128 ( IRTemp dV, IRTemp sV )
{
   IRTemp sVhi, sVlo, dVhi, dVlo;
   IRTemp resHi = newTemp(Ity_I64);
   IRTemp resLo = newTemp(Ity_I64);
   sVhi = sVlo = dVhi = dVlo = IRTemp_INVALID;
   breakupV128to64s( sV, &sVhi, &sVlo );
   breakupV128to64s( dV, &dVhi, &dVlo );
   assign( resHi, mkIRExprCCall(Ity_I64, 0/*regparms*/,
                                "amd64g_calculate_mmx_pmaddwd",
                                &amd64g_calculate_mmx_pmaddwd,
                                mkIRExprVec_2( mkexpr(sVhi), mkexpr(dVhi))));
   assign( resLo, mkIRExprCCall(Ity_I64, 0/*regparms*/,
                                "amd64g_calculate_mmx_pmaddwd",
                                &amd64g_calculate_mmx_pmaddwd,
                                mkIRExprVec_2( mkexpr(sVlo), mkexpr(dVlo))));
   IRTemp res = newTemp(Ity_V128);
   assign( res, binop(Iop_64HLtoV128, mkexpr(resHi), mkexpr(resLo))) ;
   return res;
}
   11121 
   11122 
   11123 static IRTemp math_PMADDWD_256 ( IRTemp dV, IRTemp sV )
   11124 {
   11125    IRTemp sHi, sLo, dHi, dLo;
   11126    sHi = sLo = dHi = dLo = IRTemp_INVALID;
   11127    breakupV256toV128s( dV, &dHi, &dLo);
   11128    breakupV256toV128s( sV, &sHi, &sLo);
   11129    IRTemp res = newTemp(Ity_V256);
   11130    assign(res, binop(Iop_V128HLtoV256,
   11131                      mkexpr(math_PMADDWD_128(dHi, sHi)),
   11132                      mkexpr(math_PMADDWD_128(dLo, sLo))));
   11133    return res;
   11134 }
   11135 
   11136 
/* Does the maths for 128 bit ADDSUBPD: the result's high 64-bit lane
   comes from the addition (dV + sV) and the low lane from the
   subtraction (dV - sV). */
static IRTemp math_ADDSUBPD_128 ( IRTemp dV, IRTemp sV )
{
   IRTemp addV = newTemp(Ity_V128);
   IRTemp subV = newTemp(Ity_V128);
   IRTemp a1   = newTemp(Ity_I64);
   IRTemp s0   = newTemp(Ity_I64);
   IRTemp rm   = newTemp(Ity_I32);

   assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   assign( addV, triop(Iop_Add64Fx2, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   assign( subV, triop(Iop_Sub64Fx2, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );

   /* High lane of the sum, low lane of the difference. */
   assign( a1, unop(Iop_V128HIto64, mkexpr(addV) ));
   assign( s0, unop(Iop_V128to64,   mkexpr(subV) ));

   IRTemp res = newTemp(Ity_V128);
   assign( res, binop(Iop_64HLtoV128, mkexpr(a1), mkexpr(s0)) );
   return res;
}
   11156 
   11157 
/* Does the maths for 256 bit ADDSUBPD: odd-numbered 64-bit lanes of
   the result come from the addition and even-numbered lanes from the
   subtraction. */
static IRTemp math_ADDSUBPD_256 ( IRTemp dV, IRTemp sV )
{
   IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
   IRTemp addV = newTemp(Ity_V256);
   IRTemp subV = newTemp(Ity_V256);
   IRTemp rm   = newTemp(Ity_I32);
   a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;

   assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   assign( addV, triop(Iop_Add64Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   assign( subV, triop(Iop_Sub64Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );

   breakupV256to64s( addV, &a3, &a2, &a1, &a0 );
   breakupV256to64s( subV, &s3, &s2, &s1, &s0 );

   /* Interleave: add lanes at odd positions, sub lanes at even. */
   IRTemp res = newTemp(Ity_V256);
   assign( res, mkV256from64s( a3, s2, a1, s0 ) );
   return res;
}
   11177 
   11178 
/* Does the maths for 128 bit ADDSUBPS: odd-numbered 32-bit lanes of
   the result come from the addition (dV + sV) and even-numbered
   lanes from the subtraction (dV - sV). */
static IRTemp math_ADDSUBPS_128 ( IRTemp dV, IRTemp sV )
{
   IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
   IRTemp addV = newTemp(Ity_V128);
   IRTemp subV = newTemp(Ity_V128);
   IRTemp rm   = newTemp(Ity_I32);
   a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;

   assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   assign( addV, triop(Iop_Add32Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   assign( subV, triop(Iop_Sub32Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );

   breakupV128to32s( addV, &a3, &a2, &a1, &a0 );
   breakupV128to32s( subV, &s3, &s2, &s1, &s0 );

   /* Interleave: add lanes at odd positions, sub lanes at even. */
   IRTemp res = newTemp(Ity_V128);
   assign( res, mkV128from32s( a3, s2, a1, s0 ) );
   return res;
}
   11198 
   11199 
/* Does the maths for 256 bit ADDSUBPS: odd-numbered 32-bit lanes of
   the result come from the addition and even-numbered lanes from the
   subtraction. */
static IRTemp math_ADDSUBPS_256 ( IRTemp dV, IRTemp sV )
{
   IRTemp a7, a6, a5, a4, a3, a2, a1, a0;
   IRTemp s7, s6, s5, s4, s3, s2, s1, s0;
   IRTemp addV = newTemp(Ity_V256);
   IRTemp subV = newTemp(Ity_V256);
   IRTemp rm   = newTemp(Ity_I32);
   a7 = a6 = a5 = a4 = a3 = a2 = a1 = a0 = IRTemp_INVALID;
   s7 = s6 = s5 = s4 = s3 = s2 = s1 = s0 = IRTemp_INVALID;

   assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   assign( addV, triop(Iop_Add32Fx8, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   assign( subV, triop(Iop_Sub32Fx8, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );

   breakupV256to32s( addV, &a7, &a6, &a5, &a4, &a3, &a2, &a1, &a0 );
   breakupV256to32s( subV, &s7, &s6, &s5, &s4, &s3, &s2, &s1, &s0 );

   /* Interleave: add lanes at odd positions, sub lanes at even. */
   IRTemp res = newTemp(Ity_V256);
   assign( res, mkV256from32s( a7, s6, a5, s4, a3, s2, a1, s0 ) );
   return res;
}
   11221 
   11222 
/* Handle 128 bit PSHUFLW (xIsH == False) and PSHUFHW (xIsH == True):
   permute the four 16-bit lanes of one 64-bit half of the source
   under control of imm8, leaving the other half unchanged.  Returns
   the updated instruction-stream offset (delta). */
static Long dis_PSHUFxW_128 ( VexAbiInfo* vbi, Prefix pfx,
                              Long delta, Bool isAvx, Bool xIsH )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   UInt   rG = gregOfRexRM(pfx,modrm);
   UInt   imm8;
   IRTemp sVmut, dVmut, sVcon, sV, dV, s3, s2, s1, s0;
   s3 = s2 = s1 = s0 = IRTemp_INVALID;
   sV    = newTemp(Ity_V128);
   dV    = newTemp(Ity_V128);
   sVmut = newTemp(Ity_I64);
   dVmut = newTemp(Ity_I64);
   sVcon = newTemp(Ity_I64);
   /* Fetch the source (register or memory) and the trailing imm8. */
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( sV, getXMMReg(rE) );
      imm8 = (UInt)getUChar(delta+1);
      delta += 1+1;
      DIP("%spshuf%cw $%u,%s,%s\n",
          isAvx ? "v" : "", xIsH ? 'h' : 'l',
          imm8, nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
      imm8 = (UInt)getUChar(delta+alen);
      delta += alen+1;
      DIP("%spshuf%cw $%u,%s,%s\n",
          isAvx ? "v" : "", xIsH ? 'h' : 'l',
          imm8, dis_buf, nameXMMReg(rG));
   }

   /* Get the to-be-changed (mut) and unchanging (con) bits of the
      source. */
   assign( sVmut, unop(xIsH ? Iop_V128HIto64 : Iop_V128to64,   mkexpr(sV)) );
   assign( sVcon, unop(xIsH ? Iop_V128to64   : Iop_V128HIto64, mkexpr(sV)) );

   /* Permute the four 16-bit lanes of the mutable half per imm8. */
   breakup64to16s( sVmut, &s3, &s2, &s1, &s0 );
#  define SEL(n) \
             ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   assign(dVmut, mk64from16s( SEL((imm8>>6)&3), SEL((imm8>>4)&3),
                              SEL((imm8>>2)&3), SEL((imm8>>0)&3) ));
#  undef SEL

   /* Reassemble: permuted half goes back where it came from. */
   assign(dV, xIsH ? binop(Iop_64HLtoV128, mkexpr(dVmut), mkexpr(sVcon))
                   : binop(Iop_64HLtoV128, mkexpr(sVcon), mkexpr(dVmut)) );

   /* AVX form zeroes the upper lane of the YMM register. */
   (isAvx ? putYMMRegLoAndZU : putXMMReg)(rG, mkexpr(dV));
   return delta;
}
   11276 
   11277 
/* Handle 256 bit PSHUFLW (xIsH == False) and PSHUFHW (xIsH == True):
   same permutation as the 128-bit form, applied independently to
   each 128-bit half of the YMM source.  Returns the updated
   instruction-stream offset (delta). */
static Long dis_PSHUFxW_256 ( VexAbiInfo* vbi, Prefix pfx,
                              Long delta, Bool xIsH )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   UInt   rG = gregOfRexRM(pfx,modrm);
   UInt   imm8;
   IRTemp sV, s[8], sV64[4], dVhi, dVlo;
   sV64[3] = sV64[2] = sV64[1] = sV64[0] = IRTemp_INVALID;
   s[7] = s[6] = s[5] = s[4] = s[3] = s[2] = s[1] = s[0] = IRTemp_INVALID;
   sV    = newTemp(Ity_V256);
   dVhi  = newTemp(Ity_I64);
   dVlo  = newTemp(Ity_I64);
   /* Fetch the source (register or memory) and the trailing imm8. */
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( sV, getYMMReg(rE) );
      imm8 = (UInt)getUChar(delta+1);
      delta += 1+1;
      DIP("vpshuf%cw $%u,%s,%s\n", xIsH ? 'h' : 'l',
          imm8, nameYMMReg(rE), nameYMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
      assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
      imm8 = (UInt)getUChar(delta+alen);
      delta += alen+1;
      DIP("vpshuf%cw $%u,%s,%s\n", xIsH ? 'h' : 'l',
          imm8, dis_buf, nameYMMReg(rG));
   }

   /* s[7..4] are the 16-bit lanes of the mutable 64-bit chunk of the
      upper 128-bit half, s[3..0] those of the lower half. */
   breakupV256to64s( sV, &sV64[3], &sV64[2], &sV64[1], &sV64[0] );
   breakup64to16s( sV64[xIsH ? 3 : 2], &s[7], &s[6], &s[5], &s[4] );
   breakup64to16s( sV64[xIsH ? 1 : 0], &s[3], &s[2], &s[1], &s[0] );

   /* Apply the same imm8 steering to both halves. */
   assign( dVhi, mk64from16s( s[4 + ((imm8>>6)&3)], s[4 + ((imm8>>4)&3)],
                              s[4 + ((imm8>>2)&3)], s[4 + ((imm8>>0)&3)] ) );
   assign( dVlo, mk64from16s( s[0 + ((imm8>>6)&3)], s[0 + ((imm8>>4)&3)],
                              s[0 + ((imm8>>2)&3)], s[0 + ((imm8>>0)&3)] ) );
   /* Reassemble, keeping the untouched 64-bit chunks in place. */
   putYMMReg( rG, mkV256from64s( xIsH ? dVhi : sV64[3],
                                 xIsH ? sV64[2] : dVhi,
                                 xIsH ? dVlo : sV64[1],
                                 xIsH ? sV64[0] : dVlo ) );
   return delta;
}
   11324 
   11325 
/* Handle (V)PEXTRW with a register E operand only: extract the
   16-bit lane selected by imm8 bits 2:0 from xmm rE and write it,
   zero-extended, to the 32-bit register rG.  The memory-operand form
   is rejected by returning the unmodified delta. */
static Long dis_PEXTRW_128_EregOnly_toG ( VexAbiInfo* vbi, Prefix pfx,
                                          Long delta, Bool isAvx )
{
   Long   deltaIN = delta;
   UChar  modrm   = getUChar(delta);
   UInt   rG      = gregOfRexRM(pfx,modrm);
   IRTemp sV      = newTemp(Ity_V128);
   IRTemp d16     = newTemp(Ity_I16);
   UInt   imm8;
   IRTemp s0, s1, s2, s3;
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign(sV, getXMMReg(rE));
      imm8 = getUChar(delta+1) & 7;
      delta += 1+1;
      DIP("%spextrw $%d,%s,%s\n", isAvx ? "v" : "",
          (Int)imm8, nameXMMReg(rE), nameIReg32(rG));
   } else {
      /* The memory case is disallowed, apparently. */
      return deltaIN; /* FAIL */
   }
   /* Split into four 32-bit lanes, then pick the relevant 16-bit
      half of the relevant lane. */
   s3 = s2 = s1 = s0 = IRTemp_INVALID;
   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   switch (imm8) {
      case 0:  assign(d16, unop(Iop_32to16,   mkexpr(s0))); break;
      case 1:  assign(d16, unop(Iop_32HIto16, mkexpr(s0))); break;
      case 2:  assign(d16, unop(Iop_32to16,   mkexpr(s1))); break;
      case 3:  assign(d16, unop(Iop_32HIto16, mkexpr(s1))); break;
      case 4:  assign(d16, unop(Iop_32to16,   mkexpr(s2))); break;
      case 5:  assign(d16, unop(Iop_32HIto16, mkexpr(s2))); break;
      case 6:  assign(d16, unop(Iop_32to16,   mkexpr(s3))); break;
      case 7:  assign(d16, unop(Iop_32HIto16, mkexpr(s3))); break;
      default: vassert(0);
   }
   putIReg32(rG, unop(Iop_16Uto32, mkexpr(d16)));
   return delta;
}
   11363 
   11364 
/* Handle (V)CVTDQ2PD (128-bit form): convert the two signed 32-bit
   integers in the low 64 bits of the source to two F64 values in
   xmm rG.  The AVX form additionally zeroes the upper 128 bits of
   the YMM register. */
static Long dis_CVTDQ2PD_128 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp arg64 = newTemp(Ity_I64);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   const HChar* mbV   = isAvx ? "v" : "";
   /* Source: low 64 bits of xmm rE, or a 64-bit load. */
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( arg64, getXMMRegLane64(rE, 0) );
      delta += 1;
      DIP("%scvtdq2pd %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
      delta += alen;
      DIP("%scvtdq2pd %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
   }
   /* Low 32 bits -> F64 lane 0, high 32 bits -> F64 lane 1.
      I32StoF64 is exact, so no rounding mode is needed. */
   putXMMRegLane64F(
      rG, 0,
      unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)))
   );
   putXMMRegLane64F(
      rG, 1,
      unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)))
   );
   if (isAvx)
      putYMMRegLane128(rG, 1, mkV128(0));
   return delta;
}
   11398 
   11399 
/* Handle (V)STMXCSR: store a synthesised 32-bit MXCSR value to the
   memory operand.  Only the rounding-mode bits are tracked by the
   guest state; a clean helper fabricates the rest. */
static Long dis_STMXCSR ( VexAbiInfo* vbi, Prefix pfx,
                          Long delta, Bool isAvx )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   vassert(!epartIsReg(modrm)); /* ensured by caller */
   vassert(gregOfRexRM(pfx,modrm) == 3); /* ditto */

   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   delta += alen;

   /* Fake up a native SSE mxcsr word.  The only thing it depends on
      is SSEROUND[1:0], so call a clean helper to cook it up.
   */
   /* ULong amd64h_create_mxcsr ( ULong sseround ) */
   DIP("%sstmxcsr %s\n",  isAvx ? "v" : "", dis_buf);
   storeLE(
      mkexpr(addr),
      unop(Iop_64to32,
           mkIRExprCCall(
              Ity_I64, 0/*regp*/,
              "amd64g_create_mxcsr", &amd64g_create_mxcsr,
              mkIRExprVec_1( unop(Iop_32Uto64,get_sse_roundingmode()) )
           )
      )
   );
   return delta;
}
   11430 
   11431 
/* Handle (V)LDMXCSR: load a 32-bit MXCSR value from memory.  Only
   the rounding-mode field is retained in the guest state; a clean
   helper validates the word and may produce an emulation warning,
   in which case we side-exit to the next instruction so the
   dispatcher can report it. */
static Long dis_LDMXCSR ( VexAbiInfo* vbi, Prefix pfx,
                          Long delta, Bool isAvx )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   vassert(!epartIsReg(modrm)); /* ensured by caller */
   vassert(gregOfRexRM(pfx,modrm) == 2); /* ditto */

   IRTemp t64 = newTemp(Ity_I64);
   IRTemp ew  = newTemp(Ity_I32);

   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   delta += alen;
   DIP("%sldmxcsr %s\n",  isAvx ? "v" : "", dis_buf);

   /* The only thing we observe in %mxcsr is the rounding mode.
      Therefore, pass the 32-bit value (SSE native-format control
      word) to a clean helper, getting back a 64-bit value, the
      lower half of which is the SSEROUND value to store, and the
      upper half of which is the emulation-warning token which may
      be generated.
   */
   /* ULong amd64h_check_ldmxcsr ( ULong ); */
   assign( t64, mkIRExprCCall(
                   Ity_I64, 0/*regparms*/,
                   "amd64g_check_ldmxcsr",
                   &amd64g_check_ldmxcsr,
                   mkIRExprVec_1(
                      unop(Iop_32Uto64,
                           loadLE(Ity_I32, mkexpr(addr))
                      )
                   )
                )
         );

   put_sse_roundingmode( unop(Iop_64to32, mkexpr(t64)) );
   assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
   put_emwarn( mkexpr(ew) );
   /* Finally, if an emulation warning was reported, side-exit to
      the next insn, reporting the warning, so that Valgrind's
      dispatcher sees the warning. */
   stmt(
      IRStmt_Exit(
         binop(Iop_CmpNE64, unop(Iop_32Uto64,mkexpr(ew)), mkU64(0)),
         Ijk_EmWarn,
         IRConst_U64(guest_RIP_bbstart+delta),
         OFFB_RIP
      )
   );
   return delta;
}
   11485 
   11486 
   11487 static IRTemp math_PINSRW_128 ( IRTemp v128, IRTemp u16, UInt imm8 )
   11488 {
   11489    vassert(imm8 >= 0 && imm8 <= 7);
   11490 
   11491    // Create a V128 value which has the selected word in the
   11492    // specified lane, and zeroes everywhere else.
   11493    IRTemp tmp128    = newTemp(Ity_V128);
   11494    IRTemp halfshift = newTemp(Ity_I64);
   11495    assign(halfshift, binop(Iop_Shl64,
   11496                            unop(Iop_16Uto64, mkexpr(u16)),
   11497                            mkU8(16 * (imm8 & 3))));
   11498    if (imm8 < 4) {
   11499       assign(tmp128, binop(Iop_64HLtoV128, mkU64(0), mkexpr(halfshift)));
   11500    } else {
   11501       assign(tmp128, binop(Iop_64HLtoV128, mkexpr(halfshift), mkU64(0)));
   11502    }
   11503 
   11504    UShort mask = ~(3 << (imm8 * 2));
   11505    IRTemp res  = newTemp(Ity_V128);
   11506    assign( res, binop(Iop_OrV128,
   11507                       mkexpr(tmp128),
   11508                       binop(Iop_AndV128, mkexpr(v128), mkV128(mask))) );
   11509    return res;
   11510 }
   11511 
   11512 
/* Does the maths for 128 bit PSADBW, by applying the clean helper
   amd64g_calculate_mmx_psadbw to each 64-bit half of the operands
   and recombining the two 64-bit results. */
static IRTemp math_PSADBW_128 ( IRTemp dV, IRTemp sV )
{
   IRTemp s1, s0, d1, d0;
   s1 = s0 = d1 = d0 = IRTemp_INVALID;

   breakupV128to64s( sV, &s1, &s0 );
   breakupV128to64s( dV, &d1, &d0 );

   IRTemp res = newTemp(Ity_V128);
   assign( res,
           binop(Iop_64HLtoV128,
                 mkIRExprCCall(Ity_I64, 0/*regparms*/,
                               "amd64g_calculate_mmx_psadbw",
                               &amd64g_calculate_mmx_psadbw,
                               mkIRExprVec_2( mkexpr(s1), mkexpr(d1))),
                 mkIRExprCCall(Ity_I64, 0/*regparms*/,
                               "amd64g_calculate_mmx_psadbw",
                               &amd64g_calculate_mmx_psadbw,
                               mkIRExprVec_2( mkexpr(s0), mkexpr(d0)))) );
   return res;
}
   11534 
   11535 
   11536 static IRTemp math_PSADBW_256 ( IRTemp dV, IRTemp sV )
   11537 {
   11538    IRTemp sHi, sLo, dHi, dLo;
   11539    sHi = sLo = dHi = dLo = IRTemp_INVALID;
   11540    breakupV256toV128s( dV, &dHi, &dLo);
   11541    breakupV256toV128s( sV, &sHi, &sLo);
   11542    IRTemp res = newTemp(Ity_V256);
   11543    assign(res, binop(Iop_V128HLtoV256,
   11544                      mkexpr(math_PSADBW_128(dHi, sHi)),
   11545                      mkexpr(math_PSADBW_128(dLo, sLo))));
   11546    return res;
   11547 }
   11548 
   11549 
   11550 static Long dis_MASKMOVDQU ( VexAbiInfo* vbi, Prefix pfx,
   11551                              Long delta, Bool isAvx )
   11552 {
   11553    IRTemp regD    = newTemp(Ity_V128);
   11554    IRTemp mask    = newTemp(Ity_V128);
   11555    IRTemp olddata = newTemp(Ity_V128);
   11556    IRTemp newdata = newTemp(Ity_V128);
   11557    IRTemp addr    = newTemp(Ity_I64);
   11558    UChar  modrm   = getUChar(delta);
   11559    UInt   rG      = gregOfRexRM(pfx,modrm);
   11560    UInt   rE      = eregOfRexRM(pfx,modrm);
   11561 
   11562    assign( addr, handleAddrOverrides( vbi, pfx, getIReg64(R_RDI) ));
   11563    assign( regD, getXMMReg( rG ));
   11564 
   11565    /* Unfortunately can't do the obvious thing with SarN8x16
   11566       here since that can't be re-emitted as SSE2 code - no such
   11567       insn. */
   11568    assign( mask,
   11569            binop(Iop_64HLtoV128,
   11570                  binop(Iop_SarN8x8,
   11571                        getXMMRegLane64( eregOfRexRM(pfx,modrm), 1 ),
   11572                        mkU8(7) ),
   11573                  binop(Iop_SarN8x8,
   11574                        getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ),
   11575                        mkU8(7) ) ));
   11576    assign( olddata, loadLE( Ity_V128, mkexpr(addr) ));
   11577    assign( newdata, binop(Iop_OrV128,
   11578                           binop(Iop_AndV128,
   11579                                 mkexpr(regD),
   11580                                 mkexpr(mask) ),
   11581                           binop(Iop_AndV128,
   11582                                 mkexpr(olddata),
   11583                                 unop(Iop_NotV128, mkexpr(mask)))) );
   11584    storeLE( mkexpr(addr), mkexpr(newdata) );
   11585 
   11586    delta += 1;
   11587    DIP("%smaskmovdqu %s,%s\n", isAvx ? "v" : "",
   11588        nameXMMReg(rE), nameXMMReg(rG) );
   11589    return delta;
   11590 }
   11591 
   11592 
   11593 static Long dis_MOVMSKPS_128 ( VexAbiInfo* vbi, Prefix pfx,
   11594                                Long delta, Bool isAvx )
   11595 {
   11596    UChar modrm = getUChar(delta);
   11597    UInt   rG   = gregOfRexRM(pfx,modrm);
   11598    UInt   rE   = eregOfRexRM(pfx,modrm);
   11599    IRTemp t0   = newTemp(Ity_I32);
   11600    IRTemp t1   = newTemp(Ity_I32);
   11601    IRTemp t2   = newTemp(Ity_I32);
   11602    IRTemp t3   = newTemp(Ity_I32);
   11603    delta += 1;
   11604    assign( t0, binop( Iop_And32,
   11605                       binop(Iop_Shr32, getXMMRegLane32(rE,0), mkU8(31)),
   11606                       mkU32(1) ));
   11607    assign( t1, binop( Iop_And32,
   11608                       binop(Iop_Shr32, getXMMRegLane32(rE,1), mkU8(30)),
   11609                       mkU32(2) ));
   11610    assign( t2, binop( Iop_And32,
   11611                       binop(Iop_Shr32, getXMMRegLane32(rE,2), mkU8(29)),
   11612                       mkU32(4) ));
   11613    assign( t3, binop( Iop_And32,
   11614                       binop(Iop_Shr32, getXMMRegLane32(rE,3), mkU8(28)),
   11615                       mkU32(8) ));
   11616    putIReg32( rG, binop(Iop_Or32,
   11617                         binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
   11618                         binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ) );
   11619    DIP("%smovmskps %s,%s\n", isAvx ? "v" : "",
   11620        nameXMMReg(rE), nameIReg32(rG));
   11621    return delta;
   11622 }
   11623 
   11624 
   11625 static Long dis_MOVMSKPS_256 ( VexAbiInfo* vbi, Prefix pfx, Long delta )
   11626 {
   11627    UChar modrm = getUChar(delta);
   11628    UInt   rG   = gregOfRexRM(pfx,modrm);
   11629    UInt   rE   = eregOfRexRM(pfx,modrm);
   11630    IRTemp t0   = newTemp(Ity_I32);
   11631    IRTemp t1   = newTemp(Ity_I32);
   11632    IRTemp t2   = newTemp(Ity_I32);
   11633    IRTemp t3   = newTemp(Ity_I32);
   11634    IRTemp t4   = newTemp(Ity_I32);
   11635    IRTemp t5   = newTemp(Ity_I32);
   11636    IRTemp t6   = newTemp(Ity_I32);
   11637    IRTemp t7   = newTemp(Ity_I32);
   11638    delta += 1;
   11639    assign( t0, binop( Iop_And32,
   11640                       binop(Iop_Shr32, getYMMRegLane32(rE,0), mkU8(31)),
   11641                       mkU32(1) ));
   11642    assign( t1, binop( Iop_And32,
   11643                       binop(Iop_Shr32, getYMMRegLane32(rE,1), mkU8(30)),
   11644                       mkU32(2) ));
   11645    assign( t2, binop( Iop_And32,
   11646                       binop(Iop_Shr32, getYMMRegLane32(rE,2), mkU8(29)),
   11647                       mkU32(4) ));
   11648    assign( t3, binop( Iop_And32,
   11649                       binop(Iop_Shr32, getYMMRegLane32(rE,3), mkU8(28)),
   11650                       mkU32(8) ));
   11651    assign( t4, binop( Iop_And32,
   11652                       binop(Iop_Shr32, getYMMRegLane32(rE,4), mkU8(27)),
   11653                       mkU32(16) ));
   11654    assign( t5, binop( Iop_And32,
   11655                       binop(Iop_Shr32, getYMMRegLane32(rE,5), mkU8(26)),
   11656                       mkU32(32) ));
   11657    assign( t6, binop( Iop_And32,
   11658                       binop(Iop_Shr32, getYMMRegLane32(rE,6), mkU8(25)),
   11659                       mkU32(64) ));
   11660    assign( t7, binop( Iop_And32,
   11661                       binop(Iop_Shr32, getYMMRegLane32(rE,7), mkU8(24)),
   11662                       mkU32(128) ));
   11663    putIReg32( rG, binop(Iop_Or32,
   11664                         binop(Iop_Or32,
   11665                               binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
   11666                               binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ),
   11667                         binop(Iop_Or32,
   11668                               binop(Iop_Or32, mkexpr(t4), mkexpr(t5)),
   11669                               binop(Iop_Or32, mkexpr(t6), mkexpr(t7)) ) ) );
   11670    DIP("vmovmskps %s,%s\n", nameYMMReg(rE), nameIReg32(rG));
   11671    return delta;
   11672 }
   11673 
   11674 
   11675 static Long dis_MOVMSKPD_128 ( VexAbiInfo* vbi, Prefix pfx,
   11676                                Long delta, Bool isAvx )
   11677 {
   11678    UChar modrm = getUChar(delta);
   11679    UInt   rG   = gregOfRexRM(pfx,modrm);
   11680    UInt   rE   = eregOfRexRM(pfx,modrm);
   11681    IRTemp t0   = newTemp(Ity_I32);
   11682    IRTemp t1   = newTemp(Ity_I32);
   11683    delta += 1;
   11684    assign( t0, binop( Iop_And32,
   11685                       binop(Iop_Shr32, getXMMRegLane32(rE,1), mkU8(31)),
   11686                       mkU32(1) ));
   11687    assign( t1, binop( Iop_And32,
   11688                       binop(Iop_Shr32, getXMMRegLane32(rE,3), mkU8(30)),
   11689                       mkU32(2) ));
   11690    putIReg32( rG, binop(Iop_Or32, mkexpr(t0), mkexpr(t1) ) );
   11691    DIP("%smovmskpd %s,%s\n", isAvx ? "v" : "",
   11692        nameXMMReg(rE), nameIReg32(rG));
   11693    return delta;
   11694 }
   11695 
   11696 
   11697 static Long dis_MOVMSKPD_256 ( VexAbiInfo* vbi, Prefix pfx, Long delta )
   11698 {
   11699    UChar modrm = getUChar(delta);
   11700    UInt   rG   = gregOfRexRM(pfx,modrm);
   11701    UInt   rE   = eregOfRexRM(pfx,modrm);
   11702    IRTemp t0   = newTemp(Ity_I32);
   11703    IRTemp t1   = newTemp(Ity_I32);
   11704    IRTemp t2   = newTemp(Ity_I32);
   11705    IRTemp t3   = newTemp(Ity_I32);
   11706    delta += 1;
   11707    assign( t0, binop( Iop_And32,
   11708                       binop(Iop_Shr32, getYMMRegLane32(rE,1), mkU8(31)),
   11709                       mkU32(1) ));
   11710    assign( t1, binop( Iop_And32,
   11711                       binop(Iop_Shr32, getYMMRegLane32(rE,3), mkU8(30)),
   11712                       mkU32(2) ));
   11713    assign( t2, binop( Iop_And32,
   11714                       binop(Iop_Shr32, getYMMRegLane32(rE,5), mkU8(29)),
   11715                       mkU32(4) ));
   11716    assign( t3, binop( Iop_And32,
   11717                       binop(Iop_Shr32, getYMMRegLane32(rE,7), mkU8(28)),
   11718                       mkU32(8) ));
   11719    putIReg32( rG, binop(Iop_Or32,
   11720                         binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
   11721                         binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ) );
   11722    DIP("vmovmskps %s,%s\n", nameYMMReg(rE), nameIReg32(rG));
   11723    return delta;
   11724 }
   11725 
   11726 
   11727 /* Note, this also handles SSE(1) insns. */
   11728 __attribute__((noinline))
   11729 static
   11730 Long dis_ESC_0F__SSE2 ( Bool* decode_OK,
   11731                         VexAbiInfo* vbi,
   11732                         Prefix pfx, Int sz, Long deltaIN,
   11733                         DisResult* dres )
   11734 {
   11735    IRTemp addr  = IRTemp_INVALID;
   11736    IRTemp t0    = IRTemp_INVALID;
   11737    IRTemp t1    = IRTemp_INVALID;
   11738    IRTemp t2    = IRTemp_INVALID;
   11739    IRTemp t3    = IRTemp_INVALID;
   11740    IRTemp t4    = IRTemp_INVALID;
   11741    IRTemp t5    = IRTemp_INVALID;
   11742    IRTemp t6    = IRTemp_INVALID;
   11743    UChar  modrm = 0;
   11744    Int    alen  = 0;
   11745    HChar  dis_buf[50];
   11746 
   11747    *decode_OK = False;
   11748 
   11749    Long   delta = deltaIN;
   11750    UChar  opc   = getUChar(delta);
   11751    delta++;
   11752    switch (opc) {
   11753 
   11754    case 0x10:
   11755       if (have66noF2noF3(pfx)
   11756           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   11757          /* 66 0F 10 = MOVUPD -- move from E (mem or xmm) to G (xmm). */
   11758          modrm = getUChar(delta);
   11759          if (epartIsReg(modrm)) {
   11760             putXMMReg( gregOfRexRM(pfx,modrm),
   11761                        getXMMReg( eregOfRexRM(pfx,modrm) ));
   11762             DIP("movupd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   11763                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   11764             delta += 1;
   11765          } else {
   11766             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11767             putXMMReg( gregOfRexRM(pfx,modrm),
   11768                        loadLE(Ity_V128, mkexpr(addr)) );
   11769             DIP("movupd %s,%s\n", dis_buf,
   11770                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   11771             delta += alen;
   11772          }
   11773          goto decode_success;
   11774       }
   11775       /* F2 0F 10 = MOVSD -- move 64 bits from E (mem or lo half xmm) to
   11776          G (lo half xmm).  If E is mem, upper half of G is zeroed out.
   11777          If E is reg, upper half of G is unchanged. */
   11778       if (haveF2no66noF3(pfx)
   11779           && (sz == 4 || /* ignore redundant REX.W */ sz == 8) ) {
   11780          modrm = getUChar(delta);
   11781          if (epartIsReg(modrm)) {
   11782             putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
   11783                              getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
   11784             DIP("movsd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   11785                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   11786             delta += 1;
   11787          } else {
   11788             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11789             putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
   11790             putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
   11791                              loadLE(Ity_I64, mkexpr(addr)) );
   11792             DIP("movsd %s,%s\n", dis_buf,
   11793                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   11794             delta += alen;
   11795          }
   11796          goto decode_success;
   11797       }
   11798       /* F3 0F 10 = MOVSS -- move 32 bits from E (mem or lo 1/4 xmm) to G
   11799          (lo 1/4 xmm).  If E is mem, upper 3/4 of G is zeroed out. */
   11800       if (haveF3no66noF2(pfx)
   11801           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   11802          modrm = getUChar(delta);
   11803          if (epartIsReg(modrm)) {
   11804             putXMMRegLane32( gregOfRexRM(pfx,modrm), 0,
   11805                              getXMMRegLane32( eregOfRexRM(pfx,modrm), 0 ));
   11806             DIP("movss %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   11807                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   11808             delta += 1;
   11809          } else {
   11810             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11811             putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
   11812             putXMMRegLane32( gregOfRexRM(pfx,modrm), 0,
   11813                              loadLE(Ity_I32, mkexpr(addr)) );
   11814             DIP("movss %s,%s\n", dis_buf,
   11815                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   11816             delta += alen;
   11817          }
   11818          goto decode_success;
   11819       }
   11820       /* 0F 10 = MOVUPS -- move from E (mem or xmm) to G (xmm). */
   11821       if (haveNo66noF2noF3(pfx)
   11822           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   11823          modrm = getUChar(delta);
   11824          if (epartIsReg(modrm)) {
   11825             putXMMReg( gregOfRexRM(pfx,modrm),
   11826                        getXMMReg( eregOfRexRM(pfx,modrm) ));
   11827             DIP("movups %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   11828                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   11829             delta += 1;
   11830          } else {
   11831             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11832             putXMMReg( gregOfRexRM(pfx,modrm),
   11833                        loadLE(Ity_V128, mkexpr(addr)) );
   11834             DIP("movups %s,%s\n", dis_buf,
   11835                                      nameXMMReg(gregOfRexRM(pfx,modrm)));
   11836             delta += alen;
   11837          }
   11838          goto decode_success;
   11839       }
   11840       break;
   11841 
   11842    case 0x11:
   11843       /* F2 0F 11 = MOVSD -- move 64 bits from G (lo half xmm) to E (mem
   11844          or lo half xmm). */
   11845       if (haveF2no66noF3(pfx)
   11846           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   11847          modrm = getUChar(delta);
   11848          if (epartIsReg(modrm)) {
   11849             putXMMRegLane64( eregOfRexRM(pfx,modrm), 0,
   11850                              getXMMRegLane64( gregOfRexRM(pfx,modrm), 0 ));
   11851             DIP("movsd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   11852                                  nameXMMReg(eregOfRexRM(pfx,modrm)));
   11853             delta += 1;
   11854          } else {
   11855             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11856             storeLE( mkexpr(addr),
   11857                      getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) );
   11858             DIP("movsd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   11859                                  dis_buf);
   11860             delta += alen;
   11861          }
   11862          goto decode_success;
   11863       }
   11864       /* F3 0F 11 = MOVSS -- move 32 bits from G (lo 1/4 xmm) to E (mem
   11865          or lo 1/4 xmm). */
   11866       if (haveF3no66noF2(pfx) && sz == 4) {
   11867          modrm = getUChar(delta);
   11868          if (epartIsReg(modrm)) {
   11869             /* fall through, we don't yet have a test case */
   11870          } else {
   11871             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11872             storeLE( mkexpr(addr),
   11873                      getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) );
   11874             DIP("movss %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   11875                                  dis_buf);
   11876             delta += alen;
   11877             goto decode_success;
   11878          }
   11879       }
   11880       /* 66 0F 11 = MOVUPD -- move from G (xmm) to E (mem or xmm). */
   11881       if (have66noF2noF3(pfx)
   11882           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   11883          modrm = getUChar(delta);
   11884          if (epartIsReg(modrm)) {
   11885             putXMMReg( eregOfRexRM(pfx,modrm),
   11886                        getXMMReg( gregOfRexRM(pfx,modrm) ) );
   11887             DIP("movupd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   11888                                   nameXMMReg(eregOfRexRM(pfx,modrm)));
   11889             delta += 1;
   11890          } else {
   11891             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11892             storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   11893             DIP("movupd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   11894                                   dis_buf );
   11895             delta += alen;
   11896          }
   11897          goto decode_success;
   11898       }
   11899       /* 0F 11 = MOVUPS -- move from G (xmm) to E (mem or xmm). */
   11900       if (haveNo66noF2noF3(pfx)
   11901           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   11902          modrm = getUChar(delta);
   11903          if (epartIsReg(modrm)) {
   11904             /* fall through; awaiting test case */
   11905          } else {
   11906             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11907             storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   11908             DIP("movups %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   11909                                   dis_buf );
   11910             delta += alen;
   11911             goto decode_success;
   11912          }
   11913       }
   11914       break;
   11915 
   11916    case 0x12:
   11917       /* 66 0F 12 = MOVLPD -- move from mem to low half of XMM. */
   11918       /* Identical to MOVLPS ? */
   11919       if (have66noF2noF3(pfx)
   11920           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   11921          modrm = getUChar(delta);
   11922          if (epartIsReg(modrm)) {
   11923             /* fall through; apparently reg-reg is not possible */
   11924          } else {
   11925             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11926             delta += alen;
   11927             putXMMRegLane64( gregOfRexRM(pfx,modrm),
   11928                              0/*lower lane*/,
   11929                              loadLE(Ity_I64, mkexpr(addr)) );
   11930             DIP("movlpd %s, %s\n",
   11931                 dis_buf, nameXMMReg( gregOfRexRM(pfx,modrm) ));
   11932             goto decode_success;
   11933          }
   11934       }
   11935       /* 0F 12 = MOVLPS -- move from mem to low half of XMM. */
      /* 0F 12 = MOVHLPS -- move from hi half to lo half of XMM. */
   11937       if (haveNo66noF2noF3(pfx)
   11938           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   11939          modrm = getUChar(delta);
   11940          if (epartIsReg(modrm)) {
   11941             delta += 1;
   11942             putXMMRegLane64( gregOfRexRM(pfx,modrm),
   11943                              0/*lower lane*/,
   11944                              getXMMRegLane64( eregOfRexRM(pfx,modrm), 1 ));
   11945             DIP("movhlps %s, %s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   11946                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   11947          } else {
   11948             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11949             delta += alen;
   11950             putXMMRegLane64( gregOfRexRM(pfx,modrm),  0/*lower lane*/,
   11951                              loadLE(Ity_I64, mkexpr(addr)) );
   11952             DIP("movlps %s, %s\n",
   11953                 dis_buf, nameXMMReg( gregOfRexRM(pfx,modrm) ));
   11954          }
   11955          goto decode_success;
   11956       }
   11957       break;
   11958 
   11959    case 0x13:
   11960       /* 0F 13 = MOVLPS -- move from low half of XMM to mem. */
   11961       if (haveNo66noF2noF3(pfx)
   11962           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   11963          modrm = getUChar(delta);
   11964          if (!epartIsReg(modrm)) {
   11965             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11966             delta += alen;
   11967             storeLE( mkexpr(addr),
   11968                      getXMMRegLane64( gregOfRexRM(pfx,modrm),
   11969                                       0/*lower lane*/ ) );
   11970             DIP("movlps %s, %s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
   11971                                    dis_buf);
   11972             goto decode_success;
   11973          }
   11974          /* else fall through */
   11975       }
   11976       /* 66 0F 13 = MOVLPD -- move from low half of XMM to mem. */
   11977       /* Identical to MOVLPS ? */
   11978       if (have66noF2noF3(pfx)
   11979           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   11980          modrm = getUChar(delta);
   11981          if (!epartIsReg(modrm)) {
   11982             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11983             delta += alen;
   11984             storeLE( mkexpr(addr),
   11985                      getXMMRegLane64( gregOfRexRM(pfx,modrm),
   11986                                       0/*lower lane*/ ) );
   11987             DIP("movlpd %s, %s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
   11988                                    dis_buf);
   11989             goto decode_success;
   11990          }
   11991          /* else fall through */
   11992       }
   11993       break;
   11994 
   11995    case 0x14:
   11996    case 0x15:
   11997       /* 0F 14 = UNPCKLPS -- unpack and interleave low part F32s */
   11998       /* 0F 15 = UNPCKHPS -- unpack and interleave high part F32s */
   11999       /* These just appear to be special cases of SHUFPS */
   12000       if (haveNo66noF2noF3(pfx) && sz == 4) {
   12001          Bool   hi = toBool(opc == 0x15);
   12002          IRTemp sV = newTemp(Ity_V128);
   12003          IRTemp dV = newTemp(Ity_V128);
   12004          modrm = getUChar(delta);
   12005          UInt   rG = gregOfRexRM(pfx,modrm);
   12006          assign( dV, getXMMReg(rG) );
   12007          if (epartIsReg(modrm)) {
   12008             UInt rE = eregOfRexRM(pfx,modrm);
   12009             assign( sV, getXMMReg(rE) );
   12010             delta += 1;
   12011             DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   12012                 nameXMMReg(rE), nameXMMReg(rG));
   12013          } else {
   12014             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12015             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12016             delta += alen;
   12017             DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   12018                 dis_buf, nameXMMReg(rG));
   12019          }
   12020          IRTemp res = math_UNPCKxPS_128( sV, dV, hi );
   12021          putXMMReg( rG, mkexpr(res) );
   12022          goto decode_success;
   12023       }
   12024       /* 66 0F 15 = UNPCKHPD -- unpack and interleave high part F64s */
   12025       /* 66 0F 14 = UNPCKLPD -- unpack and interleave low part F64s */
   12026       /* These just appear to be special cases of SHUFPS */
   12027       if (have66noF2noF3(pfx)
   12028           && sz == 2 /* could be 8 if rex also present */) {
   12029          Bool   hi = toBool(opc == 0x15);
   12030          IRTemp sV = newTemp(Ity_V128);
   12031          IRTemp dV = newTemp(Ity_V128);
   12032          modrm = getUChar(delta);
   12033          UInt   rG = gregOfRexRM(pfx,modrm);
   12034          assign( dV, getXMMReg(rG) );
   12035          if (epartIsReg(modrm)) {
   12036             UInt rE = eregOfRexRM(pfx,modrm);
   12037             assign( sV, getXMMReg(rE) );
   12038             delta += 1;
   12039             DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   12040                 nameXMMReg(rE), nameXMMReg(rG));
   12041          } else {
   12042             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12043             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   12044             delta += alen;
   12045             DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
   12046                 dis_buf, nameXMMReg(rG));
   12047          }
   12048          IRTemp res = math_UNPCKxPD_128( sV, dV, hi );
   12049          putXMMReg( rG, mkexpr(res) );
   12050          goto decode_success;
   12051       }
   12052       break;
   12053 
   12054    case 0x16:
   12055       /* 66 0F 16 = MOVHPD -- move from mem to high half of XMM. */
      /* This seems identical to MOVHPS.  This instruction encoding is
   12057          completely crazy. */
   12058       if (have66noF2noF3(pfx)
   12059           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   12060          modrm = getUChar(delta);
   12061          if (epartIsReg(modrm)) {
   12062             /* fall through; apparently reg-reg is not possible */
   12063          } else {
   12064             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12065             delta += alen;
   12066             putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
   12067                              loadLE(Ity_I64, mkexpr(addr)) );
   12068             DIP("movhpd %s,%s\n", dis_buf,
   12069                                   nameXMMReg( gregOfRexRM(pfx,modrm) ));
   12070             goto decode_success;
   12071          }
   12072       }
   12073       /* 0F 16 = MOVHPS -- move from mem to high half of XMM. */
   12074       /* 0F 16 = MOVLHPS -- move from lo half to hi half of XMM. */
   12075       if (haveNo66noF2noF3(pfx)
   12076           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12077          modrm = getUChar(delta);
   12078          if (epartIsReg(modrm)) {
   12079             delta += 1;
   12080             putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
   12081                              getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ) );
   12082             DIP("movhps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12083                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   12084          } else {
   12085             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12086             delta += alen;
   12087             putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
   12088                              loadLE(Ity_I64, mkexpr(addr)) );
   12089             DIP("movhps %s,%s\n", dis_buf,
   12090                                   nameXMMReg( gregOfRexRM(pfx,modrm) ));
   12091          }
   12092          goto decode_success;
   12093       }
   12094       break;
   12095 
   12096    case 0x17:
   12097       /* 0F 17 = MOVHPS -- move from high half of XMM to mem. */
   12098       if (haveNo66noF2noF3(pfx)
   12099           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12100          modrm = getUChar(delta);
   12101          if (!epartIsReg(modrm)) {
   12102             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12103             delta += alen;
   12104             storeLE( mkexpr(addr),
   12105                      getXMMRegLane64( gregOfRexRM(pfx,modrm),
   12106                                       1/*upper lane*/ ) );
   12107             DIP("movhps %s,%s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
   12108                                   dis_buf);
   12109             goto decode_success;
   12110          }
   12111          /* else fall through */
   12112       }
   12113       /* 66 0F 17 = MOVHPD -- move from high half of XMM to mem. */
   12114       /* Again, this seems identical to MOVHPS. */
   12115       if (have66noF2noF3(pfx)
   12116           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   12117          modrm = getUChar(delta);
   12118          if (!epartIsReg(modrm)) {
   12119             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12120             delta += alen;
   12121             storeLE( mkexpr(addr),
   12122                      getXMMRegLane64( gregOfRexRM(pfx,modrm),
   12123                                       1/*upper lane*/ ) );
   12124             DIP("movhpd %s,%s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
   12125                                   dis_buf);
   12126             goto decode_success;
   12127          }
   12128          /* else fall through */
   12129       }
   12130       break;
   12131 
   12132    case 0x18:
   12133       /* 0F 18 /0 = PREFETCHNTA -- prefetch into caches, */
   12134       /* 0F 18 /1 = PREFETCH0   -- with various different hints */
   12135       /* 0F 18 /2 = PREFETCH1 */
   12136       /* 0F 18 /3 = PREFETCH2 */
   12137       if (haveNo66noF2noF3(pfx)
   12138           && !epartIsReg(getUChar(delta))
   12139           && gregLO3ofRM(getUChar(delta)) >= 0
   12140           && gregLO3ofRM(getUChar(delta)) <= 3) {
   12141          const HChar* hintstr = "??";
   12142 
   12143          modrm = getUChar(delta);
   12144          vassert(!epartIsReg(modrm));
   12145 
   12146          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12147          delta += alen;
   12148 
   12149          switch (gregLO3ofRM(modrm)) {
   12150             case 0: hintstr = "nta"; break;
   12151             case 1: hintstr = "t0"; break;
   12152             case 2: hintstr = "t1"; break;
   12153             case 3: hintstr = "t2"; break;
   12154             default: vassert(0);
   12155          }
   12156 
   12157          DIP("prefetch%s %s\n", hintstr, dis_buf);
   12158          goto decode_success;
   12159       }
   12160       break;
   12161 
   12162    case 0x28:
   12163       /* 66 0F 28 = MOVAPD -- move from E (mem or xmm) to G (xmm). */
   12164       if (have66noF2noF3(pfx)
   12165           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   12166          modrm = getUChar(delta);
   12167          if (epartIsReg(modrm)) {
   12168             putXMMReg( gregOfRexRM(pfx,modrm),
   12169                        getXMMReg( eregOfRexRM(pfx,modrm) ));
   12170             DIP("movapd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12171                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   12172             delta += 1;
   12173          } else {
   12174             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12175             gen_SEGV_if_not_16_aligned( addr );
   12176             putXMMReg( gregOfRexRM(pfx,modrm),
   12177                        loadLE(Ity_V128, mkexpr(addr)) );
   12178             DIP("movapd %s,%s\n", dis_buf,
   12179                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   12180             delta += alen;
   12181          }
   12182          goto decode_success;
   12183       }
   12184       /* 0F 28 = MOVAPS -- move from E (mem or xmm) to G (xmm). */
   12185       if (haveNo66noF2noF3(pfx)
   12186           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   12187          modrm = getUChar(delta);
   12188          if (epartIsReg(modrm)) {
   12189             putXMMReg( gregOfRexRM(pfx,modrm),
   12190                        getXMMReg( eregOfRexRM(pfx,modrm) ));
   12191             DIP("movaps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   12192                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   12193             delta += 1;
   12194          } else {
   12195             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12196             gen_SEGV_if_not_16_aligned( addr );
   12197             putXMMReg( gregOfRexRM(pfx,modrm),
   12198                        loadLE(Ity_V128, mkexpr(addr)) );
   12199             DIP("movaps %s,%s\n", dis_buf,
   12200                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   12201             delta += alen;
   12202          }
   12203          goto decode_success;
   12204       }
   12205       break;
   12206 
   case 0x29:
      /* 0F 29 = MOVAPS -- move from G (xmm) to E (mem or xmm).
         This is the store-direction twin of 0F 28. */
      if (haveNo66noF2noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            putXMMReg( eregOfRexRM(pfx,modrm),
                       getXMMReg( gregOfRexRM(pfx,modrm) ));
            DIP("movaps %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                  nameXMMReg(eregOfRexRM(pfx,modrm)));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            /* Aligned-store form: fault the guest on a misaligned
               destination, as real hardware would. */
            gen_SEGV_if_not_16_aligned( addr );
            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movaps %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                  dis_buf );
            delta += alen;
         }
         goto decode_success;
      }
      /* 66 0F 29 = MOVAPD -- move from G (xmm) to E (mem or xmm). */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            putXMMReg( eregOfRexRM(pfx,modrm),
                       getXMMReg( gregOfRexRM(pfx,modrm) ) );
            DIP("movapd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                  nameXMMReg(eregOfRexRM(pfx,modrm)));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movapd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                  dis_buf );
            delta += alen;
         }
         goto decode_success;
      }
      break;
   12249 
   case 0x2A:
      /* 0F 2A = CVTPI2PS -- convert 2 x I32 in mem/mmx to 2 x F32 in low
         half xmm */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         IRTemp arg64 = newTemp(Ity_I64);
         IRTemp rmode = newTemp(Ity_I32);

         modrm = getUChar(delta);
         /* Source may be an MMX register, so enter MMX mode first. */
         do_MMX_preamble();
         if (epartIsReg(modrm)) {
            assign( arg64, getMMXReg(eregLO3ofRM(modrm)) );
            delta += 1;
            DIP("cvtpi2ps %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
            delta += alen;
            DIP("cvtpi2ps %s,%s\n", dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)) );
         }

         assign( rmode, get_sse_roundingmode() );

         /* I32 -> F64 is exact, so rounding happens only at the final
            F64 -> F32 step, under the SSE rounding mode. */
         putXMMRegLane32F(
            gregOfRexRM(pfx,modrm), 0,
            binop(Iop_F64toF32,
                  mkexpr(rmode),
                  unop(Iop_I32StoF64,
                       unop(Iop_64to32, mkexpr(arg64)) )) );

         putXMMRegLane32F(
            gregOfRexRM(pfx,modrm), 1,
            binop(Iop_F64toF32,
                  mkexpr(rmode),
                  unop(Iop_I32StoF64,
                       unop(Iop_64HIto32, mkexpr(arg64)) )) );

         goto decode_success;
      }
      /* F3 0F 2A = CVTSI2SS
         -- sz==4: convert I32 in mem/ireg to F32 in low quarter xmm
         -- sz==8: convert I64 in mem/ireg to F32 in low quarter xmm */
      if (haveF3no66noF2(pfx) && (sz == 4 || sz == 8)) {
         IRTemp rmode = newTemp(Ity_I32);
         assign( rmode, get_sse_roundingmode() );
         modrm = getUChar(delta);
         if (sz == 4) {
            IRTemp arg32 = newTemp(Ity_I32);
            if (epartIsReg(modrm)) {
               assign( arg32, getIReg32(eregOfRexRM(pfx,modrm)) );
               delta += 1;
               DIP("cvtsi2ss %s,%s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
                                       nameXMMReg(gregOfRexRM(pfx,modrm)));
            } else {
               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
               assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
               delta += alen;
               DIP("cvtsi2ss %s,%s\n", dis_buf,
                                       nameXMMReg(gregOfRexRM(pfx,modrm)) );
            }
            putXMMRegLane32F(
               gregOfRexRM(pfx,modrm), 0,
               binop(Iop_F64toF32,
                     mkexpr(rmode),
                     unop(Iop_I32StoF64, mkexpr(arg32)) ) );
         } else {
            /* sz == 8 */
            IRTemp arg64 = newTemp(Ity_I64);
            if (epartIsReg(modrm)) {
               assign( arg64, getIReg64(eregOfRexRM(pfx,modrm)) );
               delta += 1;
               DIP("cvtsi2ssq %s,%s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
            } else {
               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
               assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
               delta += alen;
               DIP("cvtsi2ssq %s,%s\n", dis_buf,
                                        nameXMMReg(gregOfRexRM(pfx,modrm)) );
            }
            /* I64 -> F64 may itself round, hence rmode on both steps. */
            putXMMRegLane32F(
               gregOfRexRM(pfx,modrm), 0,
               binop(Iop_F64toF32,
                     mkexpr(rmode),
                     binop(Iop_I64StoF64, mkexpr(rmode), mkexpr(arg64)) ) );
         }
         goto decode_success;
      }
      /* F2 0F 2A = CVTSI2SD
         when sz==4 -- convert I32 in mem/ireg to F64 in low half xmm
         when sz==8 -- convert I64 in mem/ireg to F64 in low half xmm
      */
      if (haveF2no66noF3(pfx) && (sz == 4 || sz == 8)) {
         modrm = getUChar(delta);
         if (sz == 4) {
            IRTemp arg32 = newTemp(Ity_I32);
            if (epartIsReg(modrm)) {
               assign( arg32, getIReg32(eregOfRexRM(pfx,modrm)) );
               delta += 1;
               DIP("cvtsi2sdl %s,%s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
            } else {
               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
               assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
               delta += alen;
               DIP("cvtsi2sdl %s,%s\n", dis_buf,
                                        nameXMMReg(gregOfRexRM(pfx,modrm)) );
            }
            /* I32 -> F64 is exact, so no rounding mode is needed. */
            putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
                              unop(Iop_I32StoF64, mkexpr(arg32))
            );
         } else {
            /* sz == 8 */
            IRTemp arg64 = newTemp(Ity_I64);
            if (epartIsReg(modrm)) {
               assign( arg64, getIReg64(eregOfRexRM(pfx,modrm)) );
               delta += 1;
               DIP("cvtsi2sdq %s,%s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
            } else {
               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
               assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
               delta += alen;
               DIP("cvtsi2sdq %s,%s\n", dis_buf,
                                        nameXMMReg(gregOfRexRM(pfx,modrm)) );
            }
            putXMMRegLane64F(
               gregOfRexRM(pfx,modrm),
               0,
               binop( Iop_I64StoF64,
                      get_sse_roundingmode(),
                      mkexpr(arg64)
               )
            );
         }
         goto decode_success;
      }
      /* 66 0F 2A = CVTPI2PD -- convert 2 x I32 in mem/mmx to 2 x F64 in
         xmm(G) */
      if (have66noF2noF3(pfx) && sz == 2) {
         IRTemp arg64 = newTemp(Ity_I64);

         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            /* Only switch to MMX mode if the source is a MMX register.
               This is inconsistent with all other instructions which
               convert between XMM and (M64 or MMX), which always switch
               to MMX mode even if 64-bit operand is M64 and not MMX.  At
               least, that's what the Intel docs seem to me to say.
               Fixes #210264. */
            do_MMX_preamble();
            assign( arg64, getMMXReg(eregLO3ofRM(modrm)) );
            delta += 1;
            DIP("cvtpi2pd %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
            delta += alen;
            DIP("cvtpi2pd %s,%s\n", dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)) );
         }

         /* Both I32 -> F64 conversions are exact; no rmode needed. */
         putXMMRegLane64F(
            gregOfRexRM(pfx,modrm), 0,
            unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)) )
         );

         putXMMRegLane64F(
            gregOfRexRM(pfx,modrm), 1,
            unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)) )
         );

         goto decode_success;
      }
      break;
   12427 
   case 0x2B:
      /* 66 0F 2B = MOVNTPD -- for us, just a plain SSE store. */
      /* 0F 2B = MOVNTPS -- for us, just a plain SSE store. */
      /* The non-temporal cache hint is ignored; only the aligned store
         semantics are modelled. */
      if ( (haveNo66noF2noF3(pfx) && sz == 4)
           || (have66noF2noF3(pfx) && sz == 2) ) {
         modrm = getUChar(delta);
         if (!epartIsReg(modrm)) {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movntp%s %s,%s\n", sz==2 ? "d" : "s",
                                    dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += alen;
            goto decode_success;
         }
         /* else fall through: a register E operand is not a valid
            encoding for MOVNTPS/MOVNTPD. */
      }
      break;
   12447 
   12448    case 0x2C:
   12449    case 0x2D:
   12450       /* 0F 2D = CVTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
   12451          I32 in mmx, according to prevailing SSE rounding mode */
   12452       /* 0F 2C = CVTTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
   12453          I32 in mmx, rounding towards zero */
   12454       if (haveNo66noF2noF3(pfx) && sz == 4) {
   12455          IRTemp dst64  = newTemp(Ity_I64);
   12456          IRTemp rmode  = newTemp(Ity_I32);
   12457          IRTemp f32lo  = newTemp(Ity_F32);
   12458          IRTemp f32hi  = newTemp(Ity_F32);
   12459          Bool   r2zero = toBool(opc == 0x2C);
   12460 
   12461          do_MMX_preamble();
   12462          modrm = getUChar(delta);
   12463 
   12464          if (epartIsReg(modrm)) {
   12465             delta += 1;
   12466             assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
   12467             assign(f32hi, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 1));
   12468             DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
   12469                                       nameXMMReg(eregOfRexRM(pfx,modrm)),
   12470                                       nameMMXReg(gregLO3ofRM(modrm)));
   12471          } else {
   12472             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12473             assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
   12474             assign(f32hi, loadLE(Ity_F32, binop( Iop_Add64,
   12475                                                  mkexpr(addr),
   12476                                                  mkU64(4) )));
   12477             delta += alen;
   12478             DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
   12479                                       dis_buf,
   12480                                       nameMMXReg(gregLO3ofRM(modrm)));
   12481          }
   12482 
   12483          if (r2zero) {
   12484             assign(rmode, mkU32((UInt)Irrm_ZERO) );
   12485          } else {
   12486             assign( rmode, get_sse_roundingmode() );
   12487          }
   12488 
   12489          assign(
   12490             dst64,
   12491             binop( Iop_32HLto64,
   12492                    binop( Iop_F64toI32S,
   12493                           mkexpr(rmode),
   12494                           unop( Iop_F32toF64, mkexpr(f32hi) ) ),
   12495                    binop( Iop_F64toI32S,
   12496                           mkexpr(rmode),
   12497                           unop( Iop_F32toF64, mkexpr(f32lo) ) )
   12498                  )
   12499          );
   12500 
   12501          putMMXReg(gregLO3ofRM(modrm), mkexpr(dst64));
   12502          goto decode_success;
   12503       }
   12504       /* F3 0F 2D = CVTSS2SI
   12505          when sz==4 -- convert F32 in mem/low quarter xmm to I32 in ireg,
   12506                        according to prevailing SSE rounding mode
   12507          when sz==8 -- convert F32 in mem/low quarter xmm to I64 in ireg,
   12508                        according to prevailing SSE rounding mode
   12509       */
   12510       /* F3 0F 2C = CVTTSS2SI
   12511          when sz==4 -- convert F32 in mem/low quarter xmm to I32 in ireg,
   12512                        truncating towards zero
   12513          when sz==8 -- convert F32 in mem/low quarter xmm to I64 in ireg,
   12514                        truncating towards zero
   12515       */
   12516       if (haveF3no66noF2(pfx) && (sz == 4 || sz == 8)) {
   12517          delta = dis_CVTxSS2SI( vbi, pfx, delta, False/*!isAvx*/, opc, sz);
   12518          goto decode_success;
   12519       }
   12520       /* F2 0F 2D = CVTSD2SI
   12521          when sz==4 -- convert F64 in mem/low half xmm to I32 in ireg,
   12522                        according to prevailing SSE rounding mode
   12523          when sz==8 -- convert F64 in mem/low half xmm to I64 in ireg,
   12524                        according to prevailing SSE rounding mode
   12525       */
   12526       /* F2 0F 2C = CVTTSD2SI
   12527          when sz==4 -- convert F64 in mem/low half xmm to I32 in ireg,
   12528                        truncating towards zero
   12529          when sz==8 -- convert F64 in mem/low half xmm to I64 in ireg,
   12530                        truncating towards zero
   12531       */
   12532       if (haveF2no66noF3(pfx) && (sz == 4 || sz == 8)) {
   12533          delta = dis_CVTxSD2SI( vbi, pfx, delta, False/*!isAvx*/, opc, sz);
   12534          goto decode_success;
   12535       }
   12536       /* 66 0F 2D = CVTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
   12537          I32 in mmx, according to prevailing SSE rounding mode */
   12538       /* 66 0F 2C = CVTTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
   12539          I32 in mmx, rounding towards zero */
   12540       if (have66noF2noF3(pfx) && sz == 2) {
   12541          IRTemp dst64  = newTemp(Ity_I64);
   12542          IRTemp rmode  = newTemp(Ity_I32);
   12543          IRTemp f64lo  = newTemp(Ity_F64);
   12544          IRTemp f64hi  = newTemp(Ity_F64);
   12545          Bool   r2zero = toBool(opc == 0x2C);
   12546 
   12547          do_MMX_preamble();
   12548          modrm = getUChar(delta);
   12549 
   12550          if (epartIsReg(modrm)) {
   12551             delta += 1;
   12552             assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
   12553             assign(f64hi, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 1));
   12554             DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
   12555                                       nameXMMReg(eregOfRexRM(pfx,modrm)),
   12556                                       nameMMXReg(gregLO3ofRM(modrm)));
   12557          } else {
   12558             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12559             assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
   12560             assign(f64hi, loadLE(Ity_F64, binop( Iop_Add64,
   12561                                                  mkexpr(addr),
   12562                                                  mkU64(8) )));
   12563             delta += alen;
   12564             DIP("cvt%spf2pi %s,%s\n", r2zero ? "t" : "",
   12565                                       dis_buf,
   12566                                       nameMMXReg(gregLO3ofRM(modrm)));
   12567          }
   12568 
   12569          if (r2zero) {
   12570             assign(rmode, mkU32((UInt)Irrm_ZERO) );
   12571          } else {
   12572             assign( rmode, get_sse_roundingmode() );
   12573          }
   12574 
   12575          assign(
   12576             dst64,
   12577             binop( Iop_32HLto64,
   12578                    binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64hi) ),
   12579                    binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo) )
   12580                  )
   12581          );
   12582 
   12583          putMMXReg(gregLO3ofRM(modrm), mkexpr(dst64));
   12584          goto decode_success;
   12585       }
   12586       break;
   12587 
   case 0x2E:
   case 0x2F:
      /* 66 0F 2E = UCOMISD -- 64F0x2 comparison G,E, and set ZCP */
      /* 66 0F 2F = COMISD  -- 64F0x2 comparison G,E, and set ZCP */
      /* The opc byte is forwarded so the helper can distinguish the
         (U)COMIS variants itself. */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_COMISD( vbi, pfx, delta, False/*!isAvx*/, opc );
         goto decode_success;
      }
      /* 0F 2E = UCOMISS -- 32F0x4 comparison G,E, and set ZCP */
      /* 0F 2F = COMISS  -- 32F0x4 comparison G,E, and set ZCP */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_COMISS( vbi, pfx, delta, False/*!isAvx*/, opc );
         goto decode_success;
      }
      break;
   12603 
   case 0x50:
      /* 0F 50 = MOVMSKPS - move 4 sign bits from 4 x F32 in xmm(E)
         to 4 lowest bits of ireg(G) */
      /* Note the extra epartIsReg guard: E must be a register for this
         encoding; a memory E falls through to decode failure. */
      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
          && epartIsReg(getUChar(delta))) {
         /* sz == 8 is a kludge to handle insns with REX.W redundantly
            set to 1, which has been known to happen:

            4c 0f 50 d9             rex64X movmskps %xmm1,%r11d

            20071106: Intel docs say that REX.W isn't redundant: when
            present, a 64-bit register is written; when not present, only
            the 32-bit half is written.  However, testing on a Core2
            machine suggests the entire 64 bit register is written
            irrespective of the status of REX.W.  That could be because
            of the default rule that says "if the lower half of a 32-bit
            register is written, the upper half is zeroed".  By using
            putIReg32 here we inadvertantly produce the same behaviour as
            the Core2, for the same reason -- putIReg32 implements said
            rule.

            AMD docs give no indication that REX.W is even valid for this
            insn. */
         delta = dis_MOVMSKPS_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      /* 66 0F 50 = MOVMSKPD - move 2 sign bits from 2 x F64 in xmm(E) to
         2 lowest bits of ireg(G) */
      if (have66noF2noF3(pfx) && (sz == 2 || sz == 8)) {
         /* sz == 8 is a kludge to handle insns with REX.W redundantly
            set to 1, which has been known to happen:
            66 4c 0f 50 d9          rex64X movmskpd %xmm1,%r11d
            20071106: see further comments on MOVMSKPS implementation above.
         */
         delta = dis_MOVMSKPD_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;
   12642 
   case 0x51:
      /* F3 0F 51 = SQRTSS -- approx sqrt 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
                                            "sqrtss", Iop_Sqrt32F0x4 );
         goto decode_success;
      }
      /* 0F 51 = SQRTPS -- approx sqrt 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
                                           "sqrtps", Iop_Sqrt32Fx4 );
         goto decode_success;
      }
      /* F2 0F 51 = SQRTSD -- approx sqrt 64F0x2 from R/M to R */
      /* NOTE(review): sz stays 4 here because F2 is a mandatory prefix,
         not an operand-size override -- confirm against the callers. */
      if (haveF2no66noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_lo64( vbi, pfx, delta,
                                            "sqrtsd", Iop_Sqrt64F0x2 );
         goto decode_success;
      }
      /* 66 0F 51 = SQRTPD -- approx sqrt 64Fx2 from R/M to R */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
                                           "sqrtpd", Iop_Sqrt64Fx2 );
         goto decode_success;
      }
      break;
   12669 
   case 0x52:
      /* F3 0F 52 = RSQRTSS -- approx reciprocal sqrt 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
                                            "rsqrtss", Iop_RSqrt32F0x4 );
         goto decode_success;
      }
      /* 0F 52 = RSQRTPS -- approx reciprocal sqrt 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
                                           "rsqrtps", Iop_RSqrt32Fx4 );
         goto decode_success;
      }
      /* No 66/F2 forms exist for 0F 52. */
      break;
   12684 
   case 0x53:
      /* F3 0F 53 = RCPSS -- approx reciprocal 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
                                            "rcpss", Iop_Recip32F0x4 );
         goto decode_success;
      }
      /* 0F 53 = RCPPS -- approx reciprocal 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
                                           "rcpps", Iop_Recip32Fx4 );
         goto decode_success;
      }
      /* No 66/F2 forms exist for 0F 53. */
      break;
   12699 
   case 0x54:
      /* 0F 54 = ANDPS -- G = G and E */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "andps", Iop_AndV128 );
         goto decode_success;
      }
      /* 66 0F 54 = ANDPD -- G = G and E */
      /* Same 128-bit bitwise AND as ANDPS; only the mnemonic differs. */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "andpd", Iop_AndV128 );
         goto decode_success;
      }
      break;
   12712 
   case 0x55:
      /* 0F 55 = ANDNPS -- G = (not G) and E */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "andnps",
                                                           Iop_AndV128 );
         goto decode_success;
      }
      /* 66 0F 55 = ANDNPD -- G = (not G) and E */
      /* The _invG helper supplies the "not G" part of AND-NOT. */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "andnpd",
                                                           Iop_AndV128 );
         goto decode_success;
      }
      break;
   12727 
   case 0x56:
      /* 0F 56 = ORPS -- G = G or E */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "orps", Iop_OrV128 );
         goto decode_success;
      }
      /* 66 0F 56 = ORPD -- G = G or E */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "orpd", Iop_OrV128 );
         goto decode_success;
      }
      break;
   12740 
   case 0x57:
      /* 66 0F 57 = XORPD -- G = G xor E */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "xorpd", Iop_XorV128 );
         goto decode_success;
      }
      /* 0F 57 = XORPS -- G = G xor E */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "xorps", Iop_XorV128 );
         goto decode_success;
      }
      break;
   12753 
   case 0x58:
      /* 0F 58 = ADDPS -- add 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "addps", Iop_Add32Fx4 );
         goto decode_success;
      }
      /* F3 0F 58 = ADDSS -- add 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "addss", Iop_Add32F0x4 );
         goto decode_success;
      }
      /* F2 0F 58 = ADDSD -- add 64F0x2 from R/M to R */
      if (haveF2no66noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "addsd", Iop_Add64F0x2 );
         goto decode_success;
      }
      /* 66 0F 58 = ADDPD -- add 64Fx2 from R/M to R */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "addpd", Iop_Add64Fx2 );
         goto decode_success;
      }
      break;
   12778 
   case 0x59:
      /* F2 0F 59 = MULSD -- mul 64F0x2 from R/M to R */
      if (haveF2no66noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "mulsd", Iop_Mul64F0x2 );
         goto decode_success;
      }
      /* F3 0F 59 = MULSS -- mul 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "mulss", Iop_Mul32F0x4 );
         goto decode_success;
      }
      /* 0F 59 = MULPS -- mul 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "mulps", Iop_Mul32Fx4 );
         goto decode_success;
      }
      /* 66 0F 59 = MULPD -- mul 64Fx2 from R/M to R */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "mulpd", Iop_Mul64Fx2 );
         goto decode_success;
      }
      break;
   12803 
   case 0x5A:
      /* 0F 5A = CVTPS2PD -- convert 2 x F32 in low half mem/xmm to 2 x
         F64 in xmm(G). */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_CVTPS2PD_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      /* F3 0F 5A = CVTSS2SD -- convert F32 in mem/low 1/4 xmm to F64 in
         low half xmm(G) */
      if (haveF3no66noF2(pfx) && sz == 4) {
         IRTemp f32lo = newTemp(Ity_F32);

         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta += 1;
            assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
            DIP("cvtss2sd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
            delta += alen;
            DIP("cvtss2sd %s,%s\n", dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         }

         /* F32 -> F64 widening is exact; no rounding mode needed. */
         putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
                           unop( Iop_F32toF64, mkexpr(f32lo) ) );

         goto decode_success;
      }
      /* F2 0F 5A = CVTSD2SS -- convert F64 in mem/low half xmm to F32 in
         low 1/4 xmm(G), according to prevailing SSE rounding mode */
      if (haveF2no66noF3(pfx) && sz == 4) {
         IRTemp rmode = newTemp(Ity_I32);
         IRTemp f64lo = newTemp(Ity_F64);

         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta += 1;
            assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
            DIP("cvtsd2ss %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
            delta += alen;
            DIP("cvtsd2ss %s,%s\n", dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         }

         /* Narrowing F64 -> F32 can round, so the SSE rounding mode
            is applied here. */
         assign( rmode, get_sse_roundingmode() );
         putXMMRegLane32F(
            gregOfRexRM(pfx,modrm), 0,
            binop( Iop_F64toF32, mkexpr(rmode), mkexpr(f64lo) )
         );

         goto decode_success;
      }
      /* 66 0F 5A = CVTPD2PS -- convert 2 x F64 in mem/xmm to 2 x F32 in
         lo half xmm(G), rounding according to prevailing SSE rounding
         mode, and zero upper half */
      /* Note, this is practically identical to CVTPD2DQ.  It would have
         be nice to merge them together. */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_CVTPD2PS_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;
   12873 
   case 0x5B:
      /* F3 0F 5B = CVTTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
         xmm(G), rounding towards zero */
      /* 66 0F 5B = CVTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
         xmm(G), as per the prevailing rounding mode */
      if ( (have66noF2noF3(pfx) && sz == 2)
           || (haveF3no66noF2(pfx) && sz == 4) ) {
         /* sz == 4 can only be reached here via the F3-prefixed
            (CVTTPS2DQ) arm of the guard, hence round-to-zero; the
            66-prefixed form uses the prevailing SSE rounding mode. */
         Bool r2zero = toBool(sz == 4); // FIXME -- unreliable (???)
         delta = dis_CVTxPS2DQ_128( vbi, pfx, delta, False/*!isAvx*/, r2zero );
         goto decode_success;
      }
      /* 0F 5B = CVTDQ2PS -- convert 4 x I32 in mem/xmm to 4 x F32 in
         xmm(G) */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_CVTDQ2PS_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;
   12892 
   /* 0x5C .. 0x5F: packed/scalar FP sub, min, div, max.  Each opcode
      dispatches on the prefix: F3 = scalar single (lo32), F2 = scalar
      double (lo64), no prefix = packed single, 66 = packed double. */
   case 0x5C:
      /* F3 0F 5C = SUBSS -- sub 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "subss", Iop_Sub32F0x4 );
         goto decode_success;
      }
      /* F2 0F 5C = SUBSD -- sub 64F0x2 from R/M to R */
      if (haveF2no66noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "subsd", Iop_Sub64F0x2 );
         goto decode_success;
      }
      /* 0F 5C = SUBPS -- sub 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "subps", Iop_Sub32Fx4 );
         goto decode_success;
      }
      /* 66 0F 5C = SUBPD -- sub 64Fx2 from R/M to R */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "subpd", Iop_Sub64Fx2 );
         goto decode_success;
      }
      break;

   case 0x5D:
      /* 0F 5D = MINPS -- min 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "minps", Iop_Min32Fx4 );
         goto decode_success;
      }
      /* F3 0F 5D = MINSS -- min 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "minss", Iop_Min32F0x4 );
         goto decode_success;
      }
      /* F2 0F 5D = MINSD -- min 64F0x2 from R/M to R */
      /* NOTE(review): unlike SUBSD above, this (and DIVSD/MAXSD below)
         does not also accept sz == 8 for a redundant REX.W -- confirm
         whether that asymmetry is intentional. */
      if (haveF2no66noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "minsd", Iop_Min64F0x2 );
         goto decode_success;
      }
      /* 66 0F 5D = MINPD -- min 64Fx2 from R/M to R */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "minpd", Iop_Min64Fx2 );
         goto decode_success;
      }
      break;

   case 0x5E:
      /* F2 0F 5E = DIVSD -- div 64F0x2 from R/M to R */
      if (haveF2no66noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "divsd", Iop_Div64F0x2 );
         goto decode_success;
      }
      /* 0F 5E = DIVPS -- div 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "divps", Iop_Div32Fx4 );
         goto decode_success;
      }
      /* F3 0F 5E = DIVSS -- div 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "divss", Iop_Div32F0x4 );
         goto decode_success;
      }
      /* 66 0F 5E = DIVPD -- div 64Fx2 from R/M to R */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "divpd", Iop_Div64Fx2 );
         goto decode_success;
      }
      break;

   case 0x5F:
      /* 0F 5F = MAXPS -- max 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "maxps", Iop_Max32Fx4 );
         goto decode_success;
      }
      /* F3 0F 5F = MAXSS -- max 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "maxss", Iop_Max32F0x4 );
         goto decode_success;
      }
      /* F2 0F 5F = MAXSD -- max 64F0x2 from R/M to R */
      if (haveF2no66noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "maxsd", Iop_Max64F0x2 );
         goto decode_success;
      }
      /* 66 0F 5F = MAXPD -- max 64Fx2 from R/M to R */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "maxpd", Iop_Max64Fx2 );
         goto decode_success;
      }
      break;
   12985 
   /* 0x60 .. 0x6D: SSE2 integer interleave / pack / compare group.
      All are 66-0F-prefixed and handled uniformly by
      dis_SSEint_E_to_G.  NOTE(review): the final Bool argument is
      True for the interleave/pack ops and False for the compares;
      presumably it selects operand ordering -- confirm against the
      definition of dis_SSEint_E_to_G. */
   case 0x60:
      /* 66 0F 60 = PUNPCKLBW */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpcklbw",
                                    Iop_InterleaveLO8x16, True );
         goto decode_success;
      }
      break;

   case 0x61:
      /* 66 0F 61 = PUNPCKLWD */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpcklwd",
                                    Iop_InterleaveLO16x8, True );
         goto decode_success;
      }
      break;

   case 0x62:
      /* 66 0F 62 = PUNPCKLDQ */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpckldq",
                                    Iop_InterleaveLO32x4, True );
         goto decode_success;
      }
      break;

   case 0x63:
      /* 66 0F 63 = PACKSSWB -- signed 16->8 saturating narrow */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "packsswb",
                                    Iop_QNarrowBin16Sto8Sx16, True );
         goto decode_success;
      }
      break;

   case 0x64:
      /* 66 0F 64 = PCMPGTB */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpgtb", Iop_CmpGT8Sx16, False );
         goto decode_success;
      }
      break;

   case 0x65:
      /* 66 0F 65 = PCMPGTW */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpgtw", Iop_CmpGT16Sx8, False );
         goto decode_success;
      }
      break;

   case 0x66:
      /* 66 0F 66 = PCMPGTD */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpgtd", Iop_CmpGT32Sx4, False );
         goto decode_success;
      }
      break;

   case 0x67:
      /* 66 0F 67 = PACKUSWB -- signed 16 -> unsigned 8 saturating narrow */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "packuswb",
                                    Iop_QNarrowBin16Sto8Ux16, True );
         goto decode_success;
      }
      break;

   case 0x68:
      /* 66 0F 68 = PUNPCKHBW */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpckhbw",
                                    Iop_InterleaveHI8x16, True );
         goto decode_success;
      }
      break;

   case 0x69:
      /* 66 0F 69 = PUNPCKHWD */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpckhwd",
                                    Iop_InterleaveHI16x8, True );
         goto decode_success;
      }
      break;

   case 0x6A:
      /* 66 0F 6A = PUNPCKHDQ */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpckhdq",
                                    Iop_InterleaveHI32x4, True );
         goto decode_success;
      }
      break;

   case 0x6B:
      /* 66 0F 6B = PACKSSDW -- signed 32->16 saturating narrow */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "packssdw",
                                    Iop_QNarrowBin32Sto16Sx8, True );
         goto decode_success;
      }
      break;

   case 0x6C:
      /* 66 0F 6C = PUNPCKLQDQ */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpcklqdq",
                                    Iop_InterleaveLO64x2, True );
         goto decode_success;
      }
      break;

   case 0x6D:
      /* 66 0F 6D = PUNPCKHQDQ */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "punpckhqdq",
                                    Iop_InterleaveHI64x2, True );
         goto decode_success;
      }
      break;
   13122 
   case 0x6E:
      /* 66 0F 6E = MOVD from ireg32/m32 to xmm lo 1/4,
                    zeroing high 3/4 of xmm. */
      /*              or from ireg64/m64 to xmm lo 1/2,
                    zeroing high 1/2 of xmm. */
      if (have66noF2noF3(pfx)) {
         /* sz == 2 means 66 prefix without REX.W -> 32-bit MOVD;
            sz == 8 means REX.W present -> 64-bit MOVQ form.
            Normalise sz to 4 for the 32-bit case below. */
         vassert(sz == 2 || sz == 8);
         if (sz == 2) sz = 4;
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta += 1;
            if (sz == 4) {
               /* Iop_32UtoV128 zero-extends the 32-bit value into a
                  full 128-bit vector, zeroing the upper lanes. */
               putXMMReg(
                  gregOfRexRM(pfx,modrm),
                  unop( Iop_32UtoV128, getIReg32(eregOfRexRM(pfx,modrm)) )
               );
               DIP("movd %s, %s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
            } else {
               putXMMReg(
                  gregOfRexRM(pfx,modrm),
                  unop( Iop_64UtoV128, getIReg64(eregOfRexRM(pfx,modrm)) )
               );
               DIP("movq %s, %s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
            }
         } else {
            /* Memory source: load 32 or 64 bits and zero-extend. */
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            putXMMReg(
               gregOfRexRM(pfx,modrm),
               sz == 4
                  ?  unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)) )
                  :  unop( Iop_64UtoV128,loadLE(Ity_I64, mkexpr(addr)) )
            );
            DIP("mov%c %s, %s\n", sz == 4 ? 'd' : 'q', dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
         }
         goto decode_success;
      }
      break;
   13164 
   case 0x6F:
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         /* 66 0F 6F = MOVDQA -- move from E (mem or xmm) to G (xmm). */
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            putXMMReg( gregOfRexRM(pfx,modrm),
                       getXMMReg( eregOfRexRM(pfx,modrm) ));
            DIP("movdqa %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            /* MOVDQA faults on misaligned addresses; model that by
               raising SEGV unless the address is 16-aligned. */
            gen_SEGV_if_not_16_aligned( addr );
            putXMMReg( gregOfRexRM(pfx,modrm),
                       loadLE(Ity_V128, mkexpr(addr)) );
            DIP("movdqa %s,%s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += alen;
         }
         goto decode_success;
      }
      if (haveF3no66noF2(pfx) && sz == 4) {
         /* F3 0F 6F = MOVDQU -- move from E (mem or xmm) to G (xmm).
            Unaligned form: no alignment check, unlike MOVDQA above. */
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            putXMMReg( gregOfRexRM(pfx,modrm),
                       getXMMReg( eregOfRexRM(pfx,modrm) ));
            DIP("movdqu %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            putXMMReg( gregOfRexRM(pfx,modrm),
                       loadLE(Ity_V128, mkexpr(addr)) );
            DIP("movdqu %s,%s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += alen;
         }
         goto decode_success;
      }
      break;
   13207 
   case 0x70:
      /* 66 0F 70 = PSHUFD -- rearrange 4x32 from E(xmm or mem) to G(xmm) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PSHUFD_32x4( vbi, pfx, delta, False/*!writesYmm*/);
         goto decode_success;
      }
      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
      /* 0F 70 = PSHUFW -- rearrange 4x16 from E(mmx or mem) to G(mmx) */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         Int order;
         IRTemp sV, dV, s3, s2, s1, s0;
         s3 = s2 = s1 = s0 = IRTemp_INVALID;
         sV = newTemp(Ity_I64);
         dV = newTemp(Ity_I64);
         do_MMX_preamble();
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
            /* The shuffle-order immediate follows the modrm byte. */
            order = (Int)getUChar(delta+1);
            delta += 1+1;
            DIP("pshufw $%d,%s,%s\n", order,
                                      nameMMXReg(eregLO3ofRM(modrm)),
                                      nameMMXReg(gregLO3ofRM(modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
                              1/*extra byte after amode*/ );
            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
            /* The immediate sits one byte past the amode. */
            order = (Int)getUChar(delta+alen);
            delta += 1+alen;
            DIP("pshufw $%d,%s,%s\n", order,
                                      dis_buf,
                                      nameMMXReg(gregLO3ofRM(modrm)));
         }
         /* Split the source into four 16-bit lanes; each 2-bit field
            of 'order' selects the source lane for the corresponding
            destination lane (bits 7:6 -> lane 3 ... bits 1:0 -> lane 0). */
         breakup64to16s( sV, &s3, &s2, &s1, &s0 );
#        define SEL(n) \
                   ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
         assign(dV,
                mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
                             SEL((order>>2)&3), SEL((order>>0)&3) )
         );
         putMMXReg(gregLO3ofRM(modrm), mkexpr(dV));
#        undef SEL
         goto decode_success;
      }
      /* F2 0F 70 = PSHUFLW -- rearrange lower half 4x16 from E(xmm or
         mem) to G(xmm), and copy upper half */
      if (haveF2no66noF3(pfx) && sz == 4) {
         delta = dis_PSHUFxW_128( vbi, pfx, delta,
                                  False/*!isAvx*/, False/*!xIsH*/ );
         goto decode_success;
      }
      /* F3 0F 70 = PSHUFHW -- rearrange upper half 4x16 from E(xmm or
         mem) to G(xmm), and copy lower half */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_PSHUFxW_128( vbi, pfx, delta,
                                  False/*!isAvx*/, True/*xIsH*/ );
         goto decode_success;
      }
      break;
   13267 
   /* 0x71 .. 0x73: SSE2 shift-by-immediate group.  The reg field of
      the modrm byte (gregLO3ofRM) selects the /n sub-opcode; only the
      register forms are decoded -- memory operands fall through to
      decode failure via 'break'. */
   case 0x71:
      /* 66 0F 71 /2 ib = PSRLW by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 2) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "psrlw", Iop_ShrN16x8 );
         goto decode_success;
      }
      /* 66 0F 71 /4 ib = PSRAW by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 4) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "psraw", Iop_SarN16x8 );
         goto decode_success;
      }
      /* 66 0F 71 /6 ib = PSLLW by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 6) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "psllw", Iop_ShlN16x8 );
         goto decode_success;
      }
      break;

   case 0x72:
      /* 66 0F 72 /2 ib = PSRLD by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 2) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "psrld", Iop_ShrN32x4 );
         goto decode_success;
      }
      /* 66 0F 72 /4 ib = PSRAD by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 4) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "psrad", Iop_SarN32x4 );
         goto decode_success;
      }
      /* 66 0F 72 /6 ib = PSLLD by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 6) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "pslld", Iop_ShlN32x4 );
         goto decode_success;
      }
      break;

   case 0x73:
      /* 66 0F 73 /3 ib = PSRLDQ by immediate -- whole-register byte shift */
      /* note, if mem case ever filled in, 1 byte after amode */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 3) {
         Int imm = (Int)getUChar(delta+1);
         Int reg = eregOfRexRM(pfx,getUChar(delta));
         DIP("psrldq $%d,%s\n", imm, nameXMMReg(reg));
         delta += 2;
         IRTemp sV = newTemp(Ity_V128);
         assign( sV, getXMMReg(reg) );
         putXMMReg(reg, mkexpr(math_PSRLDQ( sV, imm )));
         goto decode_success;
      }
      /* 66 0F 73 /7 ib = PSLLDQ by immediate -- whole-register byte shift */
      /* note, if mem case ever filled in, 1 byte after amode */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 7) {
         Int imm = (Int)getUChar(delta+1);
         Int reg = eregOfRexRM(pfx,getUChar(delta));
         DIP("pslldq $%d,%s\n", imm, nameXMMReg(reg));
         /* imm comes from getUChar, so this vassert is trivially true;
            kept for symmetry with math_PSLLDQ's expectations. */
         vassert(imm >= 0 && imm <= 255);
         delta += 2;
         IRTemp sV = newTemp(Ity_V128);
         assign( sV, getXMMReg(reg) );
         putXMMReg(reg, mkexpr(math_PSLLDQ( sV, imm )));
         goto decode_success;
      }
      /* 66 0F 73 /2 ib = PSRLQ by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 2) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "psrlq", Iop_ShrN64x2 );
         goto decode_success;
      }
      /* 66 0F 73 /6 ib = PSLLQ by immediate */
      if (have66noF2noF3(pfx) && sz == 2
          && epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) == 6) {
         delta = dis_SSE_shiftE_imm( pfx, delta, "psllq", Iop_ShlN64x2 );
         goto decode_success;
      }
      break;
   13361 
   /* 0x74 .. 0x76: PCMPEQ{B,W,D} -- lane-wise equality compares,
      handled uniformly by dis_SSEint_E_to_G (final False: operand
      order is irrelevant for a symmetric compare). */
   case 0x74:
      /* 66 0F 74 = PCMPEQB */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpeqb", Iop_CmpEQ8x16, False );
         goto decode_success;
      }
      break;

   case 0x75:
      /* 66 0F 75 = PCMPEQW */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpeqw", Iop_CmpEQ16x8, False );
         goto decode_success;
      }
      break;

   case 0x76:
      /* 66 0F 76 = PCMPEQD */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpeqd", Iop_CmpEQ32x4, False );
         goto decode_success;
      }
      break;
   13388 
   13389    case 0x7E:
   13390       /* F3 0F 7E = MOVQ -- move 64 bits from E (mem or lo half xmm) to
   13391          G (lo half xmm).  Upper half of G is zeroed out. */
   13392       if (haveF3no66noF2(pfx)
   13393           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   13394          modrm = getUChar(delta);
   13395          if (epartIsReg(modrm)) {
   13396             putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
   13397                              getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
   13398                /* zero bits 127:64 */
   13399                putXMMRegLane64( gregOfRexRM(pfx,modrm), 1, mkU64(0) );
   13400             DIP("movsd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   13401                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   13402             delta += 1;
   13403          } else {
   13404             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13405             putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
   13406             putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
   13407                              loadLE(Ity_I64, mkexpr(addr)) );
   13408             DIP("movsd %s,%s\n", dis_buf,
   13409                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   13410             delta += alen;
   13411          }
   13412          goto decode_success;
   13413       }
   13414       /* 66 0F 7E = MOVD from xmm low 1/4 to ireg32 or m32. */
   13415       /*              or from xmm low 1/2 to ireg64 or m64. */
   13416          if (have66noF2noF3(pfx) && (sz == 2 || sz == 8)) {
   13417          if (sz == 2) sz = 4;
   13418          modrm = getUChar(delta);
   13419          if (epartIsReg(modrm)) {
   13420             delta += 1;
   13421             if (sz == 4) {
   13422                putIReg32( eregOfRexRM(pfx,modrm),
   13423                           getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) );
   13424                DIP("movd %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   13425                                     nameIReg32(eregOfRexRM(pfx,modrm)));
   13426             } else {
   13427                putIReg64( eregOfRexRM(pfx,modrm),
   13428                           getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) );
   13429                DIP("movq %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   13430                                     nameIReg64(eregOfRexRM(pfx,modrm)));
   13431             }
   13432          } else {
   13433             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   13434             delta += alen;
   13435             storeLE( mkexpr(addr),
   13436                      sz == 4
   13437                         ? getXMMRegLane32(gregOfRexRM(pfx,modrm),0)
   13438                         : getXMMRegLane64(gregOfRexRM(pfx,modrm),0) );
   13439             DIP("mov%c %s, %s\n", sz == 4 ? 'd' : 'q',
   13440                                   nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
   13441          }
   13442          goto decode_success;
   13443       }
   13444       break;
   13445 
   13446    case 0x7F:
   13447       /* F3 0F 7F = MOVDQU -- move from G (xmm) to E (mem or xmm). */
   13448       if (haveF3no66noF2(pfx) && sz == 4) {
   13449          modrm = getUChar(delta);
   13450          if (epartIsReg(modrm)) {
   13451             goto decode_failure; /* awaiting test case */
   13452             delta += 1;
   13453             putXMMReg( eregOfRexRM(pfx,modrm),
   13454                        getXMMReg(gregOfRexRM(pfx,modrm)) );
   13455             DIP("movdqu %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   13456                                    nameXMMReg(eregOfRexRM(pfx,modrm)));
   13457          } else {
   13458             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   13459             delta += alen;
   13460             storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   13461             DIP("movdqu %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
   13462          }
   13463          goto decode_success;
   13464       }
   13465       /* 66 0F 7F = MOVDQA -- move from G (xmm) to E (mem or xmm). */
   13466       if (have66noF2noF3(pfx) && sz == 2) {
   13467          modrm = getUChar(delta);
   13468          if (epartIsReg(modrm)) {
   13469             delta += 1;
   13470             putXMMReg( eregOfRexRM(pfx,modrm),
   13471                        getXMMReg(gregOfRexRM(pfx,modrm)) );
   13472             DIP("movdqa %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   13473                                    nameXMMReg(eregOfRexRM(pfx,modrm)));
   13474          } else {
   13475             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   13476             gen_SEGV_if_not_16_aligned( addr );
   13477             delta += alen;
   13478             storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   13479             DIP("movdqa %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
   13480          }
   13481          goto decode_success;
   13482       }
   13483       break;
   13484 
   13485    case 0xAE:
   13486       /* 0F AE /7 = SFENCE -- flush pending operations to memory */
   13487       if (haveNo66noF2noF3(pfx)
   13488           && epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 7
   13489           && sz == 4) {
   13490          delta += 1;
   13491          /* Insert a memory fence.  It's sometimes important that these
   13492             are carried through to the generated code. */
   13493          stmt( IRStmt_MBE(Imbe_Fence) );
   13494          DIP("sfence\n");
   13495          goto decode_success;
   13496       }
   13497       /* mindless duplication follows .. */
   13498       /* 0F AE /5 = LFENCE -- flush pending operations to memory */
   13499       /* 0F AE /6 = MFENCE -- flush pending operations to memory */
   13500       if (haveNo66noF2noF3(pfx)
   13501           && epartIsReg(getUChar(delta))
   13502           && (gregLO3ofRM(getUChar(delta)) == 5
   13503               || gregLO3ofRM(getUChar(delta)) == 6)
   13504           && sz == 4) {
   13505          delta += 1;
   13506          /* Insert a memory fence.  It's sometimes important that these
   13507             are carried through to the generated code. */
   13508          stmt( IRStmt_MBE(Imbe_Fence) );
   13509          DIP("%sfence\n", gregLO3ofRM(getUChar(delta-1))==5 ? "l" : "m");
   13510          goto decode_success;
   13511       }
   13512 
   13513       /* 0F AE /7 = CLFLUSH -- flush cache line */
   13514       if (haveNo66noF2noF3(pfx)
   13515           && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 7
   13516           && sz == 4) {
   13517 
   13518          /* This is something of a hack.  We need to know the size of
   13519             the cache line containing addr.  Since we don't (easily),
   13520             assume 256 on the basis that no real cache would have a
   13521             line that big.  It's safe to invalidate more stuff than we
   13522             need, just inefficient. */
   13523          ULong lineszB = 256ULL;
   13524 
   13525          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13526          delta += alen;
   13527 
   13528          /* Round addr down to the start of the containing block. */
   13529          stmt( IRStmt_Put(
   13530                   OFFB_CMSTART,
   13531                   binop( Iop_And64,
   13532                          mkexpr(addr),
   13533                          mkU64( ~(lineszB-1) ))) );
   13534 
   13535          stmt( IRStmt_Put(OFFB_CMLEN, mkU64(lineszB) ) );
   13536 
   13537          jmp_lit(dres, Ijk_InvalICache, (Addr64)(guest_RIP_bbstart+delta));
   13538 
   13539          DIP("clflush %s\n", dis_buf);
   13540          goto decode_success;
   13541       }
   13542 
   13543       /* 0F AE /3 = STMXCSR m32 -- store %mxcsr */
   13544       if (haveNo66noF2noF3(pfx)
   13545           && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 3
   13546           && sz == 4) {
   13547          delta = dis_STMXCSR(vbi, pfx, delta, False/*!isAvx*/);
   13548          goto decode_success;
   13549       }
   13550       /* 0F AE /2 = LDMXCSR m32 -- load %mxcsr */
   13551       if (haveNo66noF2noF3(pfx)
   13552           && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 2
   13553           && sz == 4) {
   13554          delta = dis_LDMXCSR(vbi, pfx, delta, False/*!isAvx*/);
   13555          goto decode_success;
   13556       }
   13557       /* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory.
   13558          Note that the presence or absence of REX.W slightly affects the
   13559          written format: whether the saved FPU IP and DP pointers are 64
   13560          or 32 bits.  But the helper function we call simply writes zero
   13561          bits in the relevant fields (which are 64 bits regardless of
   13562          what REX.W is) and so it's good enough (iow, equally broken) in
   13563          both cases. */
   13564       if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
   13565           && !epartIsReg(getUChar(delta))
   13566           && gregOfRexRM(pfx,getUChar(delta)) == 0) {
   13567           IRDirty* d;
   13568          modrm = getUChar(delta);
   13569          vassert(!epartIsReg(modrm));
   13570 
   13571          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13572          delta += alen;
   13573          gen_SEGV_if_not_16_aligned(addr);
   13574 
   13575          DIP("%sfxsave %s\n", sz==8 ? "rex64/" : "", dis_buf);
   13576 
   13577          /* Uses dirty helper:
   13578                void amd64g_do_FXSAVE ( VexGuestAMD64State*, ULong ) */
   13579          d = unsafeIRDirty_0_N (
   13580                 0/*regparms*/,
   13581                 "amd64g_dirtyhelper_FXSAVE",
   13582                 &amd64g_dirtyhelper_FXSAVE,
   13583                 mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   13584              );
   13585 
   13586          /* declare we're writing memory */
   13587          d->mFx   = Ifx_Write;
   13588          d->mAddr = mkexpr(addr);
   13589          d->mSize = 464; /* according to recent Intel docs */
   13590 
   13591          /* declare we're reading guest state */
   13592          d->nFxState = 7;
   13593          vex_bzero(&d->fxState, sizeof(d->fxState));
   13594 
   13595          d->fxState[0].fx     = Ifx_Read;
   13596          d->fxState[0].offset = OFFB_FTOP;
   13597          d->fxState[0].size   = sizeof(UInt);
   13598 
   13599          d->fxState[1].fx     = Ifx_Read;
   13600          d->fxState[1].offset = OFFB_FPREGS;
   13601          d->fxState[1].size   = 8 * sizeof(ULong);
   13602 
   13603          d->fxState[2].fx     = Ifx_Read;
   13604          d->fxState[2].offset = OFFB_FPTAGS;
   13605          d->fxState[2].size   = 8 * sizeof(UChar);
   13606 
   13607          d->fxState[3].fx     = Ifx_Read;
   13608          d->fxState[3].offset = OFFB_FPROUND;
   13609          d->fxState[3].size   = sizeof(ULong);
   13610 
   13611          d->fxState[4].fx     = Ifx_Read;
   13612          d->fxState[4].offset = OFFB_FC3210;
   13613          d->fxState[4].size   = sizeof(ULong);
   13614 
   13615          d->fxState[5].fx     = Ifx_Read;
   13616          d->fxState[5].offset = OFFB_YMM0;
   13617          d->fxState[5].size   = sizeof(U128);
   13618          /* plus 15 more of the above, spaced out in YMM sized steps */
   13619          d->fxState[5].nRepeats  = 15;
   13620          d->fxState[5].repeatLen = sizeof(U256);
   13621 
   13622          d->fxState[6].fx     = Ifx_Read;
   13623          d->fxState[6].offset = OFFB_SSEROUND;
   13624          d->fxState[6].size   = sizeof(ULong);
   13625 
   13626          /* Be paranoid ... this assertion tries to ensure the 16 %ymm
   13627             images are packed back-to-back.  If not, the settings for
   13628             d->fxState[5] are wrong. */
   13629          vassert(32 == sizeof(U256));
   13630          vassert(OFFB_YMM15 == (OFFB_YMM0 + 15 * 32));
   13631 
   13632          stmt( IRStmt_Dirty(d) );
   13633 
   13634          goto decode_success;
   13635       }
   13636       /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory.
   13637          As with FXSAVE above we ignore the value of REX.W since we're
   13638          not bothering with the FPU DP and IP fields. */
   13639       if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
   13640           && !epartIsReg(getUChar(delta))
   13641           && gregOfRexRM(pfx,getUChar(delta)) == 1) {
   13642          IRDirty* d;
   13643          modrm = getUChar(delta);
   13644          vassert(!epartIsReg(modrm));
   13645 
   13646          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13647          delta += alen;
   13648          gen_SEGV_if_not_16_aligned(addr);
   13649 
   13650          DIP("%sfxrstor %s\n", sz==8 ? "rex64/" : "", dis_buf);
   13651 
   13652          /* Uses dirty helper:
   13653                VexEmNote amd64g_do_FXRSTOR ( VexGuestAMD64State*, ULong )
   13654             NOTE:
   13655                the VexEmNote value is simply ignored
   13656          */
   13657          d = unsafeIRDirty_0_N (
   13658                 0/*regparms*/,
   13659                 "amd64g_dirtyhelper_FXRSTOR",
   13660                 &amd64g_dirtyhelper_FXRSTOR,
   13661                 mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   13662              );
   13663 
   13664          /* declare we're reading memory */
   13665          d->mFx   = Ifx_Read;
   13666          d->mAddr = mkexpr(addr);
   13667          d->mSize = 464; /* according to recent Intel docs */
   13668 
   13669          /* declare we're writing guest state */
   13670          d->nFxState = 7;
   13671          vex_bzero(&d->fxState, sizeof(d->fxState));
   13672 
   13673          d->fxState[0].fx     = Ifx_Write;
   13674          d->fxState[0].offset = OFFB_FTOP;
   13675          d->fxState[0].size   = sizeof(UInt);
   13676 
   13677          d->fxState[1].fx     = Ifx_Write;
   13678          d->fxState[1].offset = OFFB_FPREGS;
   13679          d->fxState[1].size   = 8 * sizeof(ULong);
   13680 
   13681          d->fxState[2].fx     = Ifx_Write;
   13682          d->fxState[2].offset = OFFB_FPTAGS;
   13683          d->fxState[2].size   = 8 * sizeof(UChar);
   13684 
   13685          d->fxState[3].fx     = Ifx_Write;
   13686          d->fxState[3].offset = OFFB_FPROUND;
   13687          d->fxState[3].size   = sizeof(ULong);
   13688 
   13689          d->fxState[4].fx     = Ifx_Write;
   13690          d->fxState[4].offset = OFFB_FC3210;
   13691          d->fxState[4].size   = sizeof(ULong);
   13692 
   13693          d->fxState[5].fx     = Ifx_Write;
   13694          d->fxState[5].offset = OFFB_YMM0;
   13695          d->fxState[5].size   = sizeof(U128);
   13696          /* plus 15 more of the above, spaced out in YMM sized steps */
   13697          d->fxState[5].nRepeats  = 15;
   13698          d->fxState[5].repeatLen = sizeof(U256);
   13699 
   13700          d->fxState[6].fx     = Ifx_Write;
   13701          d->fxState[6].offset = OFFB_SSEROUND;
   13702          d->fxState[6].size   = sizeof(ULong);
   13703 
   13704          /* Be paranoid ... this assertion tries to ensure the 16 %ymm
   13705             images are packed back-to-back.  If not, the settings for
   13706             d->fxState[5] are wrong. */
   13707          vassert(32 == sizeof(U256));
   13708          vassert(OFFB_YMM15 == (OFFB_YMM0 + 15 * 32));
   13709 
   13710          stmt( IRStmt_Dirty(d) );
   13711 
   13712          goto decode_success;
   13713       }
   13714       break;
   13715 
   13716    case 0xC2:
   13717       /* 0F C2 = CMPPS -- 32Fx4 comparison from R/M to R */
   13718       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13719          Long delta0 = delta;
   13720          delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpps", True, 4 );
   13721          if (delta > delta0) goto decode_success;
   13722       }
   13723       /* F3 0F C2 = CMPSS -- 32F0x4 comparison from R/M to R */
   13724       if (haveF3no66noF2(pfx) && sz == 4) {
   13725          Long delta0 = delta;
   13726          delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpss", False, 4 );
   13727          if (delta > delta0) goto decode_success;
   13728       }
   13729       /* F2 0F C2 = CMPSD -- 64F0x2 comparison from R/M to R */
   13730       if (haveF2no66noF3(pfx) && sz == 4) {
   13731          Long delta0 = delta;
   13732          delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpsd", False, 8 );
   13733          if (delta > delta0) goto decode_success;
   13734       }
   13735       /* 66 0F C2 = CMPPD -- 64Fx2 comparison from R/M to R */
   13736       if (have66noF2noF3(pfx) && sz == 2) {
   13737          Long delta0 = delta;
   13738          delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmppd", True, 8 );
   13739          if (delta > delta0) goto decode_success;
   13740       }
   13741       break;
   13742 
   13743    case 0xC3:
   13744       /* 0F C3 = MOVNTI -- for us, just a plain ireg store. */
   13745       if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)) {
   13746          modrm = getUChar(delta);
   13747          if (!epartIsReg(modrm)) {
   13748             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13749             storeLE( mkexpr(addr), getIRegG(sz, pfx, modrm) );
   13750             DIP("movnti %s,%s\n", dis_buf,
   13751                                   nameIRegG(sz, pfx, modrm));
   13752             delta += alen;
   13753             goto decode_success;
   13754          }
   13755          /* else fall through */
   13756       }
   13757       break;
   13758 
   13759    case 0xC4:
   13760       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   13761       /* 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
   13762          put it into the specified lane of mmx(G). */
   13763       if (haveNo66noF2noF3(pfx)
   13764           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   13765          /* Use t0 .. t3 to hold the 4 original 16-bit lanes of the
   13766             mmx reg.  t4 is the new lane value.  t5 is the original
   13767             mmx value. t6 is the new mmx value. */
   13768          Int lane;
   13769          t4 = newTemp(Ity_I16);
   13770          t5 = newTemp(Ity_I64);
   13771          t6 = newTemp(Ity_I64);
   13772          modrm = getUChar(delta);
   13773          do_MMX_preamble();
   13774 
   13775          assign(t5, getMMXReg(gregLO3ofRM(modrm)));
   13776          breakup64to16s( t5, &t3, &t2, &t1, &t0 );
   13777 
   13778          if (epartIsReg(modrm)) {
   13779             assign(t4, getIReg16(eregOfRexRM(pfx,modrm)));
   13780             delta += 1+1;
   13781             lane = getUChar(delta-1);
   13782             DIP("pinsrw $%d,%s,%s\n", (Int)lane,
   13783                                       nameIReg16(eregOfRexRM(pfx,modrm)),
   13784                                       nameMMXReg(gregLO3ofRM(modrm)));
   13785          } else {
   13786             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
   13787             delta += 1+alen;
   13788             lane = getUChar(delta-1);
   13789             assign(t4, loadLE(Ity_I16, mkexpr(addr)));
   13790             DIP("pinsrw $%d,%s,%s\n", (Int)lane,
   13791                                       dis_buf,
   13792                                       nameMMXReg(gregLO3ofRM(modrm)));
   13793          }
   13794 
   13795          switch (lane & 3) {
   13796             case 0:  assign(t6, mk64from16s(t3,t2,t1,t4)); break;
   13797             case 1:  assign(t6, mk64from16s(t3,t2,t4,t0)); break;
   13798             case 2:  assign(t6, mk64from16s(t3,t4,t1,t0)); break;
   13799             case 3:  assign(t6, mk64from16s(t4,t2,t1,t0)); break;
   13800             default: vassert(0);
   13801          }
   13802          putMMXReg(gregLO3ofRM(modrm), mkexpr(t6));
   13803          goto decode_success;
   13804       }
   13805       /* 66 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
   13806          put it into the specified lane of xmm(G). */
   13807       if (have66noF2noF3(pfx)
   13808           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   13809          Int lane;
   13810          t4 = newTemp(Ity_I16);
   13811          modrm = getUChar(delta);
   13812          UInt rG = gregOfRexRM(pfx,modrm);
   13813          if (epartIsReg(modrm)) {
   13814             UInt rE = eregOfRexRM(pfx,modrm);
   13815             assign(t4, getIReg16(rE));
   13816             delta += 1+1;
   13817             lane = getUChar(delta-1);
   13818             DIP("pinsrw $%d,%s,%s\n",
   13819                 (Int)lane, nameIReg16(rE), nameXMMReg(rG));
   13820          } else {
   13821             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
   13822                               1/*byte after the amode*/ );
   13823             delta += 1+alen;
   13824             lane = getUChar(delta-1);
   13825             assign(t4, loadLE(Ity_I16, mkexpr(addr)));
   13826             DIP("pinsrw $%d,%s,%s\n",
   13827                 (Int)lane, dis_buf, nameXMMReg(rG));
   13828          }
   13829          IRTemp src_vec = newTemp(Ity_V128);
   13830          assign(src_vec, getXMMReg(rG));
   13831          IRTemp res_vec = math_PINSRW_128( src_vec, t4, lane & 7);
   13832          putXMMReg(rG, mkexpr(res_vec));
   13833          goto decode_success;
   13834       }
   13835       break;
   13836 
   13837    case 0xC5:
   13838       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   13839       /* 0F C5 = PEXTRW -- extract 16-bit field from mmx(E) and put
   13840          zero-extend of it in ireg(G). */
   13841       if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)) {
   13842          modrm = getUChar(delta);
   13843          if (epartIsReg(modrm)) {
   13844             IRTemp sV = newTemp(Ity_I64);
   13845             t5 = newTemp(Ity_I16);
   13846             do_MMX_preamble();
   13847             assign(sV, getMMXReg(eregLO3ofRM(modrm)));
   13848             breakup64to16s( sV, &t3, &t2, &t1, &t0 );
   13849             switch (getUChar(delta+1) & 3) {
   13850                case 0:  assign(t5, mkexpr(t0)); break;
   13851                case 1:  assign(t5, mkexpr(t1)); break;
   13852                case 2:  assign(t5, mkexpr(t2)); break;
   13853                case 3:  assign(t5, mkexpr(t3)); break;
   13854                default: vassert(0);
   13855             }
   13856             if (sz == 8)
   13857                putIReg64(gregOfRexRM(pfx,modrm), unop(Iop_16Uto64, mkexpr(t5)));
   13858             else
   13859                putIReg32(gregOfRexRM(pfx,modrm), unop(Iop_16Uto32, mkexpr(t5)));
   13860             DIP("pextrw $%d,%s,%s\n",
   13861                 (Int)getUChar(delta+1),
   13862                 nameMMXReg(eregLO3ofRM(modrm)),
   13863                 sz==8 ? nameIReg64(gregOfRexRM(pfx,modrm))
   13864                       : nameIReg32(gregOfRexRM(pfx,modrm))
   13865             );
   13866             delta += 2;
   13867             goto decode_success;
   13868          }
   13869          /* else fall through */
   13870          /* note, for anyone filling in the mem case: this insn has one
   13871             byte after the amode and therefore you must pass 1 as the
   13872             last arg to disAMode */
   13873       }
   13874       /* 66 0F C5 = PEXTRW -- extract 16-bit field from xmm(E) and put
   13875          zero-extend of it in ireg(G). */
   13876       if (have66noF2noF3(pfx)
   13877           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   13878          Long delta0 = delta;
   13879          delta = dis_PEXTRW_128_EregOnly_toG( vbi, pfx, delta,
   13880                                               False/*!isAvx*/ );
   13881          if (delta > delta0) goto decode_success;
   13882          /* else fall through -- decoding has failed */
   13883       }
   13884       break;
   13885 
   13886    case 0xC6:
   13887       /* 0F C6 /r ib = SHUFPS -- shuffle packed F32s */
   13888       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13889          Int    imm8 = 0;
   13890          IRTemp sV   = newTemp(Ity_V128);
   13891          IRTemp dV   = newTemp(Ity_V128);
   13892          modrm = getUChar(delta);
   13893          UInt rG = gregOfRexRM(pfx,modrm);
   13894          assign( dV, getXMMReg(rG) );
   13895          if (epartIsReg(modrm)) {
   13896             UInt rE = eregOfRexRM(pfx,modrm);
   13897             assign( sV, getXMMReg(rE) );
   13898             imm8 = (Int)getUChar(delta+1);
   13899             delta += 1+1;
   13900             DIP("shufps $%d,%s,%s\n", imm8, nameXMMReg(rE), nameXMMReg(rG));
   13901          } else {
   13902             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
   13903             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   13904             imm8 = (Int)getUChar(delta+alen);
   13905             delta += 1+alen;
   13906             DIP("shufps $%d,%s,%s\n", imm8, dis_buf, nameXMMReg(rG));
   13907          }
   13908          IRTemp res = math_SHUFPS_128( sV, dV, imm8 );
   13909          putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
   13910          goto decode_success;
   13911       }
   13912       /* 66 0F C6 /r ib = SHUFPD -- shuffle packed F64s */
   13913       if (have66noF2noF3(pfx) && sz == 2) {
   13914          Int    select;
   13915          IRTemp sV = newTemp(Ity_V128);
   13916          IRTemp dV = newTemp(Ity_V128);
   13917 
   13918          modrm = getUChar(delta);
   13919          assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   13920 
   13921          if (epartIsReg(modrm)) {
   13922             assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   13923             select = (Int)getUChar(delta+1);
   13924             delta += 1+1;
   13925             DIP("shufpd $%d,%s,%s\n", select,
   13926                                       nameXMMReg(eregOfRexRM(pfx,modrm)),
   13927                                       nameXMMReg(gregOfRexRM(pfx,modrm)));
   13928          } else {
   13929             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
   13930             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   13931             select = getUChar(delta+alen);
   13932             delta += 1+alen;
   13933             DIP("shufpd $%d,%s,%s\n", select,
   13934                                       dis_buf,
   13935                                       nameXMMReg(gregOfRexRM(pfx,modrm)));
   13936          }
   13937 
   13938          IRTemp res = math_SHUFPD_128( sV, dV, select );
   13939          putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
   13940          goto decode_success;
   13941       }
   13942       break;
   13943 
   13944    case 0xD1:
   13945       /* 66 0F D1 = PSRLW by E */
   13946       if (have66noF2noF3(pfx) && sz == 2) {
   13947          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrlw", Iop_ShrN16x8 );
   13948          goto decode_success;
   13949       }
   13950       break;
   13951 
   13952    case 0xD2:
   13953       /* 66 0F D2 = PSRLD by E */
   13954       if (have66noF2noF3(pfx) && sz == 2) {
   13955          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrld", Iop_ShrN32x4 );
   13956          goto decode_success;
   13957       }
   13958       break;
   13959 
   13960    case 0xD3:
   13961       /* 66 0F D3 = PSRLQ by E */
   13962       if (have66noF2noF3(pfx) && sz == 2) {
   13963          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrlq", Iop_ShrN64x2 );
   13964          goto decode_success;
   13965       }
   13966       break;
   13967 
   13968    case 0xD4:
   13969       /* 66 0F D4 = PADDQ */
   13970       if (have66noF2noF3(pfx) && sz == 2) {
   13971          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13972                                     "paddq", Iop_Add64x2, False );
   13973          goto decode_success;
   13974       }
   13975       /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   13976       /* 0F D4 = PADDQ -- add 64x1 */
   13977       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13978          do_MMX_preamble();
   13979          delta = dis_MMXop_regmem_to_reg (
   13980                    vbi, pfx, delta, opc, "paddq", False );
   13981          goto decode_success;
   13982       }
   13983       break;
   13984 
   13985    case 0xD5:
   13986       /* 66 0F D5 = PMULLW -- 16x8 multiply */
   13987       if (have66noF2noF3(pfx) && sz == 2) {
   13988          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13989                                     "pmullw", Iop_Mul16x8, False );
   13990          goto decode_success;
   13991       }
   13992       break;
   13993 
   13994    case 0xD6:
   13995       /* F3 0F D6 = MOVQ2DQ -- move from E (mmx) to G (lo half xmm, zero
   13996          hi half). */
   13997       if (haveF3no66noF2(pfx) && sz == 4) {
   13998          modrm = getUChar(delta);
   13999          if (epartIsReg(modrm)) {
   14000             do_MMX_preamble();
   14001             putXMMReg( gregOfRexRM(pfx,modrm),
   14002                        unop(Iop_64UtoV128, getMMXReg( eregLO3ofRM(modrm) )) );
   14003             DIP("movq2dq %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
   14004                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
   14005             delta += 1;
   14006             goto decode_success;
   14007          }
   14008          /* apparently no mem case for this insn */
   14009       }
   14010       /* 66 0F D6 = MOVQ -- move 64 bits from G (lo half xmm) to E (mem
   14011          or lo half xmm).  */
   14012       if (have66noF2noF3(pfx)
   14013           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   14014          modrm = getUChar(delta);
   14015          if (epartIsReg(modrm)) {
   14016             /* fall through, awaiting test case */
   14017             /* dst: lo half copied, hi half zeroed */
   14018          } else {
   14019             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14020             storeLE( mkexpr(addr),
   14021                      getXMMRegLane64( gregOfRexRM(pfx,modrm), 0 ));
   14022             DIP("movq %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf );
   14023             delta += alen;
   14024             goto decode_success;
   14025          }
   14026       }
   14027       /* F2 0F D6 = MOVDQ2Q -- move from E (lo half xmm, not mem) to G (mmx). */
   14028       if (haveF2no66noF3(pfx) && sz == 4) {
   14029          modrm = getUChar(delta);
   14030          if (epartIsReg(modrm)) {
   14031             do_MMX_preamble();
   14032             putMMXReg( gregLO3ofRM(modrm),
   14033                        getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
   14034             DIP("movdq2q %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   14035                                    nameMMXReg(gregLO3ofRM(modrm)));
   14036             delta += 1;
   14037             goto decode_success;
   14038          }
   14039          /* apparently no mem case for this insn */
   14040       }
   14041       break;
   14042 
   14043    case 0xD7:
   14044       /* 66 0F D7 = PMOVMSKB -- extract sign bits from each of 16
   14045          lanes in xmm(E), turn them into a byte, and put
   14046          zero-extend of it in ireg(G).  Doing this directly is just
   14047          too cumbersome; give up therefore and call a helper. */
   14048       if (have66noF2noF3(pfx)
   14049           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
   14050           && epartIsReg(getUChar(delta))) { /* no memory case, it seems */
   14051          delta = dis_PMOVMSKB_128( vbi, pfx, delta, False/*!isAvx*/ );
   14052          goto decode_success;
   14053       }
   14054       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14055       /* 0F D7 = PMOVMSKB -- extract sign bits from each of 8 lanes in
   14056          mmx(E), turn them into a byte, and put zero-extend of it in
   14057          ireg(G). */
   14058       if (haveNo66noF2noF3(pfx)
   14059           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   14060          modrm = getUChar(delta);
   14061          if (epartIsReg(modrm)) {
   14062             do_MMX_preamble();
   14063             t0 = newTemp(Ity_I64);
   14064             t1 = newTemp(Ity_I32);
   14065             assign(t0, getMMXReg(eregLO3ofRM(modrm)));
   14066             assign(t1, unop(Iop_8Uto32, unop(Iop_GetMSBs8x8, mkexpr(t0))));
   14067             putIReg32(gregOfRexRM(pfx,modrm), mkexpr(t1));
   14068             DIP("pmovmskb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
   14069                                     nameIReg32(gregOfRexRM(pfx,modrm)));
   14070             delta += 1;
   14071             goto decode_success;
   14072          }
   14073          /* else fall through */
   14074       }
   14075       break;
   14076 
   14077    case 0xD8:
   14078       /* 66 0F D8 = PSUBUSB */
   14079       if (have66noF2noF3(pfx) && sz == 2) {
   14080          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14081                                     "psubusb", Iop_QSub8Ux16, False );
   14082          goto decode_success;
   14083       }
   14084       break;
   14085 
   14086    case 0xD9:
   14087       /* 66 0F D9 = PSUBUSW */
   14088       if (have66noF2noF3(pfx) && sz == 2) {
   14089          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14090                                     "psubusw", Iop_QSub16Ux8, False );
   14091          goto decode_success;
   14092       }
   14093       break;
   14094 
   14095    case 0xDA:
   14096       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14097       /* 0F DA = PMINUB -- 8x8 unsigned min */
   14098       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14099          do_MMX_preamble();
   14100          delta = dis_MMXop_regmem_to_reg (
   14101                     vbi, pfx, delta, opc, "pminub", False );
   14102          goto decode_success;
   14103       }
   14104       /* 66 0F DA = PMINUB -- 8x16 unsigned min */
   14105       if (have66noF2noF3(pfx) && sz == 2) {
   14106          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14107                                     "pminub", Iop_Min8Ux16, False );
   14108          goto decode_success;
   14109       }
   14110       break;
   14111 
   14112    case 0xDB:
   14113       /* 66 0F DB = PAND */
   14114       if (have66noF2noF3(pfx) && sz == 2) {
   14115          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "pand", Iop_AndV128 );
   14116          goto decode_success;
   14117       }
   14118       break;
   14119 
   14120    case 0xDC:
   14121       /* 66 0F DC = PADDUSB */
   14122       if (have66noF2noF3(pfx) && sz == 2) {
   14123          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14124                                     "paddusb", Iop_QAdd8Ux16, False );
   14125          goto decode_success;
   14126       }
   14127       break;
   14128 
   14129    case 0xDD:
   14130       /* 66 0F DD = PADDUSW */
   14131       if (have66noF2noF3(pfx) && sz == 2) {
   14132          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14133                                     "paddusw", Iop_QAdd16Ux8, False );
   14134          goto decode_success;
   14135       }
   14136       break;
   14137 
   14138    case 0xDE:
   14139       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14140       /* 0F DE = PMAXUB -- 8x8 unsigned max */
   14141       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14142          do_MMX_preamble();
   14143          delta = dis_MMXop_regmem_to_reg (
   14144                     vbi, pfx, delta, opc, "pmaxub", False );
   14145          goto decode_success;
   14146       }
   14147       /* 66 0F DE = PMAXUB -- 8x16 unsigned max */
   14148       if (have66noF2noF3(pfx) && sz == 2) {
   14149          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14150                                     "pmaxub", Iop_Max8Ux16, False );
   14151          goto decode_success;
   14152       }
   14153       break;
   14154 
   14155    case 0xDF:
   14156       /* 66 0F DF = PANDN */
   14157       if (have66noF2noF3(pfx) && sz == 2) {
   14158          delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "pandn", Iop_AndV128 );
   14159          goto decode_success;
   14160       }
   14161       break;
   14162 
   14163    case 0xE0:
   14164       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14165       /* 0F E0 = PAVGB -- 8x8 unsigned Packed Average, with rounding */
   14166       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14167          do_MMX_preamble();
   14168          delta = dis_MMXop_regmem_to_reg (
   14169                     vbi, pfx, delta, opc, "pavgb", False );
   14170          goto decode_success;
   14171       }
   14172       /* 66 0F E0 = PAVGB */
   14173       if (have66noF2noF3(pfx) && sz == 2) {
   14174          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14175                                     "pavgb", Iop_Avg8Ux16, False );
   14176          goto decode_success;
   14177       }
   14178       break;
   14179 
   14180    case 0xE1:
   14181       /* 66 0F E1 = PSRAW by E */
   14182       if (have66noF2noF3(pfx) && sz == 2) {
   14183          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psraw", Iop_SarN16x8 );
   14184          goto decode_success;
   14185       }
   14186       break;
   14187 
   14188    case 0xE2:
   14189       /* 66 0F E2 = PSRAD by E */
   14190       if (have66noF2noF3(pfx) && sz == 2) {
   14191          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrad", Iop_SarN32x4 );
   14192          goto decode_success;
   14193       }
   14194       break;
   14195 
   14196    case 0xE3:
   14197       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14198       /* 0F E3 = PAVGW -- 16x4 unsigned Packed Average, with rounding */
   14199       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14200          do_MMX_preamble();
   14201          delta = dis_MMXop_regmem_to_reg (
   14202                     vbi, pfx, delta, opc, "pavgw", False );
   14203          goto decode_success;
   14204       }
   14205       /* 66 0F E3 = PAVGW */
   14206       if (have66noF2noF3(pfx) && sz == 2) {
   14207          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14208                                     "pavgw", Iop_Avg16Ux8, False );
   14209          goto decode_success;
   14210       }
   14211       break;
   14212 
   14213    case 0xE4:
   14214       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14215       /* 0F E4 = PMULUH -- 16x4 hi-half of unsigned widening multiply */
   14216       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14217          do_MMX_preamble();
   14218          delta = dis_MMXop_regmem_to_reg (
   14219                     vbi, pfx, delta, opc, "pmuluh", False );
   14220          goto decode_success;
   14221       }
   14222       /* 66 0F E4 = PMULHUW -- 16x8 hi-half of unsigned widening multiply */
   14223       if (have66noF2noF3(pfx) && sz == 2) {
   14224          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14225                                     "pmulhuw", Iop_MulHi16Ux8, False );
   14226          goto decode_success;
   14227       }
   14228       break;
   14229 
   14230    case 0xE5:
   14231       /* 66 0F E5 = PMULHW -- 16x8 hi-half of signed widening multiply */
   14232       if (have66noF2noF3(pfx) && sz == 2) {
   14233          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14234                                     "pmulhw", Iop_MulHi16Sx8, False );
   14235          goto decode_success;
   14236       }
   14237       break;
   14238 
   14239    case 0xE6:
   14240       /* 66 0F E6 = CVTTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
   14241          lo half xmm(G), and zero upper half, rounding towards zero */
   14242       /* F2 0F E6 = CVTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
   14243          lo half xmm(G), according to prevailing rounding mode, and zero
   14244          upper half */
   14245       if ( (haveF2no66noF3(pfx) && sz == 4)
   14246            || (have66noF2noF3(pfx) && sz == 2) ) {
   14247          delta = dis_CVTxPD2DQ_128( vbi, pfx, delta, False/*!isAvx*/,
   14248                                     toBool(sz == 2)/*r2zero*/);
   14249          goto decode_success;
   14250       }
   14251       /* F3 0F E6 = CVTDQ2PD -- convert 2 x I32 in mem/lo half xmm to 2 x
   14252          F64 in xmm(G) */
   14253       if (haveF3no66noF2(pfx) && sz == 4) {
   14254          delta = dis_CVTDQ2PD_128(vbi, pfx, delta, False/*!isAvx*/);
   14255          goto decode_success;
   14256       }
   14257       break;
   14258 
   14259    case 0xE7:
   14260       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14261       /* 0F E7 = MOVNTQ -- for us, just a plain MMX store.  Note, the
   14262          Intel manual does not say anything about the usual business of
   14263          the FP reg tags getting trashed whenever an MMX insn happens.
   14264          So we just leave them alone.
   14265       */
   14266       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14267          modrm = getUChar(delta);
   14268          if (!epartIsReg(modrm)) {
   14269             /* do_MMX_preamble(); Intel docs don't specify this */
   14270             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14271             storeLE( mkexpr(addr), getMMXReg(gregLO3ofRM(modrm)) );
   14272             DIP("movntq %s,%s\n", dis_buf,
   14273                                   nameMMXReg(gregLO3ofRM(modrm)));
   14274             delta += alen;
   14275             goto decode_success;
   14276          }
   14277          /* else fall through */
   14278       }
   14279       /* 66 0F E7 = MOVNTDQ -- for us, just a plain SSE store. */
   14280       if (have66noF2noF3(pfx) && sz == 2) {
   14281          modrm = getUChar(delta);
   14282          if (!epartIsReg(modrm)) {
   14283             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14284             gen_SEGV_if_not_16_aligned( addr );
   14285             storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   14286             DIP("movntdq %s,%s\n", dis_buf,
   14287                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
   14288             delta += alen;
   14289             goto decode_success;
   14290          }
   14291          /* else fall through */
   14292       }
   14293       break;
   14294 
   14295    case 0xE8:
   14296       /* 66 0F E8 = PSUBSB */
   14297       if (have66noF2noF3(pfx) && sz == 2) {
   14298          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14299                                     "psubsb", Iop_QSub8Sx16, False );
   14300          goto decode_success;
   14301       }
   14302       break;
   14303 
   14304    case 0xE9:
   14305       /* 66 0F E9 = PSUBSW */
   14306       if (have66noF2noF3(pfx) && sz == 2) {
   14307          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14308                                     "psubsw", Iop_QSub16Sx8, False );
   14309          goto decode_success;
   14310       }
   14311       break;
   14312 
   14313    case 0xEA:
   14314       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14315       /* 0F EA = PMINSW -- 16x4 signed min */
   14316       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14317          do_MMX_preamble();
   14318          delta = dis_MMXop_regmem_to_reg (
   14319                     vbi, pfx, delta, opc, "pminsw", False );
   14320          goto decode_success;
   14321       }
   14322       /* 66 0F EA = PMINSW -- 16x8 signed min */
   14323       if (have66noF2noF3(pfx) && sz == 2) {
   14324          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14325                                     "pminsw", Iop_Min16Sx8, False );
   14326          goto decode_success;
   14327       }
   14328       break;
   14329 
   14330    case 0xEB:
   14331       /* 66 0F EB = POR */
   14332       if (have66noF2noF3(pfx) && sz == 2) {
   14333          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "por", Iop_OrV128 );
   14334          goto decode_success;
   14335       }
   14336       break;
   14337 
   14338    case 0xEC:
   14339       /* 66 0F EC = PADDSB */
   14340       if (have66noF2noF3(pfx) && sz == 2) {
   14341          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14342                                     "paddsb", Iop_QAdd8Sx16, False );
   14343          goto decode_success;
   14344       }
   14345       break;
   14346 
   14347    case 0xED:
   14348       /* 66 0F ED = PADDSW */
   14349       if (have66noF2noF3(pfx) && sz == 2) {
   14350          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14351                                     "paddsw", Iop_QAdd16Sx8, False );
   14352          goto decode_success;
   14353       }
   14354       break;
   14355 
   14356    case 0xEE:
   14357       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14358       /* 0F EE = PMAXSW -- 16x4 signed max */
   14359       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14360          do_MMX_preamble();
   14361          delta = dis_MMXop_regmem_to_reg (
   14362                     vbi, pfx, delta, opc, "pmaxsw", False );
   14363          goto decode_success;
   14364       }
   14365       /* 66 0F EE = PMAXSW -- 16x8 signed max */
   14366       if (have66noF2noF3(pfx) && sz == 2) {
   14367          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14368                                     "pmaxsw", Iop_Max16Sx8, False );
   14369          goto decode_success;
   14370       }
   14371       break;
   14372 
   14373    case 0xEF:
   14374       /* 66 0F EF = PXOR */
   14375       if (have66noF2noF3(pfx) && sz == 2) {
   14376          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "pxor", Iop_XorV128 );
   14377          goto decode_success;
   14378       }
   14379       break;
   14380 
   14381    case 0xF1:
   14382       /* 66 0F F1 = PSLLW by E */
   14383       if (have66noF2noF3(pfx) && sz == 2) {
   14384          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psllw", Iop_ShlN16x8 );
   14385          goto decode_success;
   14386       }
   14387       break;
   14388 
   14389    case 0xF2:
   14390       /* 66 0F F2 = PSLLD by E */
   14391       if (have66noF2noF3(pfx) && sz == 2) {
   14392          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "pslld", Iop_ShlN32x4 );
   14393          goto decode_success;
   14394       }
   14395       break;
   14396 
   14397    case 0xF3:
   14398       /* 66 0F F3 = PSLLQ by E */
   14399       if (have66noF2noF3(pfx) && sz == 2) {
   14400          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psllq", Iop_ShlN64x2 );
   14401          goto decode_success;
   14402       }
   14403       break;
   14404 
   14405    case 0xF4:
   14406       /* 66 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
   14407          0 to form lower 64-bit half and lanes 2 x 2 to form upper 64-bit
   14408          half */
   14409       if (have66noF2noF3(pfx) && sz == 2) {
   14410          IRTemp sV = newTemp(Ity_V128);
   14411          IRTemp dV = newTemp(Ity_V128);
   14412          modrm = getUChar(delta);
   14413          UInt rG = gregOfRexRM(pfx,modrm);
   14414          assign( dV, getXMMReg(rG) );
   14415          if (epartIsReg(modrm)) {
   14416             UInt rE = eregOfRexRM(pfx,modrm);
   14417             assign( sV, getXMMReg(rE) );
   14418             delta += 1;
   14419             DIP("pmuludq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   14420          } else {
   14421             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14422             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   14423             delta += alen;
   14424             DIP("pmuludq %s,%s\n", dis_buf, nameXMMReg(rG));
   14425          }
   14426          putXMMReg( rG, mkexpr(math_PMULUDQ_128( sV, dV )) );
   14427          goto decode_success;
   14428       }
   14429       /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   14430       /* 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
   14431          0 to form 64-bit result */
   14432       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14433          IRTemp sV = newTemp(Ity_I64);
   14434          IRTemp dV = newTemp(Ity_I64);
   14435          t1 = newTemp(Ity_I32);
   14436          t0 = newTemp(Ity_I32);
   14437          modrm = getUChar(delta);
   14438 
   14439          do_MMX_preamble();
   14440          assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   14441 
   14442          if (epartIsReg(modrm)) {
   14443             assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   14444             delta += 1;
   14445             DIP("pmuludq %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
   14446                                    nameMMXReg(gregLO3ofRM(modrm)));
   14447          } else {
   14448             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14449             assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   14450             delta += alen;
   14451             DIP("pmuludq %s,%s\n", dis_buf,
   14452                                    nameMMXReg(gregLO3ofRM(modrm)));
   14453          }
   14454 
   14455          assign( t0, unop(Iop_64to32, mkexpr(dV)) );
   14456          assign( t1, unop(Iop_64to32, mkexpr(sV)) );
   14457          putMMXReg( gregLO3ofRM(modrm),
   14458                     binop( Iop_MullU32, mkexpr(t0), mkexpr(t1) ) );
   14459          goto decode_success;
   14460       }
   14461       break;
   14462 
   14463    case 0xF5:
   14464       /* 66 0F F5 = PMADDWD -- Multiply and add packed integers from
   14465          E(xmm or mem) to G(xmm) */
   14466       if (have66noF2noF3(pfx) && sz == 2) {
   14467          IRTemp sV = newTemp(Ity_V128);
   14468          IRTemp dV = newTemp(Ity_V128);
   14469          modrm     = getUChar(delta);
   14470          UInt   rG = gregOfRexRM(pfx,modrm);
   14471          if (epartIsReg(modrm)) {
   14472             UInt rE = eregOfRexRM(pfx,modrm);
   14473             assign( sV, getXMMReg(rE) );
   14474             delta += 1;
   14475             DIP("pmaddwd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   14476          } else {
   14477             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14478             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   14479             delta += alen;
   14480             DIP("pmaddwd %s,%s\n", dis_buf, nameXMMReg(rG));
   14481          }
   14482          assign( dV, getXMMReg(rG) );
   14483          putXMMReg( rG, mkexpr(math_PMADDWD_128(dV, sV)) );
   14484          goto decode_success;
   14485       }
   14486       break;
   14487 
   14488    case 0xF6:
   14489       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14490       /* 0F F6 = PSADBW -- sum of 8Ux8 absolute differences */
   14491       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14492          do_MMX_preamble();
   14493          delta = dis_MMXop_regmem_to_reg (
   14494                     vbi, pfx, delta, opc, "psadbw", False );
   14495          goto decode_success;
   14496       }
   14497       /* 66 0F F6 = PSADBW -- 2 x (8x8 -> 48 zeroes ++ u16) Sum Abs Diffs
   14498          from E(xmm or mem) to G(xmm) */
   14499       if (have66noF2noF3(pfx) && sz == 2) {
   14500          IRTemp sV  = newTemp(Ity_V128);
   14501          IRTemp dV  = newTemp(Ity_V128);
   14502          modrm = getUChar(delta);
   14503          UInt   rG   = gregOfRexRM(pfx,modrm);
   14504          if (epartIsReg(modrm)) {
   14505             UInt rE = eregOfRexRM(pfx,modrm);
   14506             assign( sV, getXMMReg(rE) );
   14507             delta += 1;
   14508             DIP("psadbw %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   14509          } else {
   14510             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14511             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   14512             delta += alen;
   14513             DIP("psadbw %s,%s\n", dis_buf, nameXMMReg(rG));
   14514          }
   14515          assign( dV, getXMMReg(rG) );
   14516          putXMMReg( rG, mkexpr( math_PSADBW_128 ( dV, sV ) ) );
   14517 
   14518          goto decode_success;
   14519       }
   14520       break;
   14521 
   14522    case 0xF7:
   14523       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14524       /* 0F F7 = MASKMOVQ -- 8x8 masked store */
   14525       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14526          Bool ok = False;
   14527          delta = dis_MMX( &ok, vbi, pfx, sz, delta-1 );
   14528          if (ok) goto decode_success;
   14529       }
   14530       /* 66 0F F7 = MASKMOVDQU -- store selected bytes of double quadword */
   14531       if (have66noF2noF3(pfx) && sz == 2 && epartIsReg(getUChar(delta))) {
   14532          delta = dis_MASKMOVDQU( vbi, pfx, delta, False/*!isAvx*/ );
   14533          goto decode_success;
   14534       }
   14535       break;
   14536 
   14537    case 0xF8:
   14538       /* 66 0F F8 = PSUBB */
   14539       if (have66noF2noF3(pfx) && sz == 2) {
   14540          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14541                                     "psubb", Iop_Sub8x16, False );
   14542          goto decode_success;
   14543       }
   14544       break;
   14545 
   14546    case 0xF9:
   14547       /* 66 0F F9 = PSUBW */
   14548       if (have66noF2noF3(pfx) && sz == 2) {
   14549          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14550                                     "psubw", Iop_Sub16x8, False );
   14551          goto decode_success;
   14552       }
   14553       break;
   14554 
   14555    case 0xFA:
   14556       /* 66 0F FA = PSUBD */
   14557       if (have66noF2noF3(pfx) && sz == 2) {
   14558          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14559                                     "psubd", Iop_Sub32x4, False );
   14560          goto decode_success;
   14561       }
   14562       break;
   14563 
   14564    case 0xFB:
   14565       /* 66 0F FB = PSUBQ */
   14566       if (have66noF2noF3(pfx) && sz == 2) {
   14567          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14568                                     "psubq", Iop_Sub64x2, False );
   14569          goto decode_success;
   14570       }
   14571       /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   14572       /* 0F FB = PSUBQ -- sub 64x1 */
   14573       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14574          do_MMX_preamble();
   14575          delta = dis_MMXop_regmem_to_reg (
   14576                    vbi, pfx, delta, opc, "psubq", False );
   14577          goto decode_success;
   14578       }
   14579       break;
   14580 
   14581    case 0xFC:
   14582       /* 66 0F FC = PADDB */
   14583       if (have66noF2noF3(pfx) && sz == 2) {
   14584          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14585                                     "paddb", Iop_Add8x16, False );
   14586          goto decode_success;
   14587       }
   14588       break;
   14589 
   14590    case 0xFD:
   14591       /* 66 0F FD = PADDW */
   14592       if (have66noF2noF3(pfx) && sz == 2) {
   14593          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14594                                     "paddw", Iop_Add16x8, False );
   14595          goto decode_success;
   14596       }
   14597       break;
   14598 
   14599    case 0xFE:
   14600       /* 66 0F FE = PADDD */
   14601       if (have66noF2noF3(pfx) && sz == 2) {
   14602          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14603                                     "paddd", Iop_Add32x4, False );
   14604          goto decode_success;
   14605       }
   14606       break;
   14607 
   14608    default:
   14609       goto decode_failure;
   14610 
   14611    }
   14612 
   14613   decode_failure:
   14614    *decode_OK = False;
   14615    return deltaIN;
   14616 
   14617   decode_success:
   14618    *decode_OK = True;
   14619    return delta;
   14620 }
   14621 
   14622 
   14623 /*------------------------------------------------------------*/
   14624 /*---                                                      ---*/
   14625 /*--- Top-level SSE3 (not SupSSE3): dis_ESC_0F__SSE3       ---*/
   14626 /*---                                                      ---*/
   14627 /*------------------------------------------------------------*/
   14628 
   14629 static Long dis_MOVDDUP_128 ( VexAbiInfo* vbi, Prefix pfx,
   14630                               Long delta, Bool isAvx )
   14631 {
   14632    IRTemp addr   = IRTemp_INVALID;
   14633    Int    alen   = 0;
   14634    HChar  dis_buf[50];
   14635    IRTemp sV    = newTemp(Ity_V128);
   14636    IRTemp d0    = newTemp(Ity_I64);
   14637    UChar  modrm = getUChar(delta);
   14638    UInt   rG    = gregOfRexRM(pfx,modrm);
   14639    if (epartIsReg(modrm)) {
   14640       UInt rE = eregOfRexRM(pfx,modrm);
   14641       assign( sV, getXMMReg(rE) );
   14642       DIP("%smovddup %s,%s\n",
   14643           isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
   14644       delta += 1;
   14645       assign ( d0, unop(Iop_V128to64, mkexpr(sV)) );
   14646    } else {
   14647       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14648       assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
   14649       DIP("%smovddup %s,%s\n",
   14650           isAvx ? "v" : "", dis_buf, nameXMMReg(rG));
   14651       delta += alen;
   14652    }
   14653    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   14654       ( rG, binop(Iop_64HLtoV128,mkexpr(d0),mkexpr(d0)) );
   14655    return delta;
   14656 }
   14657 
   14658 
   14659 static Long dis_MOVDDUP_256 ( VexAbiInfo* vbi, Prefix pfx,
   14660                               Long delta )
   14661 {
   14662    IRTemp addr   = IRTemp_INVALID;
   14663    Int    alen   = 0;
   14664    HChar  dis_buf[50];
   14665    IRTemp d0    = newTemp(Ity_I64);
   14666    IRTemp d1    = newTemp(Ity_I64);
   14667    UChar  modrm = getUChar(delta);
   14668    UInt   rG    = gregOfRexRM(pfx,modrm);
   14669    if (epartIsReg(modrm)) {
   14670       UInt rE = eregOfRexRM(pfx,modrm);
   14671       DIP("vmovddup %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
   14672       delta += 1;
   14673       assign ( d0, getYMMRegLane64(rE, 0) );
   14674       assign ( d1, getYMMRegLane64(rE, 2) );
   14675    } else {
   14676       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14677       assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
   14678       assign( d1, loadLE(Ity_I64, binop(Iop_Add64,
   14679                                         mkexpr(addr), mkU64(16))) );
   14680       DIP("vmovddup %s,%s\n", dis_buf, nameYMMReg(rG));
   14681       delta += alen;
   14682    }
   14683    putYMMRegLane64( rG, 0, mkexpr(d0) );
   14684    putYMMRegLane64( rG, 1, mkexpr(d0) );
   14685    putYMMRegLane64( rG, 2, mkexpr(d1) );
   14686    putYMMRegLane64( rG, 3, mkexpr(d1) );
   14687    return delta;
   14688 }
   14689 
   14690 
   14691 static Long dis_MOVSxDUP_128 ( VexAbiInfo* vbi, Prefix pfx,
   14692                                Long delta, Bool isAvx, Bool isL )
   14693 {
   14694    IRTemp addr  = IRTemp_INVALID;
   14695    Int    alen  = 0;
   14696    HChar  dis_buf[50];
   14697    IRTemp sV    = newTemp(Ity_V128);
   14698    UChar  modrm = getUChar(delta);
   14699    UInt   rG    = gregOfRexRM(pfx,modrm);
   14700    IRTemp s3, s2, s1, s0;
   14701    s3 = s2 = s1 = s0 = IRTemp_INVALID;
   14702    if (epartIsReg(modrm)) {
   14703       UInt rE = eregOfRexRM(pfx,modrm);
   14704       assign( sV, getXMMReg(rE) );
   14705       DIP("%smovs%cdup %s,%s\n",
   14706           isAvx ? "v" : "", isL ? 'l' : 'h', nameXMMReg(rE), nameXMMReg(rG));
   14707       delta += 1;
   14708    } else {
   14709       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14710       if (!isAvx)
   14711          gen_SEGV_if_not_16_aligned( addr );
   14712       assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   14713       DIP("%smovs%cdup %s,%s\n",
   14714           isAvx ? "v" : "", isL ? 'l' : 'h', dis_buf, nameXMMReg(rG));
   14715       delta += alen;
   14716    }
   14717    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   14718    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   14719       ( rG, isL ? mkV128from32s( s2, s2, s0, s0 )
   14720                 : mkV128from32s( s3, s3, s1, s1 ) );
   14721    return delta;
   14722 }
   14723 
   14724 
   14725 static Long dis_MOVSxDUP_256 ( VexAbiInfo* vbi, Prefix pfx,
   14726                                Long delta, Bool isL )
   14727 {
   14728    IRTemp addr  = IRTemp_INVALID;
   14729    Int    alen  = 0;
   14730    HChar  dis_buf[50];
   14731    IRTemp sV    = newTemp(Ity_V256);
   14732    UChar  modrm = getUChar(delta);
   14733    UInt   rG    = gregOfRexRM(pfx,modrm);
   14734    IRTemp s7, s6, s5, s4, s3, s2, s1, s0;
   14735    s7 = s6 = s5 = s4 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   14736    if (epartIsReg(modrm)) {
   14737       UInt rE = eregOfRexRM(pfx,modrm);
   14738       assign( sV, getYMMReg(rE) );
   14739       DIP("vmovs%cdup %s,%s\n",
   14740           isL ? 'l' : 'h', nameYMMReg(rE), nameYMMReg(rG));
   14741       delta += 1;
   14742    } else {
   14743       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14744       assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
   14745       DIP("vmovs%cdup %s,%s\n",
   14746           isL ? 'l' : 'h', dis_buf, nameYMMReg(rG));
   14747       delta += alen;
   14748    }
   14749    breakupV256to32s( sV, &s7, &s6, &s5, &s4, &s3, &s2, &s1, &s0 );
   14750    putYMMRegLane128( rG, 1, isL ? mkV128from32s( s6, s6, s4, s4 )
   14751                                 : mkV128from32s( s7, s7, s5, s5 ) );
   14752    putYMMRegLane128( rG, 0, isL ? mkV128from32s( s2, s2, s0, s0 )
   14753                                 : mkV128from32s( s3, s3, s1, s1 ) );
   14754    return delta;
   14755 }
   14756 
   14757 
   14758 static IRTemp math_HADDPS_128 ( IRTemp dV, IRTemp sV, Bool isAdd )
   14759 {
   14760    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   14761    IRTemp leftV  = newTemp(Ity_V128);
   14762    IRTemp rightV = newTemp(Ity_V128);
   14763    IRTemp rm     = newTemp(Ity_I32);
   14764    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   14765 
   14766    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   14767    breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   14768 
   14769    assign( leftV,  mkV128from32s( s2, s0, d2, d0 ) );
   14770    assign( rightV, mkV128from32s( s3, s1, d3, d1 ) );
   14771 
   14772    IRTemp res = newTemp(Ity_V128);
   14773    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   14774    assign( res, triop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4,
   14775                       mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) );
   14776    return res;
   14777 }
   14778 
   14779 
   14780 static IRTemp math_HADDPD_128 ( IRTemp dV, IRTemp sV, Bool isAdd )
   14781 {
   14782    IRTemp s1, s0, d1, d0;
   14783    IRTemp leftV  = newTemp(Ity_V128);
   14784    IRTemp rightV = newTemp(Ity_V128);
   14785    IRTemp rm     = newTemp(Ity_I32);
   14786    s1 = s0 = d1 = d0 = IRTemp_INVALID;
   14787 
   14788    breakupV128to64s( sV, &s1, &s0 );
   14789    breakupV128to64s( dV, &d1, &d0 );
   14790 
   14791    assign( leftV,  binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)) );
   14792    assign( rightV, binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1)) );
   14793 
   14794    IRTemp res = newTemp(Ity_V128);
   14795    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   14796    assign( res, triop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2,
   14797                       mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) );
   14798    return res;
   14799 }
   14800 
   14801 
   14802 __attribute__((noinline))
   14803 static
   14804 Long dis_ESC_0F__SSE3 ( Bool* decode_OK,
   14805                         VexAbiInfo* vbi,
   14806                         Prefix pfx, Int sz, Long deltaIN )
   14807 {
   14808    IRTemp addr  = IRTemp_INVALID;
   14809    UChar  modrm = 0;
   14810    Int    alen  = 0;
   14811    HChar  dis_buf[50];
   14812 
   14813    *decode_OK = False;
   14814 
   14815    Long   delta = deltaIN;
   14816    UChar  opc   = getUChar(delta);
   14817    delta++;
   14818    switch (opc) {
   14819 
   14820    case 0x12:
   14821       /* F3 0F 12 = MOVSLDUP -- move from E (mem or xmm) to G (xmm),
   14822          duplicating some lanes (2:2:0:0). */
   14823       if (haveF3no66noF2(pfx) && sz == 4) {
   14824          delta = dis_MOVSxDUP_128( vbi, pfx, delta, False/*!isAvx*/,
   14825                                    True/*isL*/ );
   14826          goto decode_success;
   14827       }
   14828       /* F2 0F 12 = MOVDDUP -- move from E (mem or xmm) to G (xmm),
   14829          duplicating some lanes (0:1:0:1). */
   14830       if (haveF2no66noF3(pfx)
   14831           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   14832          delta = dis_MOVDDUP_128( vbi, pfx, delta, False/*!isAvx*/ );
   14833          goto decode_success;
   14834       }
   14835       break;
   14836 
   14837    case 0x16:
   14838       /* F3 0F 16 = MOVSHDUP -- move from E (mem or xmm) to G (xmm),
   14839          duplicating some lanes (3:3:1:1). */
   14840       if (haveF3no66noF2(pfx) && sz == 4) {
   14841          delta = dis_MOVSxDUP_128( vbi, pfx, delta, False/*!isAvx*/,
   14842                                    False/*!isL*/ );
   14843          goto decode_success;
   14844       }
   14845       break;
   14846 
   14847    case 0x7C:
   14848    case 0x7D:
   14849       /* F2 0F 7C = HADDPS -- 32x4 add across from E (mem or xmm) to G (xmm). */
   14850       /* F2 0F 7D = HSUBPS -- 32x4 sub across from E (mem or xmm) to G (xmm). */
   14851       if (haveF2no66noF3(pfx) && sz == 4) {
   14852          IRTemp eV     = newTemp(Ity_V128);
   14853          IRTemp gV     = newTemp(Ity_V128);
   14854          Bool   isAdd  = opc == 0x7C;
   14855          const HChar* str = isAdd ? "add" : "sub";
   14856          modrm         = getUChar(delta);
   14857          UInt   rG     = gregOfRexRM(pfx,modrm);
   14858          if (epartIsReg(modrm)) {
   14859             UInt rE = eregOfRexRM(pfx,modrm);
   14860             assign( eV, getXMMReg(rE) );
   14861             DIP("h%sps %s,%s\n", str, nameXMMReg(rE), nameXMMReg(rG));
   14862             delta += 1;
   14863          } else {
   14864             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14865             assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   14866             DIP("h%sps %s,%s\n", str, dis_buf, nameXMMReg(rG));
   14867             delta += alen;
   14868          }
   14869 
   14870          assign( gV, getXMMReg(rG) );
   14871          putXMMReg( rG, mkexpr( math_HADDPS_128 ( gV, eV, isAdd ) ) );
   14872          goto decode_success;
   14873       }
   14874       /* 66 0F 7C = HADDPD -- 64x2 add across from E (mem or xmm) to G (xmm). */
   14875       /* 66 0F 7D = HSUBPD -- 64x2 sub across from E (mem or xmm) to G (xmm). */
   14876       if (have66noF2noF3(pfx) && sz == 2) {
   14877          IRTemp eV     = newTemp(Ity_V128);
   14878          IRTemp gV     = newTemp(Ity_V128);
   14879          Bool   isAdd  = opc == 0x7C;
   14880          const HChar* str = isAdd ? "add" : "sub";
   14881          modrm         = getUChar(delta);
   14882          UInt   rG     = gregOfRexRM(pfx,modrm);
   14883          if (epartIsReg(modrm)) {
   14884             UInt rE = eregOfRexRM(pfx,modrm);
   14885             assign( eV, getXMMReg(rE) );
   14886             DIP("h%spd %s,%s\n", str, nameXMMReg(rE), nameXMMReg(rG));
   14887             delta += 1;
   14888          } else {
   14889             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14890             assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   14891             DIP("h%spd %s,%s\n", str, dis_buf, nameXMMReg(rG));
   14892             delta += alen;
   14893          }
   14894 
   14895          assign( gV, getXMMReg(rG) );
   14896          putXMMReg( rG, mkexpr( math_HADDPD_128 ( gV, eV, isAdd ) ) );
   14897          goto decode_success;
   14898       }
   14899       break;
   14900 
   14901    case 0xD0:
   14902       /* 66 0F D0 = ADDSUBPD -- 64x4 +/- from E (mem or xmm) to G (xmm). */
   14903       if (have66noF2noF3(pfx) && sz == 2) {
   14904          IRTemp eV   = newTemp(Ity_V128);
   14905          IRTemp gV   = newTemp(Ity_V128);
   14906          modrm       = getUChar(delta);
   14907          UInt   rG   = gregOfRexRM(pfx,modrm);
   14908          if (epartIsReg(modrm)) {
   14909             UInt rE = eregOfRexRM(pfx,modrm);
   14910             assign( eV, getXMMReg(rE) );
   14911             DIP("addsubpd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   14912             delta += 1;
   14913          } else {
   14914             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14915             assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   14916             DIP("addsubpd %s,%s\n", dis_buf, nameXMMReg(rG));
   14917             delta += alen;
   14918          }
   14919 
   14920          assign( gV, getXMMReg(rG) );
   14921          putXMMReg( rG, mkexpr( math_ADDSUBPD_128 ( gV, eV ) ) );
   14922          goto decode_success;
   14923       }
   14924       /* F2 0F D0 = ADDSUBPS -- 32x4 +/-/+/- from E (mem or xmm) to G (xmm). */
   14925       if (haveF2no66noF3(pfx) && sz == 4) {
   14926          IRTemp eV   = newTemp(Ity_V128);
   14927          IRTemp gV   = newTemp(Ity_V128);
   14928          modrm       = getUChar(delta);
   14929          UInt   rG   = gregOfRexRM(pfx,modrm);
   14930 
   14931          modrm = getUChar(delta);
   14932          if (epartIsReg(modrm)) {
   14933             UInt rE = eregOfRexRM(pfx,modrm);
   14934             assign( eV, getXMMReg(rE) );
   14935             DIP("addsubps %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   14936             delta += 1;
   14937          } else {
   14938             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14939             assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   14940             DIP("addsubps %s,%s\n", dis_buf, nameXMMReg(rG));
   14941             delta += alen;
   14942          }
   14943 
   14944          assign( gV, getXMMReg(rG) );
   14945          putXMMReg( rG, mkexpr( math_ADDSUBPS_128 ( gV, eV ) ) );
   14946          goto decode_success;
   14947       }
   14948       break;
   14949 
   14950    case 0xF0:
   14951       /* F2 0F F0 = LDDQU -- move from E (mem or xmm) to G (xmm). */
   14952       if (haveF2no66noF3(pfx) && sz == 4) {
   14953          modrm = getUChar(delta);
   14954          if (epartIsReg(modrm)) {
   14955             goto decode_failure;
   14956          } else {
   14957             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14958             putXMMReg( gregOfRexRM(pfx,modrm),
   14959                        loadLE(Ity_V128, mkexpr(addr)) );
   14960             DIP("lddqu %s,%s\n", dis_buf,
   14961                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   14962             delta += alen;
   14963          }
   14964          goto decode_success;
   14965       }
   14966       break;
   14967 
   14968    default:
   14969       goto decode_failure;
   14970 
   14971    }
   14972 
   14973   decode_failure:
   14974    *decode_OK = False;
   14975    return deltaIN;
   14976 
   14977   decode_success:
   14978    *decode_OK = True;
   14979    return delta;
   14980 }
   14981 
   14982 
   14983 /*------------------------------------------------------------*/
   14984 /*---                                                      ---*/
   14985 /*--- Top-level SSSE3: dis_ESC_0F38__SupSSE3               ---*/
   14986 /*---                                                      ---*/
   14987 /*------------------------------------------------------------*/
   14988 
static
IRTemp math_PSHUFB_XMM ( IRTemp dV/*data to perm*/, IRTemp sV/*perm*/ )
{
   /* Generate IR for a 128-bit PSHUFB: permute the 16 bytes of dV as
      directed by the per-lane byte selectors in sV, and return a new
      V128 temp holding the result.  For each selector byte: bits 2:0
      choose the source byte within a 64-bit half, bit 3 chooses which
      half (high or low) of dV, and bit 7 set forces the result byte
      to zero.  The whole computation is done as two independent
      64-bit halves using 8x8 vector ops. */
   IRTemp sHi        = newTemp(Ity_I64);
   IRTemp sLo        = newTemp(Ity_I64);
   IRTemp dHi        = newTemp(Ity_I64);
   IRTemp dLo        = newTemp(Ity_I64);
   IRTemp rHi        = newTemp(Ity_I64);
   IRTemp rLo        = newTemp(Ity_I64);
   IRTemp sevens     = newTemp(Ity_I64);
   IRTemp mask0x80hi = newTemp(Ity_I64);
   IRTemp mask0x80lo = newTemp(Ity_I64);
   IRTemp maskBit3hi = newTemp(Ity_I64);
   IRTemp maskBit3lo = newTemp(Ity_I64);
   IRTemp sAnd7hi    = newTemp(Ity_I64);
   IRTemp sAnd7lo    = newTemp(Ity_I64);
   IRTemp permdHi    = newTemp(Ity_I64);
   IRTemp permdLo    = newTemp(Ity_I64);
   IRTemp res        = newTemp(Ity_V128);

   /* Split both the data and the selector vectors into 64-bit
      halves. */
   assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

   /* 0x07 in every byte lane: mask for the 3-bit source index. */
   assign( sevens, mkU64(0x0707070707070707ULL) );

   /* mask0x80hi = Not(SarN8x8(sHi,7))
      maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7)
      sAnd7hi    = And(sHi,sevens)
      permdHi    = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi),
      And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) )
      rHi        = And(permdHi,mask0x80hi)
   */
   /* Lanes whose selector has bit 7 set become 0x00 in this mask,
      and hence zero in the final result. */
   assign(
      mask0x80hi,
      unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7))));

   /* Replicate selector bit 3 across each lane: all-ones lanes take
      their byte from dHi, all-zeroes lanes from dLo. */
   assign(
      maskBit3hi,
      binop(Iop_SarN8x8,
            binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)),
            mkU8(7)));

   /* Per-lane source index within a 64-bit half (0 .. 7). */
   assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens)));

   /* Permute both halves by the same indices and merge, selecting
      per lane via maskBit3hi. */
   assign(
      permdHi,
      binop(
         Iop_Or64,
         binop(Iop_And64,
               binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)),
               mkexpr(maskBit3hi)),
         binop(Iop_And64,
               binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)),
               unop(Iop_Not64,mkexpr(maskBit3hi))) ));

   /* Finally, zero out the lanes requested by selector bit 7. */
   assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) );

   /* And the same for the lower half of the result.  What fun. */

   assign(
      mask0x80lo,
      unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7))));

   assign(
      maskBit3lo,
      binop(Iop_SarN8x8,
            binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)),
            mkU8(7)));

   assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens)));

   assign(
      permdLo,
      binop(
         Iop_Or64,
         binop(Iop_And64,
               binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)),
               mkexpr(maskBit3lo)),
         binop(Iop_And64,
               binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)),
               unop(Iop_Not64,mkexpr(maskBit3lo))) ));

   assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) );

   /* Reassemble the two 64-bit results into the V128 answer. */
   assign(res, binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo)));
   return res;
}
   15078 
   15079 
   15080 static
   15081 IRTemp math_PSHUFB_YMM ( IRTemp dV/*data to perm*/, IRTemp sV/*perm*/ )
   15082 {
   15083    IRTemp sHi, sLo, dHi, dLo;
   15084    sHi = sLo = dHi = dLo = IRTemp_INVALID;
   15085    breakupV256toV128s( dV, &dHi, &dLo);
   15086    breakupV256toV128s( sV, &sHi, &sLo);
   15087    IRTemp res = newTemp(Ity_V256);
   15088    assign(res, binop(Iop_V128HLtoV256,
   15089                      mkexpr(math_PSHUFB_XMM(dHi, sHi)),
   15090                      mkexpr(math_PSHUFB_XMM(dLo, sLo))));
   15091    return res;
   15092 }
   15093 
   15094 
   15095 static Long dis_PHADD_128 ( VexAbiInfo* vbi, Prefix pfx, Long delta,
   15096                             Bool isAvx, UChar opc )
   15097 {
   15098    IRTemp addr   = IRTemp_INVALID;
   15099    Int    alen   = 0;
   15100    HChar  dis_buf[50];
   15101    const HChar* str = "???";
   15102    IROp   opV64  = Iop_INVALID;
   15103    IROp   opCatO = Iop_CatOddLanes16x4;
   15104    IROp   opCatE = Iop_CatEvenLanes16x4;
   15105    IRTemp sV     = newTemp(Ity_V128);
   15106    IRTemp dV     = newTemp(Ity_V128);
   15107    IRTemp sHi    = newTemp(Ity_I64);
   15108    IRTemp sLo    = newTemp(Ity_I64);
   15109    IRTemp dHi    = newTemp(Ity_I64);
   15110    IRTemp dLo    = newTemp(Ity_I64);
   15111    UChar  modrm  = getUChar(delta);
   15112    UInt   rG     = gregOfRexRM(pfx,modrm);
   15113    UInt   rV     = isAvx ? getVexNvvvv(pfx) : rG;
   15114 
   15115    switch (opc) {
   15116       case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
   15117       case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
   15118       case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
   15119       case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
   15120       case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
   15121       case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
   15122       default: vassert(0);
   15123    }
   15124    if (opc == 0x02 || opc == 0x06) {
   15125       opCatO = Iop_InterleaveHI32x2;
   15126       opCatE = Iop_InterleaveLO32x2;
   15127    }
   15128 
   15129    assign( dV, getXMMReg(rV) );
   15130 
   15131    if (epartIsReg(modrm)) {
   15132       UInt rE = eregOfRexRM(pfx,modrm);
   15133       assign( sV, getXMMReg(rE) );
   15134       DIP("%sph%s %s,%s\n", isAvx ? "v" : "", str,
   15135           nameXMMReg(rE), nameXMMReg(rG));
   15136       delta += 1;
   15137    } else {
   15138       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15139       if (!isAvx)
   15140          gen_SEGV_if_not_16_aligned( addr );
   15141       assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   15142       DIP("%sph%s %s,%s\n", isAvx ? "v" : "", str,
   15143           dis_buf, nameXMMReg(rG));
   15144       delta += alen;
   15145    }
   15146 
   15147    assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   15148    assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   15149    assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   15150    assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   15151 
   15152    /* This isn't a particularly efficient way to compute the
   15153       result, but at least it avoids a proliferation of IROps,
   15154       hence avoids complication all the backends. */
   15155 
   15156    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   15157       ( rG,
   15158         binop(Iop_64HLtoV128,
   15159               binop(opV64,
   15160                     binop(opCatE,mkexpr(sHi),mkexpr(sLo)),
   15161                     binop(opCatO,mkexpr(sHi),mkexpr(sLo)) ),
   15162               binop(opV64,
   15163                     binop(opCatE,mkexpr(dHi),mkexpr(dLo)),
   15164                     binop(opCatO,mkexpr(dHi),mkexpr(dLo)) ) ) );
   15165    return delta;
   15166 }
   15167 
   15168 
   15169 static Long dis_PHADD_256 ( VexAbiInfo* vbi, Prefix pfx, Long delta, UChar opc )
   15170 {
   15171    IRTemp addr   = IRTemp_INVALID;
   15172    Int    alen   = 0;
   15173    HChar  dis_buf[50];
   15174    const HChar* str = "???";
   15175    IROp   opV64  = Iop_INVALID;
   15176    IROp   opCatO = Iop_CatOddLanes16x4;
   15177    IROp   opCatE = Iop_CatEvenLanes16x4;
   15178    IRTemp sV     = newTemp(Ity_V256);
   15179    IRTemp dV     = newTemp(Ity_V256);
   15180    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   15181    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   15182    UChar  modrm  = getUChar(delta);
   15183    UInt   rG     = gregOfRexRM(pfx,modrm);
   15184    UInt   rV     = getVexNvvvv(pfx);
   15185 
   15186    switch (opc) {
   15187       case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
   15188       case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
   15189       case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
   15190       case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
   15191       case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
   15192       case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
   15193       default: vassert(0);
   15194    }
   15195    if (opc == 0x02 || opc == 0x06) {
   15196       opCatO = Iop_InterleaveHI32x2;
   15197       opCatE = Iop_InterleaveLO32x2;
   15198    }
   15199 
   15200    assign( dV, getYMMReg(rV) );
   15201 
   15202    if (epartIsReg(modrm)) {
   15203       UInt rE = eregOfRexRM(pfx,modrm);
   15204       assign( sV, getYMMReg(rE) );
   15205       DIP("vph%s %s,%s\n", str, nameYMMReg(rE), nameYMMReg(rG));
   15206       delta += 1;
   15207    } else {
   15208       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15209       assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
   15210       DIP("vph%s %s,%s\n", str, dis_buf, nameYMMReg(rG));
   15211       delta += alen;
   15212    }
   15213 
   15214    breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
   15215    breakupV256to64s( sV, &s3, &s2, &s1, &s0 );
   15216 
   15217    /* This isn't a particularly efficient way to compute the
   15218       result, but at least it avoids a proliferation of IROps,
   15219       hence avoids complication all the backends. */
   15220 
   15221    putYMMReg( rG,
   15222               binop(Iop_V128HLtoV256,
   15223                     binop(Iop_64HLtoV128,
   15224                           binop(opV64,
   15225                                 binop(opCatE,mkexpr(s3),mkexpr(s2)),
   15226                                 binop(opCatO,mkexpr(s3),mkexpr(s2)) ),
   15227                           binop(opV64,
   15228                                 binop(opCatE,mkexpr(d3),mkexpr(d2)),
   15229                                 binop(opCatO,mkexpr(d3),mkexpr(d2)) ) ),
   15230                     binop(Iop_64HLtoV128,
   15231                           binop(opV64,
   15232                                 binop(opCatE,mkexpr(s1),mkexpr(s0)),
   15233                                 binop(opCatO,mkexpr(s1),mkexpr(s0)) ),
   15234                           binop(opV64,
   15235                                 binop(opCatE,mkexpr(d1),mkexpr(d0)),
   15236                                 binop(opCatO,mkexpr(d1),mkexpr(d0)) ) ) ) );
   15237    return delta;
   15238 }
   15239 
   15240 
   15241 static IRTemp math_PMADDUBSW_128 ( IRTemp dV, IRTemp sV )
   15242 {
   15243    IRTemp sVoddsSX  = newTemp(Ity_V128);
   15244    IRTemp sVevensSX = newTemp(Ity_V128);
   15245    IRTemp dVoddsZX  = newTemp(Ity_V128);
   15246    IRTemp dVevensZX = newTemp(Ity_V128);
   15247    /* compute dV unsigned x sV signed */
   15248    assign( sVoddsSX, binop(Iop_SarN16x8, mkexpr(sV), mkU8(8)) );
   15249    assign( sVevensSX, binop(Iop_SarN16x8,
   15250                             binop(Iop_ShlN16x8, mkexpr(sV), mkU8(8)),
   15251                             mkU8(8)) );
   15252    assign( dVoddsZX, binop(Iop_ShrN16x8, mkexpr(dV), mkU8(8)) );
   15253    assign( dVevensZX, binop(Iop_ShrN16x8,
   15254                             binop(Iop_ShlN16x8, mkexpr(dV), mkU8(8)),
   15255                             mkU8(8)) );
   15256 
   15257    IRTemp res = newTemp(Ity_V128);
   15258    assign( res, binop(Iop_QAdd16Sx8,
   15259                       binop(Iop_Mul16x8, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
   15260                       binop(Iop_Mul16x8, mkexpr(sVevensSX), mkexpr(dVevensZX))
   15261                      )
   15262          );
   15263    return res;
   15264 }
   15265 
   15266 
   15267 static
   15268 IRTemp math_PMADDUBSW_256 ( IRTemp dV, IRTemp sV )
   15269 {
   15270    IRTemp sHi, sLo, dHi, dLo;
   15271    sHi = sLo = dHi = dLo = IRTemp_INVALID;
   15272    breakupV256toV128s( dV, &dHi, &dLo);
   15273    breakupV256toV128s( sV, &sHi, &sLo);
   15274    IRTemp res = newTemp(Ity_V256);
   15275    assign(res, binop(Iop_V128HLtoV256,
   15276                      mkexpr(math_PMADDUBSW_128(dHi, sHi)),
   15277                      mkexpr(math_PMADDUBSW_128(dLo, sLo))));
   15278    return res;
   15279 }
   15280 
   15281 
   15282 __attribute__((noinline))
   15283 static
   15284 Long dis_ESC_0F38__SupSSE3 ( Bool* decode_OK,
   15285                              VexAbiInfo* vbi,
   15286                              Prefix pfx, Int sz, Long deltaIN )
   15287 {
   15288    IRTemp addr  = IRTemp_INVALID;
   15289    UChar  modrm = 0;
   15290    Int    alen  = 0;
   15291    HChar  dis_buf[50];
   15292 
   15293    *decode_OK = False;
   15294 
   15295    Long   delta = deltaIN;
   15296    UChar  opc   = getUChar(delta);
   15297    delta++;
   15298    switch (opc) {
   15299 
   15300    case 0x00:
   15301       /* 66 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x16 (XMM) */
   15302       if (have66noF2noF3(pfx)
   15303           && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
   15304          IRTemp sV = newTemp(Ity_V128);
   15305          IRTemp dV = newTemp(Ity_V128);
   15306 
   15307          modrm = getUChar(delta);
   15308          assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   15309 
   15310          if (epartIsReg(modrm)) {
   15311             assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   15312             delta += 1;
   15313             DIP("pshufb %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   15314                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   15315          } else {
   15316             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15317             gen_SEGV_if_not_16_aligned( addr );
   15318             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   15319             delta += alen;
   15320             DIP("pshufb %s,%s\n", dis_buf,
   15321                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   15322          }
   15323 
   15324          IRTemp res = math_PSHUFB_XMM( dV, sV );
   15325          putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(res));
   15326          goto decode_success;
   15327       }
   15328       /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */
   15329       if (haveNo66noF2noF3(pfx) && sz == 4) {
   15330          IRTemp sV      = newTemp(Ity_I64);
   15331          IRTemp dV      = newTemp(Ity_I64);
   15332 
   15333          modrm = getUChar(delta);
   15334          do_MMX_preamble();
   15335          assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   15336 
   15337          if (epartIsReg(modrm)) {
   15338             assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   15339             delta += 1;
   15340             DIP("pshufb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
   15341                                   nameMMXReg(gregLO3ofRM(modrm)));
   15342          } else {
   15343             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15344             assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   15345             delta += alen;
   15346             DIP("pshufb %s,%s\n", dis_buf,
   15347                                   nameMMXReg(gregLO3ofRM(modrm)));
   15348          }
   15349 
   15350          putMMXReg(
   15351             gregLO3ofRM(modrm),
   15352             binop(
   15353                Iop_And64,
   15354                /* permute the lanes */
   15355                binop(
   15356                   Iop_Perm8x8,
   15357                   mkexpr(dV),
   15358                   binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL))
   15359                ),
   15360                /* mask off lanes which have (index & 0x80) == 0x80 */
   15361                unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7)))
   15362             )
   15363          );
   15364          goto decode_success;
   15365       }
   15366       break;
   15367 
   15368    case 0x01:
   15369    case 0x02:
   15370    case 0x03:
   15371    case 0x05:
   15372    case 0x06:
   15373    case 0x07:
   15374       /* 66 0F 38 01 = PHADDW -- 16x8 add across from E (mem or xmm) and
   15375          G to G (xmm). */
   15376       /* 66 0F 38 02 = PHADDD -- 32x4 add across from E (mem or xmm) and
   15377          G to G (xmm). */
   15378       /* 66 0F 38 03 = PHADDSW -- 16x8 signed qadd across from E (mem or
   15379          xmm) and G to G (xmm). */
   15380       /* 66 0F 38 05 = PHSUBW -- 16x8 sub across from E (mem or xmm) and
   15381          G to G (xmm). */
   15382       /* 66 0F 38 06 = PHSUBD -- 32x4 sub across from E (mem or xmm) and
   15383          G to G (xmm). */
   15384       /* 66 0F 38 07 = PHSUBSW -- 16x8 signed qsub across from E (mem or
   15385          xmm) and G to G (xmm). */
   15386       if (have66noF2noF3(pfx)
   15387           && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
   15388          delta = dis_PHADD_128( vbi, pfx, delta, False/*isAvx*/, opc );
   15389          goto decode_success;
   15390       }
   15391       /* ***--- these are MMX class insns introduced in SSSE3 ---*** */
   15392       /* 0F 38 01 = PHADDW -- 16x4 add across from E (mem or mmx) and G
   15393          to G (mmx). */
   15394       /* 0F 38 02 = PHADDD -- 32x2 add across from E (mem or mmx) and G
   15395          to G (mmx). */
   15396       /* 0F 38 03 = PHADDSW -- 16x4 signed qadd across from E (mem or
   15397          mmx) and G to G (mmx). */
   15398       /* 0F 38 05 = PHSUBW -- 16x4 sub across from E (mem or mmx) and G
   15399          to G (mmx). */
   15400       /* 0F 38 06 = PHSUBD -- 32x2 sub across from E (mem or mmx) and G
   15401          to G (mmx). */
   15402       /* 0F 38 07 = PHSUBSW -- 16x4 signed qsub across from E (mem or
   15403          mmx) and G to G (mmx). */
   15404       if (haveNo66noF2noF3(pfx) && sz == 4) {
   15405          const HChar* str = "???";
   15406          IROp   opV64  = Iop_INVALID;
   15407          IROp   opCatO = Iop_CatOddLanes16x4;
   15408          IROp   opCatE = Iop_CatEvenLanes16x4;
   15409          IRTemp sV     = newTemp(Ity_I64);
   15410          IRTemp dV     = newTemp(Ity_I64);
   15411 
   15412          modrm = getUChar(delta);
   15413 
   15414          switch (opc) {
   15415             case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
   15416             case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
   15417             case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
   15418             case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
   15419             case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
   15420             case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
   15421             default: vassert(0);
   15422          }
   15423          if (opc == 0x02 || opc == 0x06) {
   15424             opCatO = Iop_InterleaveHI32x2;
   15425             opCatE = Iop_InterleaveLO32x2;
   15426          }
   15427 
   15428          do_MMX_preamble();
   15429          assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   15430 
   15431          if (epartIsReg(modrm)) {
   15432             assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   15433             delta += 1;
   15434             DIP("ph%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
   15435                                      nameMMXReg(gregLO3ofRM(modrm)));
   15436          } else {
   15437             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15438             assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   15439             delta += alen;
   15440             DIP("ph%s %s,%s\n", str, dis_buf,
   15441                                      nameMMXReg(gregLO3ofRM(modrm)));
   15442          }
   15443 
   15444          putMMXReg(
   15445             gregLO3ofRM(modrm),
   15446             binop(opV64,
   15447                   binop(opCatE,mkexpr(sV),mkexpr(dV)),
   15448                   binop(opCatO,mkexpr(sV),mkexpr(dV))
   15449             )
   15450          );
   15451          goto decode_success;
   15452       }
   15453       break;
   15454 
   15455    case 0x04:
   15456       /* 66 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
   15457          Unsigned Bytes (XMM) */
   15458       if (have66noF2noF3(pfx)
   15459           && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
   15460          IRTemp sV = newTemp(Ity_V128);
   15461          IRTemp dV = newTemp(Ity_V128);
   15462          modrm     = getUChar(delta);
   15463          UInt   rG = gregOfRexRM(pfx,modrm);
   15464 
   15465          assign( dV, getXMMReg(rG) );
   15466 
   15467          if (epartIsReg(modrm)) {
   15468             UInt rE = eregOfRexRM(pfx,modrm);
   15469             assign( sV, getXMMReg(rE) );
   15470             delta += 1;
   15471             DIP("pmaddubsw %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   15472          } else {
   15473             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15474             gen_SEGV_if_not_16_aligned( addr );
   15475             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   15476             delta += alen;
   15477             DIP("pmaddubsw %s,%s\n", dis_buf, nameXMMReg(rG));
   15478          }
   15479 
   15480          putXMMReg( rG, mkexpr( math_PMADDUBSW_128( dV, sV ) ) );
   15481          goto decode_success;
   15482       }
   15483       /* 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
   15484          Unsigned Bytes (MMX) */
   15485       if (haveNo66noF2noF3(pfx) && sz == 4) {
   15486          IRTemp sV        = newTemp(Ity_I64);
   15487          IRTemp dV        = newTemp(Ity_I64);
   15488          IRTemp sVoddsSX  = newTemp(Ity_I64);
   15489          IRTemp sVevensSX = newTemp(Ity_I64);
   15490          IRTemp dVoddsZX  = newTemp(Ity_I64);
   15491          IRTemp dVevensZX = newTemp(Ity_I64);
   15492 
   15493          modrm = getUChar(delta);
   15494          do_MMX_preamble();
   15495          assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   15496 
   15497          if (epartIsReg(modrm)) {
   15498             assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   15499             delta += 1;
   15500             DIP("pmaddubsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
   15501                                      nameMMXReg(gregLO3ofRM(modrm)));
   15502          } else {
   15503             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15504             assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   15505             delta += alen;
   15506             DIP("pmaddubsw %s,%s\n", dis_buf,
   15507                                      nameMMXReg(gregLO3ofRM(modrm)));
   15508          }
   15509 
   15510          /* compute dV unsigned x sV signed */
   15511          assign( sVoddsSX,
   15512                  binop(Iop_SarN16x4, mkexpr(sV), mkU8(8)) );
   15513          assign( sVevensSX,
   15514                  binop(Iop_SarN16x4,
   15515                        binop(Iop_ShlN16x4, mkexpr(sV), mkU8(8)),
   15516                        mkU8(8)) );
   15517          assign( dVoddsZX,
   15518                  binop(Iop_ShrN16x4, mkexpr(dV), mkU8(8)) );
   15519          assign( dVevensZX,
   15520                  binop(Iop_ShrN16x4,
   15521                        binop(Iop_ShlN16x4, mkexpr(dV), mkU8(8)),
   15522                        mkU8(8)) );
   15523 
   15524          putMMXReg(
   15525             gregLO3ofRM(modrm),
   15526             binop(Iop_QAdd16Sx4,
   15527                   binop(Iop_Mul16x4, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
   15528                   binop(Iop_Mul16x4, mkexpr(sVevensSX), mkexpr(dVevensZX))
   15529             )
   15530          );
   15531          goto decode_success;
   15532       }
   15533       break;
   15534 
   15535    case 0x08:
   15536    case 0x09:
   15537    case 0x0A:
   15538       /* 66 0F 38 08 = PSIGNB -- Packed Sign 8x16 (XMM) */
   15539       /* 66 0F 38 09 = PSIGNW -- Packed Sign 16x8 (XMM) */
   15540       /* 66 0F 38 0A = PSIGND -- Packed Sign 32x4 (XMM) */
   15541       if (have66noF2noF3(pfx)
   15542           && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
   15543          IRTemp sV      = newTemp(Ity_V128);
   15544          IRTemp dV      = newTemp(Ity_V128);
   15545          IRTemp sHi     = newTemp(Ity_I64);
   15546          IRTemp sLo     = newTemp(Ity_I64);
   15547          IRTemp dHi     = newTemp(Ity_I64);
   15548          IRTemp dLo     = newTemp(Ity_I64);
   15549          const HChar* str = "???";
   15550          Int    laneszB = 0;
   15551 
   15552          switch (opc) {
   15553             case 0x08: laneszB = 1; str = "b"; break;
   15554             case 0x09: laneszB = 2; str = "w"; break;
   15555             case 0x0A: laneszB = 4; str = "d"; break;
   15556             default: vassert(0);
   15557          }
   15558 
   15559          modrm = getUChar(delta);
   15560          assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   15561 
   15562          if (epartIsReg(modrm)) {
   15563             assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   15564             delta += 1;
   15565             DIP("psign%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
   15566                                         nameXMMReg(gregOfRexRM(pfx,modrm)));
   15567          } else {
   15568             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15569             gen_SEGV_if_not_16_aligned( addr );
   15570             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   15571             delta += alen;
   15572             DIP("psign%s %s,%s\n", str, dis_buf,
   15573                                         nameXMMReg(gregOfRexRM(pfx,modrm)));
   15574          }
   15575 
   15576          assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   15577          assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   15578          assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   15579          assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   15580 
   15581          putXMMReg(
   15582             gregOfRexRM(pfx,modrm),
   15583             binop(Iop_64HLtoV128,
   15584                   dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
   15585                   dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
   15586             )
   15587          );
   15588          goto decode_success;
   15589       }
   15590       /* 0F 38 08 = PSIGNB -- Packed Sign 8x8  (MMX) */
   15591       /* 0F 38 09 = PSIGNW -- Packed Sign 16x4 (MMX) */
   15592       /* 0F 38 0A = PSIGND -- Packed Sign 32x2 (MMX) */
   15593       if (haveNo66noF2noF3(pfx) && sz == 4) {
   15594          IRTemp sV      = newTemp(Ity_I64);
   15595          IRTemp dV      = newTemp(Ity_I64);
   15596          const HChar* str = "???";
   15597          Int    laneszB = 0;
   15598 
   15599          switch (opc) {
   15600             case 0x08: laneszB = 1; str = "b"; break;
   15601             case 0x09: laneszB = 2; str = "w"; break;
   15602             case 0x0A: laneszB = 4; str = "d"; break;
   15603             default: vassert(0);
   15604          }
   15605 
   15606          modrm = getUChar(delta);
   15607          do_MMX_preamble();
   15608          assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   15609 
   15610          if (epartIsReg(modrm)) {
   15611             assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   15612             delta += 1;
   15613             DIP("psign%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
   15614                                         nameMMXReg(gregLO3ofRM(modrm)));
   15615          } else {
   15616             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15617             assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   15618             delta += alen;
   15619             DIP("psign%s %s,%s\n", str, dis_buf,
   15620                                         nameMMXReg(gregLO3ofRM(modrm)));
   15621          }
   15622 
   15623          putMMXReg(
   15624             gregLO3ofRM(modrm),
   15625             dis_PSIGN_helper( mkexpr(sV), mkexpr(dV), laneszB )
   15626          );
   15627          goto decode_success;
   15628       }
   15629       break;
   15630 
   15631    case 0x0B:
   15632       /* 66 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and
   15633          Scale (XMM) */
   15634       if (have66noF2noF3(pfx)
   15635           && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
   15636          IRTemp sV  = newTemp(Ity_V128);
   15637          IRTemp dV  = newTemp(Ity_V128);
   15638          IRTemp sHi = newTemp(Ity_I64);
   15639          IRTemp sLo = newTemp(Ity_I64);
   15640          IRTemp dHi = newTemp(Ity_I64);
   15641          IRTemp dLo = newTemp(Ity_I64);
   15642 
   15643          modrm = getUChar(delta);
   15644          assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   15645 
   15646          if (epartIsReg(modrm)) {
   15647             assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   15648             delta += 1;
   15649             DIP("pmulhrsw %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   15650                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   15651          } else {
   15652             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15653             gen_SEGV_if_not_16_aligned( addr );
   15654             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   15655             delta += alen;
   15656             DIP("pmulhrsw %s,%s\n", dis_buf,
   15657                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   15658          }
   15659 
   15660          assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   15661          assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   15662          assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   15663          assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   15664 
   15665          putXMMReg(
   15666             gregOfRexRM(pfx,modrm),
   15667             binop(Iop_64HLtoV128,
   15668                   dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
   15669                   dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
   15670             )
   15671          );
   15672          goto decode_success;
   15673       }
   15674       /* 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and Scale
   15675          (MMX) */
   15676       if (haveNo66noF2noF3(pfx) && sz == 4) {
   15677          IRTemp sV = newTemp(Ity_I64);
   15678          IRTemp dV = newTemp(Ity_I64);
   15679 
   15680          modrm = getUChar(delta);
   15681          do_MMX_preamble();
   15682          assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   15683 
   15684          if (epartIsReg(modrm)) {
   15685             assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   15686             delta += 1;
   15687             DIP("pmulhrsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
   15688                                     nameMMXReg(gregLO3ofRM(modrm)));
   15689          } else {
   15690             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15691             assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   15692             delta += alen;
   15693             DIP("pmulhrsw %s,%s\n", dis_buf,
   15694                                     nameMMXReg(gregLO3ofRM(modrm)));
   15695          }
   15696 
   15697          putMMXReg(
   15698             gregLO3ofRM(modrm),
   15699             dis_PMULHRSW_helper( mkexpr(sV), mkexpr(dV) )
   15700          );
   15701          goto decode_success;
   15702       }
   15703       break;
   15704 
   15705    case 0x1C:
   15706    case 0x1D:
   15707    case 0x1E:
   15708       /* 66 0F 38 1C = PABSB -- Packed Absolute Value 8x16 (XMM) */
   15709       /* 66 0F 38 1D = PABSW -- Packed Absolute Value 16x8 (XMM) */
   15710       /* 66 0F 38 1E = PABSD -- Packed Absolute Value 32x4 (XMM) */
   15711       if (have66noF2noF3(pfx)
   15712           && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
   15713          IRTemp sV  = newTemp(Ity_V128);
   15714          const HChar* str = "???";
   15715          Int    laneszB = 0;
   15716 
   15717          switch (opc) {
   15718             case 0x1C: laneszB = 1; str = "b"; break;
   15719             case 0x1D: laneszB = 2; str = "w"; break;
   15720             case 0x1E: laneszB = 4; str = "d"; break;
   15721             default: vassert(0);
   15722          }
   15723 
   15724          modrm = getUChar(delta);
   15725          if (epartIsReg(modrm)) {
   15726             assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   15727             delta += 1;
   15728             DIP("pabs%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
   15729                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
   15730          } else {
   15731             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15732             gen_SEGV_if_not_16_aligned( addr );
   15733             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   15734             delta += alen;
   15735             DIP("pabs%s %s,%s\n", str, dis_buf,
   15736                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
   15737          }
   15738 
   15739          putXMMReg( gregOfRexRM(pfx,modrm),
   15740                     mkexpr(math_PABS_XMM(sV, laneszB)) );
   15741          goto decode_success;
   15742       }
   15743       /* 0F 38 1C = PABSB -- Packed Absolute Value 8x8  (MMX) */
   15744       /* 0F 38 1D = PABSW -- Packed Absolute Value 16x4 (MMX) */
   15745       /* 0F 38 1E = PABSD -- Packed Absolute Value 32x2 (MMX) */
   15746       if (haveNo66noF2noF3(pfx) && sz == 4) {
   15747          IRTemp sV      = newTemp(Ity_I64);
   15748          const HChar* str = "???";
   15749          Int    laneszB = 0;
   15750 
   15751          switch (opc) {
   15752             case 0x1C: laneszB = 1; str = "b"; break;
   15753             case 0x1D: laneszB = 2; str = "w"; break;
   15754             case 0x1E: laneszB = 4; str = "d"; break;
   15755             default: vassert(0);
   15756          }
   15757 
   15758          modrm = getUChar(delta);
   15759          do_MMX_preamble();
   15760 
   15761          if (epartIsReg(modrm)) {
   15762             assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   15763             delta += 1;
   15764             DIP("pabs%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
   15765                                        nameMMXReg(gregLO3ofRM(modrm)));
   15766          } else {
   15767             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15768             assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   15769             delta += alen;
   15770             DIP("pabs%s %s,%s\n", str, dis_buf,
   15771                                        nameMMXReg(gregLO3ofRM(modrm)));
   15772          }
   15773 
   15774          putMMXReg( gregLO3ofRM(modrm),
   15775                     mkexpr(math_PABS_MMX( sV, laneszB )) );
   15776          goto decode_success;
   15777       }
   15778       break;
   15779 
   15780    default:
   15781       break;
   15782 
   15783    }
   15784 
   15785   //decode_failure:
   15786    *decode_OK = False;
   15787    return deltaIN;
   15788 
   15789   decode_success:
   15790    *decode_OK = True;
   15791    return delta;
   15792 }
   15793 
   15794 
   15795 /*------------------------------------------------------------*/
   15796 /*---                                                      ---*/
   15797 /*--- Top-level SSSE3: dis_ESC_0F3A__SupSSE3               ---*/
   15798 /*---                                                      ---*/
   15799 /*------------------------------------------------------------*/
   15800 
   15801 __attribute__((noinline))
   15802 static
   15803 Long dis_ESC_0F3A__SupSSE3 ( Bool* decode_OK,
   15804                              VexAbiInfo* vbi,
   15805                              Prefix pfx, Int sz, Long deltaIN )
   15806 {
   15807    Long   d64   = 0;
   15808    IRTemp addr  = IRTemp_INVALID;
   15809    UChar  modrm = 0;
   15810    Int    alen  = 0;
   15811    HChar  dis_buf[50];
   15812 
   15813    *decode_OK = False;
   15814 
   15815    Long   delta = deltaIN;
   15816    UChar  opc   = getUChar(delta);
   15817    delta++;
   15818    switch (opc) {
   15819 
   15820    case 0x0F:
   15821       /* 66 0F 3A 0F = PALIGNR -- Packed Align Right (XMM) */
   15822       if (have66noF2noF3(pfx)
   15823           && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
   15824          IRTemp sV  = newTemp(Ity_V128);
   15825          IRTemp dV  = newTemp(Ity_V128);
   15826 
   15827          modrm = getUChar(delta);
   15828          assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   15829 
   15830          if (epartIsReg(modrm)) {
   15831             assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   15832             d64 = (Long)getUChar(delta+1);
   15833             delta += 1+1;
   15834             DIP("palignr $%d,%s,%s\n", (Int)d64,
   15835                                        nameXMMReg(eregOfRexRM(pfx,modrm)),
   15836                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
   15837          } else {
   15838             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
   15839             gen_SEGV_if_not_16_aligned( addr );
   15840             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   15841             d64 = (Long)getUChar(delta+alen);
   15842             delta += alen+1;
   15843             DIP("palignr $%d,%s,%s\n", (Int)d64,
   15844                                        dis_buf,
   15845                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
   15846          }
   15847 
   15848          IRTemp res = math_PALIGNR_XMM( sV, dV, d64 );
   15849          putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
   15850          goto decode_success;
   15851       }
   15852       /* 0F 3A 0F = PALIGNR -- Packed Align Right (MMX) */
   15853       if (haveNo66noF2noF3(pfx) && sz == 4) {
   15854          IRTemp sV  = newTemp(Ity_I64);
   15855          IRTemp dV  = newTemp(Ity_I64);
   15856          IRTemp res = newTemp(Ity_I64);
   15857 
   15858          modrm = getUChar(delta);
   15859          do_MMX_preamble();
   15860          assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   15861 
   15862          if (epartIsReg(modrm)) {
   15863             assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   15864             d64 = (Long)getUChar(delta+1);
   15865             delta += 1+1;
   15866             DIP("palignr $%d,%s,%s\n",  (Int)d64,
   15867                                         nameMMXReg(eregLO3ofRM(modrm)),
   15868                                         nameMMXReg(gregLO3ofRM(modrm)));
   15869          } else {
   15870             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
   15871             assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   15872             d64 = (Long)getUChar(delta+alen);
   15873             delta += alen+1;
   15874             DIP("palignr $%d%s,%s\n", (Int)d64,
   15875                                       dis_buf,
   15876                                       nameMMXReg(gregLO3ofRM(modrm)));
   15877          }
   15878 
   15879          if (d64 == 0) {
   15880             assign( res, mkexpr(sV) );
   15881          }
   15882          else if (d64 >= 1 && d64 <= 7) {
   15883             assign(res,
   15884                    binop(Iop_Or64,
   15885                          binop(Iop_Shr64, mkexpr(sV), mkU8(8*d64)),
   15886                          binop(Iop_Shl64, mkexpr(dV), mkU8(8*(8-d64))
   15887                         )));
   15888          }
   15889          else if (d64 == 8) {
   15890            assign( res, mkexpr(dV) );
   15891          }
   15892          else if (d64 >= 9 && d64 <= 15) {
   15893             assign( res, binop(Iop_Shr64, mkexpr(dV), mkU8(8*(d64-8))) );
   15894          }
   15895          else if (d64 >= 16 && d64 <= 255) {
   15896             assign( res, mkU64(0) );
   15897          }
   15898          else
   15899             vassert(0);
   15900 
   15901          putMMXReg( gregLO3ofRM(modrm), mkexpr(res) );
   15902          goto decode_success;
   15903       }
   15904       break;
   15905 
   15906    default:
   15907       break;
   15908 
   15909    }
   15910 
   15911   //decode_failure:
   15912    *decode_OK = False;
   15913    return deltaIN;
   15914 
   15915   decode_success:
   15916    *decode_OK = True;
   15917    return delta;
   15918 }
   15919 
   15920 
   15921 /*------------------------------------------------------------*/
   15922 /*---                                                      ---*/
   15923 /*--- Top-level SSE4: dis_ESC_0F__SSE4                     ---*/
   15924 /*---                                                      ---*/
   15925 /*------------------------------------------------------------*/
   15926 
/* Decode the SSE4/ABM/BMI instructions reached via the plain 0F escape
   with an F3 prefix: POPCNT (0F B8), TZCNT (0F BC, gated on BMI
   capability since it aliases BSF) and LZCNT (0F BD, gated on LZCNT
   capability since it aliases BSR).  On success, sets *decode_OK to
   True and returns the updated instruction offset; otherwise leaves
   *decode_OK as False and returns deltaIN unchanged. */
__attribute__((noinline))
static
Long dis_ESC_0F__SSE4 ( Bool* decode_OK,
                        VexArchInfo* archinfo,
                        VexAbiInfo* vbi,
                        Prefix pfx, Int sz, Long deltaIN )
{
   IRTemp addr  = IRTemp_INVALID;
   IRType ty    = Ity_INVALID;
   UChar  modrm = 0;
   Int    alen  = 0;
   HChar  dis_buf[50];

   *decode_OK = False;

   Long   delta = deltaIN;
   UChar  opc   = getUChar(delta);
   delta++;
   switch (opc) {

   case 0xB8:
      /* F3 0F B8  = POPCNT{W,L,Q}
         Count the number of 1 bits in a register
      */
      if (haveF3noF2(pfx) /* so both 66 and REX.W are possibilities */
          && (sz == 2 || sz == 4 || sz == 8)) {
         /*IRType*/ ty  = szToITy(sz);
         IRTemp     src = newTemp(ty);
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            assign(src, getIRegE(sz, pfx, modrm));
            delta += 1;
            DIP("popcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
                nameIRegG(sz, pfx, modrm));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0);
            assign(src, loadLE(ty, mkexpr(addr)));
            delta += alen;
            DIP("popcnt%c %s, %s\n", nameISize(sz), dis_buf,
                nameIRegG(sz, pfx, modrm));
         }

         IRTemp result = gen_POPCOUNT(ty, src);
         putIRegG(sz, pfx, modrm, mkexpr(result));

         // Update flags.  This is pretty lame .. perhaps can do better
         // if this turns out to be performance critical.
         // O S A C P are cleared.  Z is set if SRC == 0.
         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
         stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
         /* DEP1 = ((src == 0) ? 1 : 0) << Z-bit position; with CC_OP_COPY
            this makes Z the only possibly-set flag. */
         stmt( IRStmt_Put( OFFB_CC_DEP1,
               binop(Iop_Shl64,
                     unop(Iop_1Uto64,
                          binop(Iop_CmpEQ64,
                                widenUto64(mkexpr(src)),
                                mkU64(0))),
                     mkU8(AMD64G_CC_SHIFT_Z))));

         goto decode_success;
      }
      break;

   case 0xBC:
      /* F3 0F BC -- TZCNT (count trailing zeroes.  A BMI extension,
         which we can only decode if we're sure this is a BMI1 capable cpu
         that supports TZCNT, since otherwise it's BSF, which behaves
         differently on zero source.  */
      if (haveF3noF2(pfx) /* so both 66 and 48 are possibilities */
          && (sz == 2 || sz == 4 || sz == 8)
          && 0 != (archinfo->hwcaps & VEX_HWCAPS_AMD64_BMI)) {
         /*IRType*/ ty  = szToITy(sz);
         IRTemp     src = newTemp(ty);
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            assign(src, getIRegE(sz, pfx, modrm));
            delta += 1;
            DIP("tzcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
                nameIRegG(sz, pfx, modrm));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0);
            assign(src, loadLE(ty, mkexpr(addr)));
            delta += alen;
            DIP("tzcnt%c %s, %s\n", nameISize(sz), dis_buf,
                nameIRegG(sz, pfx, modrm));
         }

         IRTemp res = gen_TZCNT(ty, src);
         putIRegG(sz, pfx, modrm, mkexpr(res));

         // Update flags.  This is pretty lame .. perhaps can do better
         // if this turns out to be performance critical.
         // O S A P are cleared.  Z is set if RESULT == 0.
         // C is set if SRC is zero.
         /* Widen both to 64 bits so the flag computations below work
            uniformly for the 16/32/64-bit operand sizes. */
         IRTemp src64 = newTemp(Ity_I64);
         IRTemp res64 = newTemp(Ity_I64);
         assign(src64, widenUto64(mkexpr(src)));
         assign(res64, widenUto64(mkexpr(res)));

         /* oszacp = (res==0) << Z  |  (src==0) << C */
         IRTemp oszacp = newTemp(Ity_I64);
         assign(
            oszacp,
            binop(Iop_Or64,
                  binop(Iop_Shl64,
                        unop(Iop_1Uto64,
                             binop(Iop_CmpEQ64, mkexpr(res64), mkU64(0))),
                        mkU8(AMD64G_CC_SHIFT_Z)),
                  binop(Iop_Shl64,
                        unop(Iop_1Uto64,
                             binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0))),
                        mkU8(AMD64G_CC_SHIFT_C))
            )
         );

         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
         stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
         stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));

         goto decode_success;
      }
      break;

   case 0xBD:
      /* F3 0F BD -- LZCNT (count leading zeroes.  An AMD extension,
         which we can only decode if we're sure this is an AMD cpu
         that supports LZCNT, since otherwise it's BSR, which behaves
         differently.  Bizarrely, my Sandy Bridge also accepts these
         instructions but produces different results. */
      if (haveF3noF2(pfx) /* so both 66 and 48 are possibilities */
          && (sz == 2 || sz == 4 || sz == 8)
          && 0 != (archinfo->hwcaps & VEX_HWCAPS_AMD64_LZCNT)) {
         /*IRType*/ ty  = szToITy(sz);
         IRTemp     src = newTemp(ty);
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            assign(src, getIRegE(sz, pfx, modrm));
            delta += 1;
            DIP("lzcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
                nameIRegG(sz, pfx, modrm));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0);
            assign(src, loadLE(ty, mkexpr(addr)));
            delta += alen;
            DIP("lzcnt%c %s, %s\n", nameISize(sz), dis_buf,
                nameIRegG(sz, pfx, modrm));
         }

         IRTemp res = gen_LZCNT(ty, src);
         putIRegG(sz, pfx, modrm, mkexpr(res));

         // Update flags.  This is pretty lame .. perhaps can do better
         // if this turns out to be performance critical.
         // O S A P are cleared.  Z is set if RESULT == 0.
         // C is set if SRC is zero.
         /* Same flag scheme as TZCNT above. */
         IRTemp src64 = newTemp(Ity_I64);
         IRTemp res64 = newTemp(Ity_I64);
         assign(src64, widenUto64(mkexpr(src)));
         assign(res64, widenUto64(mkexpr(res)));

         IRTemp oszacp = newTemp(Ity_I64);
         assign(
            oszacp,
            binop(Iop_Or64,
                  binop(Iop_Shl64,
                        unop(Iop_1Uto64,
                             binop(Iop_CmpEQ64, mkexpr(res64), mkU64(0))),
                        mkU8(AMD64G_CC_SHIFT_Z)),
                  binop(Iop_Shl64,
                        unop(Iop_1Uto64,
                             binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0))),
                        mkU8(AMD64G_CC_SHIFT_C))
            )
         );

         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
         stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
         stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));

         goto decode_success;
      }
      break;

   default:
      break;

   }

  //decode_failure:
   *decode_OK = False;
   return deltaIN;

  decode_success:
   *decode_OK = True;
   return delta;
}
   16124 
   16125 
   16126 /*------------------------------------------------------------*/
   16127 /*---                                                      ---*/
   16128 /*--- Top-level SSE4: dis_ESC_0F38__SSE4                   ---*/
   16129 /*---                                                      ---*/
   16130 /*------------------------------------------------------------*/
   16131 
   16132 static IRTemp math_PBLENDVB_128 ( IRTemp vecE, IRTemp vecG,
   16133                                   IRTemp vec0/*controlling mask*/,
   16134                                   UInt gran, IROp opSAR )
   16135 {
   16136    /* The tricky bit is to convert vec0 into a suitable mask, by
   16137       copying the most significant bit of each lane into all positions
   16138       in the lane. */
   16139    IRTemp sh = newTemp(Ity_I8);
   16140    assign(sh, mkU8(8 * gran - 1));
   16141 
   16142    IRTemp mask = newTemp(Ity_V128);
   16143    assign(mask, binop(opSAR, mkexpr(vec0), mkexpr(sh)));
   16144 
   16145    IRTemp notmask = newTemp(Ity_V128);
   16146    assign(notmask, unop(Iop_NotV128, mkexpr(mask)));
   16147 
   16148    IRTemp res = newTemp(Ity_V128);
   16149    assign(res,  binop(Iop_OrV128,
   16150                       binop(Iop_AndV128, mkexpr(vecE), mkexpr(mask)),
   16151                       binop(Iop_AndV128, mkexpr(vecG), mkexpr(notmask))));
   16152    return res;
   16153 }
   16154 
   16155 static IRTemp math_PBLENDVB_256 ( IRTemp vecE, IRTemp vecG,
   16156                                   IRTemp vec0/*controlling mask*/,
   16157                                   UInt gran, IROp opSAR128 )
   16158 {
   16159    /* The tricky bit is to convert vec0 into a suitable mask, by
   16160       copying the most significant bit of each lane into all positions
   16161       in the lane. */
   16162    IRTemp sh = newTemp(Ity_I8);
   16163    assign(sh, mkU8(8 * gran - 1));
   16164 
   16165    IRTemp vec0Hi = IRTemp_INVALID;
   16166    IRTemp vec0Lo = IRTemp_INVALID;
   16167    breakupV256toV128s( vec0, &vec0Hi, &vec0Lo );
   16168 
   16169    IRTemp mask = newTemp(Ity_V256);
   16170    assign(mask, binop(Iop_V128HLtoV256,
   16171                       binop(opSAR128, mkexpr(vec0Hi), mkexpr(sh)),
   16172                       binop(opSAR128, mkexpr(vec0Lo), mkexpr(sh))));
   16173 
   16174    IRTemp notmask = newTemp(Ity_V256);
   16175    assign(notmask, unop(Iop_NotV256, mkexpr(mask)));
   16176 
   16177    IRTemp res = newTemp(Ity_V256);
   16178    assign(res,  binop(Iop_OrV256,
   16179                       binop(Iop_AndV256, mkexpr(vecE), mkexpr(mask)),
   16180                       binop(Iop_AndV256, mkexpr(vecG), mkexpr(notmask))));
   16181    return res;
   16182 }
   16183 
   16184 static Long dis_VBLENDV_128 ( VexAbiInfo* vbi, Prefix pfx, Long delta,
   16185                               const HChar *name, UInt gran, IROp opSAR )
   16186 {
   16187    IRTemp addr   = IRTemp_INVALID;
   16188    Int    alen   = 0;
   16189    HChar  dis_buf[50];
   16190    UChar  modrm  = getUChar(delta);
   16191    UInt   rG     = gregOfRexRM(pfx, modrm);
   16192    UInt   rV     = getVexNvvvv(pfx);
   16193    UInt   rIS4   = 0xFF; /* invalid */
   16194    IRTemp vecE   = newTemp(Ity_V128);
   16195    IRTemp vecV   = newTemp(Ity_V128);
   16196    IRTemp vecIS4 = newTemp(Ity_V128);
   16197    if (epartIsReg(modrm)) {
   16198       delta++;
   16199       UInt rE = eregOfRexRM(pfx, modrm);
   16200       assign(vecE, getXMMReg(rE));
   16201       UChar ib = getUChar(delta);
   16202       rIS4 = (ib >> 4) & 0xF;
   16203       DIP("%s %s,%s,%s,%s\n",
   16204           name, nameXMMReg(rIS4), nameXMMReg(rE),
   16205           nameXMMReg(rV), nameXMMReg(rG));
   16206    } else {
   16207       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   16208       delta += alen;
   16209       assign(vecE, loadLE(Ity_V128, mkexpr(addr)));
   16210       UChar ib = getUChar(delta);
   16211       rIS4 = (ib >> 4) & 0xF;
   16212       DIP("%s %s,%s,%s,%s\n",
   16213           name, nameXMMReg(rIS4), dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   16214    }
   16215    delta++;
   16216    assign(vecV,   getXMMReg(rV));
   16217    assign(vecIS4, getXMMReg(rIS4));
   16218    IRTemp res = math_PBLENDVB_128( vecE, vecV, vecIS4, gran, opSAR );
   16219    putYMMRegLoAndZU( rG, mkexpr(res) );
   16220    return delta;
   16221 }
   16222 
   16223 static Long dis_VBLENDV_256 ( VexAbiInfo* vbi, Prefix pfx, Long delta,
   16224                               const HChar *name, UInt gran, IROp opSAR128 )
   16225 {
   16226    IRTemp addr   = IRTemp_INVALID;
   16227    Int    alen   = 0;
   16228    HChar  dis_buf[50];
   16229    UChar  modrm  = getUChar(delta);
   16230    UInt   rG     = gregOfRexRM(pfx, modrm);
   16231    UInt   rV     = getVexNvvvv(pfx);
   16232    UInt   rIS4   = 0xFF; /* invalid */
   16233    IRTemp vecE   = newTemp(Ity_V256);
   16234    IRTemp vecV   = newTemp(Ity_V256);
   16235    IRTemp vecIS4 = newTemp(Ity_V256);
   16236    if (epartIsReg(modrm)) {
   16237       delta++;
   16238       UInt rE = eregOfRexRM(pfx, modrm);
   16239       assign(vecE, getYMMReg(rE));
   16240       UChar ib = getUChar(delta);
   16241       rIS4 = (ib >> 4) & 0xF;
   16242       DIP("%s %s,%s,%s,%s\n",
   16243           name, nameYMMReg(rIS4), nameYMMReg(rE),
   16244           nameYMMReg(rV), nameYMMReg(rG));
   16245    } else {
   16246       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   16247       delta += alen;
   16248       assign(vecE, loadLE(Ity_V256, mkexpr(addr)));
   16249       UChar ib = getUChar(delta);
   16250       rIS4 = (ib >> 4) & 0xF;
   16251       DIP("%s %s,%s,%s,%s\n",
   16252           name, nameYMMReg(rIS4), dis_buf, nameYMMReg(rV), nameYMMReg(rG));
   16253    }
   16254    delta++;
   16255    assign(vecV,   getYMMReg(rV));
   16256    assign(vecIS4, getYMMReg(rIS4));
   16257    IRTemp res = math_PBLENDVB_256( vecE, vecV, vecIS4, gran, opSAR128 );
   16258    putYMMReg( rG, mkexpr(res) );
   16259    return delta;
   16260 }
   16261 
/* Shared flag computation for PTEST / VTESTPS / VTESTPD.  Given
   andV  = vecE & vecG   and   andnV = vecE & ~vecG   (both V128),
   write the flags thunk so that Z=1 iff andV is all-zeroes (restricted
   to the selected sign bits) and C=1 iff andnV is all-zeroes, with
   O, S, A and P cleared.  'sign' selects the participating bits:
   0 = all 128 bits (PTEST), 32 = bits 31 and 63 of each 64-bit half
   (VTESTPS), 64 = bit 63 of each half (VTESTPD). */
static void finish_xTESTy ( IRTemp andV, IRTemp andnV, Int sign )
{
   /* Set Z=1 iff (vecE & vecG) == 0
      Set C=1 iff (vecE & not vecG) == 0
   */

   /* andV, andnV:  vecE & vecG,  vecE and not(vecG) */

   /* andV resp. andnV, reduced to 64-bit values, by or-ing the top
      and bottom 64-bits together.  It relies on this trick:

      InterleaveLO64x2([a,b],[c,d]) == [b,d]    hence

      InterleaveLO64x2([a,b],[a,b]) == [b,b]    and similarly
      InterleaveHI64x2([a,b],[a,b]) == [a,a]

      and so the OR of the above 2 exprs produces
      [a OR b, a OR b], from which we simply take the lower half.
   */
   IRTemp and64  = newTemp(Ity_I64);
   IRTemp andn64 = newTemp(Ity_I64);

   assign(and64,
          unop(Iop_V128to64,
               binop(Iop_OrV128,
                     binop(Iop_InterleaveLO64x2,
                           mkexpr(andV), mkexpr(andV)),
                     binop(Iop_InterleaveHI64x2,
                           mkexpr(andV), mkexpr(andV)))));

   assign(andn64,
          unop(Iop_V128to64,
               binop(Iop_OrV128,
                     binop(Iop_InterleaveLO64x2,
                           mkexpr(andnV), mkexpr(andnV)),
                     binop(Iop_InterleaveHI64x2,
                           mkexpr(andnV), mkexpr(andnV)))));

   /* z64/c64 become all-ones when the Z/C condition holds, else
      all-zeroes; the flag bits are sliced out of them below. */
   IRTemp z64 = newTemp(Ity_I64);
   IRTemp c64 = newTemp(Ity_I64);
   if (sign == 64) {
      /* When only interested in the most significant bit, just shift
         arithmetically right and negate.  */
      assign(z64,
             unop(Iop_Not64,
                  binop(Iop_Sar64, mkexpr(and64), mkU8(63))));

      assign(c64,
             unop(Iop_Not64,
                  binop(Iop_Sar64, mkexpr(andn64), mkU8(63))));
   } else {
      if (sign == 32) {
         /* When interested in bit 31 and bit 63, mask those bits and
            fallthrough into the PTEST handling.  */
         IRTemp t0 = newTemp(Ity_I64);
         IRTemp t1 = newTemp(Ity_I64);
         IRTemp t2 = newTemp(Ity_I64);
         assign(t0, mkU64(0x8000000080000000ULL));
         assign(t1, binop(Iop_And64, mkexpr(and64), mkexpr(t0)));
         assign(t2, binop(Iop_And64, mkexpr(andn64), mkexpr(t0)));
         /* Rebind the local IRTemp variables to the masked values; the
            originally-assigned temps are simply abandoned. */
         and64 = t1;
         andn64 = t2;
      }
      /* Now convert and64, andn64 to all-zeroes or all-1s, so we can
         slice out the Z and C bits conveniently.  We use the standard
         trick all-zeroes -> all-zeroes, anything-else -> all-ones
         done by "(x | -x) >>s (word-size - 1)".
      */
      assign(z64,
             unop(Iop_Not64,
                  binop(Iop_Sar64,
                        binop(Iop_Or64,
                              binop(Iop_Sub64, mkU64(0), mkexpr(and64)),
                                    mkexpr(and64)), mkU8(63))));

      assign(c64,
             unop(Iop_Not64,
                  binop(Iop_Sar64,
                        binop(Iop_Or64,
                              binop(Iop_Sub64, mkU64(0), mkexpr(andn64)),
                                    mkexpr(andn64)), mkU8(63))));
   }

   /* And finally, slice out the Z and C flags and set the flags
      thunk to COPY for them.  OSAP are set to zero. */
   IRTemp newOSZACP = newTemp(Ity_I64);
   assign(newOSZACP,
          binop(Iop_Or64,
                binop(Iop_And64, mkexpr(z64), mkU64(AMD64G_CC_MASK_Z)),
                binop(Iop_And64, mkexpr(c64), mkU64(AMD64G_CC_MASK_C))));

   stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(newOSZACP)));
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
}
   16358 
   16359 
/* Handles 128 bit versions of PTEST, VTESTPS or VTESTPD.
   sign is 0 for PTEST insn, 32 for VTESTPS and 64 for VTESTPD.
   isAvx selects the VEX-encoded form: its mnemonic is prefixed with
   'v' and, unlike the SSE form, it does not require 16-byte alignment
   of a memory operand.  Returns the updated instruction offset. */
static Long dis_xTESTy_128 ( VexAbiInfo* vbi, Prefix pfx,
                             Long delta, Bool isAvx, Int sign )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   UChar  modrm  = getUChar(delta);
   UInt   rG     = gregOfRexRM(pfx, modrm);
   IRTemp vecE = newTemp(Ity_V128);
   IRTemp vecG = newTemp(Ity_V128);

   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign(vecE, getXMMReg(rE));
      delta += 1;
      DIP( "%s%stest%s %s,%s\n",
           isAvx ? "v" : "", sign == 0 ? "p" : "",
           sign == 0 ? "" : sign == 32 ? "ps" : "pd",
           nameXMMReg(rE), nameXMMReg(rG) );
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      /* Only the legacy SSE encoding demands an aligned memory
         operand. */
      if (!isAvx)
         gen_SEGV_if_not_16_aligned( addr );
      assign(vecE, loadLE( Ity_V128, mkexpr(addr) ));
      delta += alen;
      DIP( "%s%stest%s %s,%s\n",
           isAvx ? "v" : "", sign == 0 ? "p" : "",
           sign == 0 ? "" : sign == 32 ? "ps" : "pd",
           dis_buf, nameXMMReg(rG) );
   }

   assign(vecG, getXMMReg(rG));

   /* Set Z=1 iff (vecE & vecG) == 0
      Set C=1 iff (vecE & not vecG) == 0
   */

   /* andV, andnV:  vecE & vecG,  vecE and not(vecG) */
   IRTemp andV  = newTemp(Ity_V128);
   IRTemp andnV = newTemp(Ity_V128);
   assign(andV,  binop(Iop_AndV128, mkexpr(vecE), mkexpr(vecG)));
   /* mkV128(0xFFFF) is the all-ones V128 constant (each immediate bit
      expands to one byte), so the Xor computes ~vecG. */
   assign(andnV, binop(Iop_AndV128,
                       mkexpr(vecE),
                       binop(Iop_XorV128, mkexpr(vecG),
                                          mkV128(0xFFFF))));

   /* Flag computation (Z/C set, OSAP cleared) is shared with the
      256-bit form. */
   finish_xTESTy ( andV, andnV, sign );
   return delta;
}
   16411 
   16412 
/* Handles 256 bit versions of PTEST, VTESTPS or VTESTPD.
   sign is 0 for PTEST insn, 32 for VTESTPS and 64 for VTESTPD.
   Reads the E operand (YMM register or 256-bit memory) and the G
   register, computes E&G and E&~G, and hands both to finish_xTESTy
   to set the flags.  Returns the updated instruction offset. */
static Long dis_xTESTy_256 ( VexAbiInfo* vbi, Prefix pfx,
                             Long delta, Int sign )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   UChar  modrm  = getUChar(delta);
   UInt   rG     = gregOfRexRM(pfx, modrm);
   IRTemp vecE   = newTemp(Ity_V256);
   IRTemp vecG   = newTemp(Ity_V256);

   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign(vecE, getYMMReg(rE));
      delta += 1;
      DIP( "v%stest%s %s,%s\n", sign == 0 ? "p" : "",
           sign == 0 ? "" : sign == 32 ? "ps" : "pd",
           nameYMMReg(rE), nameYMMReg(rG) );
   } else {
      /* Note: unlike the 128-bit SSE case, no alignment check is
         generated for the memory operand here. */
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign(vecE, loadLE( Ity_V256, mkexpr(addr) ));
      delta += alen;
      DIP( "v%stest%s %s,%s\n", sign == 0 ? "p" : "",
           sign == 0 ? "" : sign == 32 ? "ps" : "pd",
           dis_buf, nameYMMReg(rG) );
   }

   assign(vecG, getYMMReg(rG));

   /* Set Z=1 iff (vecE & vecG) == 0
      Set C=1 iff (vecE & not vecG) == 0
   */

   /* andV, andnV:  vecE & vecG,  vecE and not(vecG) */
   IRTemp andV  = newTemp(Ity_V256);
   IRTemp andnV = newTemp(Ity_V256);
   assign(andV,  binop(Iop_AndV256, mkexpr(vecE), mkexpr(vecG)));
   assign(andnV, binop(Iop_AndV256,
                       mkexpr(vecE), unop(Iop_NotV256, mkexpr(vecG))));

   IRTemp andVhi  = IRTemp_INVALID;
   IRTemp andVlo  = IRTemp_INVALID;
   IRTemp andnVhi = IRTemp_INVALID;
   IRTemp andnVlo = IRTemp_INVALID;
   breakupV256toV128s( andV, &andVhi, &andVlo );
   breakupV256toV128s( andnV, &andnVhi, &andnVlo );

   /* OR the two 128-bit halves of each value together: a 256-bit
      value is all zeroes iff the OR of its halves is, which reduces
      the problem to the 128-bit case handled by finish_xTESTy. */
   IRTemp andV128  = newTemp(Ity_V128);
   IRTemp andnV128 = newTemp(Ity_V128);
   assign( andV128, binop( Iop_OrV128, mkexpr(andVhi), mkexpr(andVlo) ) );
   assign( andnV128, binop( Iop_OrV128, mkexpr(andnVhi), mkexpr(andnVlo) ) );

   finish_xTESTy ( andV128, andnV128, sign );
   return delta;
}
   16470 
   16471 
   16472 /* Handles 128 bit versions of PMOVZXBW and PMOVSXBW. */
   16473 static Long dis_PMOVxXBW_128 ( VexAbiInfo* vbi, Prefix pfx,
   16474                                Long delta, Bool isAvx, Bool xIsZ )
   16475 {
   16476    IRTemp addr   = IRTemp_INVALID;
   16477    Int    alen   = 0;
   16478    HChar  dis_buf[50];
   16479    IRTemp srcVec = newTemp(Ity_V128);
   16480    UChar  modrm  = getUChar(delta);
   16481    const HChar* mbV    = isAvx ? "v" : "";
   16482    const HChar  how    = xIsZ ? 'z' : 's';
   16483    UInt   rG     = gregOfRexRM(pfx, modrm);
   16484    if ( epartIsReg(modrm) ) {
   16485       UInt rE = eregOfRexRM(pfx, modrm);
   16486       assign( srcVec, getXMMReg(rE) );
   16487       delta += 1;
   16488       DIP( "%spmov%cxbw %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
   16489    } else {
   16490       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   16491       assign( srcVec,
   16492               unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
   16493       delta += alen;
   16494       DIP( "%spmov%cxbw %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
   16495    }
   16496 
   16497    IRExpr* res
   16498       = xIsZ /* do math for either zero or sign extend */
   16499         ? binop( Iop_InterleaveLO8x16,
   16500                  IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) )
   16501         : binop( Iop_SarN16x8,
   16502                  binop( Iop_ShlN16x8,
   16503                         binop( Iop_InterleaveLO8x16,
   16504                                IRExpr_Const( IRConst_V128(0) ),
   16505                                mkexpr(srcVec) ),
   16506                         mkU8(8) ),
   16507                  mkU8(8) );
   16508 
   16509    (isAvx ? putYMMRegLoAndZU : putXMMReg) ( rG, res );
   16510 
   16511    return delta;
   16512 }
   16513 
   16514 
/* Handles 256 bit versions of PMOVZXBW and PMOVSXBW.  xIsZ chooses
   zero extension (VPMOVZXBW) over sign extension (VPMOVSXBW).
   Source is an XMM register or a 128-bit load; destination is a
   YMM register.  Returns the updated instruction offset. */
static Long dis_PMOVxXBW_256 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool xIsZ )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   IRTemp srcVec = newTemp(Ity_V128);
   UChar  modrm  = getUChar(delta);
   UChar  how    = xIsZ ? 'z' : 's';
   UInt   rG     = gregOfRexRM(pfx, modrm);
   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcVec, getXMMReg(rE) );
      delta += 1;
      DIP( "vpmov%cxbw %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) );
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcVec, loadLE( Ity_V128, mkexpr(addr) ) );
      delta += alen;
      DIP( "vpmov%cxbw %s,%s\n", how, dis_buf, nameYMMReg(rG) );
   }

   /* First do zero extend.  HI interleave widens the upper 8 source
      bytes, LO interleave the lower 8; glue them into a V256. */
   IRExpr* res
      = binop( Iop_V128HLtoV256,
               binop( Iop_InterleaveHI8x16,
                      IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ),
               binop( Iop_InterleaveLO8x16,
                      IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ) );
   /* And if needed sign extension as well: shl/sar by 8 replicates
      the sign bit of each source byte across the widened lane. */
   if (!xIsZ)
      res = binop( Iop_SarN16x16,
                   binop( Iop_ShlN16x16, res, mkU8(8) ), mkU8(8) );

   putYMMReg ( rG, res );

   return delta;
}
   16554 
   16555 
   16556 static Long dis_PMOVxXWD_128 ( VexAbiInfo* vbi, Prefix pfx,
   16557                                Long delta, Bool isAvx, Bool xIsZ )
   16558 {
   16559    IRTemp addr   = IRTemp_INVALID;
   16560    Int    alen   = 0;
   16561    HChar  dis_buf[50];
   16562    IRTemp srcVec = newTemp(Ity_V128);
   16563    UChar  modrm  = getUChar(delta);
   16564    const HChar* mbV    = isAvx ? "v" : "";
   16565    const HChar  how    = xIsZ ? 'z' : 's';
   16566    UInt   rG     = gregOfRexRM(pfx, modrm);
   16567 
   16568    if ( epartIsReg(modrm) ) {
   16569       UInt rE = eregOfRexRM(pfx, modrm);
   16570       assign( srcVec, getXMMReg(rE) );
   16571       delta += 1;
   16572       DIP( "%spmov%cxwd %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
   16573    } else {
   16574       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   16575       assign( srcVec,
   16576               unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
   16577       delta += alen;
   16578       DIP( "%spmov%cxwd %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
   16579    }
   16580 
   16581    IRExpr* res
   16582       = binop( Iop_InterleaveLO16x8,
   16583                IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) );
   16584    if (!xIsZ)
   16585       res = binop(Iop_SarN32x4,
   16586                   binop(Iop_ShlN32x4, res, mkU8(16)), mkU8(16));
   16587 
   16588    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   16589       ( gregOfRexRM(pfx, modrm), res );
   16590 
   16591    return delta;
   16592 }
   16593 
   16594 
/* Handles 256 bit versions of PMOVZXWD and PMOVSXWD.  xIsZ chooses
   zero extension (VPMOVZXWD) over sign extension (VPMOVSXWD).
   Source is an XMM register or a 128-bit load; destination is a
   YMM register.  Returns the updated instruction offset. */
static Long dis_PMOVxXWD_256 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool xIsZ )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   IRTemp srcVec = newTemp(Ity_V128);
   UChar  modrm  = getUChar(delta);
   UChar  how    = xIsZ ? 'z' : 's';
   UInt   rG     = gregOfRexRM(pfx, modrm);

   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcVec, getXMMReg(rE) );
      delta += 1;
      DIP( "vpmov%cxwd %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) );
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcVec, loadLE( Ity_V128, mkexpr(addr) ) );
      delta += alen;
      DIP( "vpmov%cxwd %s,%s\n", how, dis_buf, nameYMMReg(rG) );
   }

   /* Zero-widen by interleaving with zero words (HI interleave for
      the upper 4 source words, LO for the lower 4); then, for the
      signed variant, shl/sar by 16 propagates each sign bit. */
   IRExpr* res
      = binop( Iop_V128HLtoV256,
               binop( Iop_InterleaveHI16x8,
                      IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ),
               binop( Iop_InterleaveLO16x8,
                      IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ) );
   if (!xIsZ)
      res = binop(Iop_SarN32x8,
                  binop(Iop_ShlN32x8, res, mkU8(16)), mkU8(16));

   putYMMReg ( rG, res );

   return delta;
}
   16632 
   16633 
   16634 static Long dis_PMOVSXWQ_128 ( VexAbiInfo* vbi, Prefix pfx,
   16635                                Long delta, Bool isAvx )
   16636 {
   16637    IRTemp addr     = IRTemp_INVALID;
   16638    Int    alen     = 0;
   16639    HChar  dis_buf[50];
   16640    IRTemp srcBytes = newTemp(Ity_I32);
   16641    UChar  modrm    = getUChar(delta);
   16642    const HChar* mbV = isAvx ? "v" : "";
   16643    UInt   rG       = gregOfRexRM(pfx, modrm);
   16644 
   16645    if ( epartIsReg( modrm ) ) {
   16646       UInt rE = eregOfRexRM(pfx, modrm);
   16647       assign( srcBytes, getXMMRegLane32( rE, 0 ) );
   16648       delta += 1;
   16649       DIP( "%spmovsxwq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
   16650    } else {
   16651       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   16652       assign( srcBytes, loadLE( Ity_I32, mkexpr(addr) ) );
   16653       delta += alen;
   16654       DIP( "%spmovsxwq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
   16655    }
   16656 
   16657    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   16658       ( rG, binop( Iop_64HLtoV128,
   16659                    unop( Iop_16Sto64,
   16660                          unop( Iop_32HIto16, mkexpr(srcBytes) ) ),
   16661                    unop( Iop_16Sto64,
   16662                          unop( Iop_32to16, mkexpr(srcBytes) ) ) ) );
   16663    return delta;
   16664 }
   16665 
   16666 
/* Handles 256 bit versions of PMOVSXWQ (VPMOVSXWQ): sign extend the
   four lowest 16-bit lanes of the source to four 64-bit lanes of a
   YMM destination.  Source is the low 64 bits of an XMM register,
   or a 64-bit load.  Returns the updated instruction offset. */
static Long dis_PMOVSXWQ_256 ( VexAbiInfo* vbi, Prefix pfx, Long delta )
{
   IRTemp addr     = IRTemp_INVALID;
   Int    alen     = 0;
   HChar  dis_buf[50];
   IRTemp srcBytes = newTemp(Ity_I64);
   UChar  modrm    = getUChar(delta);
   UInt   rG       = gregOfRexRM(pfx, modrm);
   IRTemp s3, s2, s1, s0;
   s3 = s2 = s1 = s0 = IRTemp_INVALID;

   if ( epartIsReg( modrm ) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcBytes, getXMMRegLane64( rE, 0 ) );
      delta += 1;
      DIP( "vpmovsxwq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) );
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcBytes, loadLE( Ity_I64, mkexpr(addr) ) );
      delta += alen;
      DIP( "vpmovsxwq %s,%s\n", dis_buf, nameYMMReg(rG) );
   }

   /* Split the source into 4 words (s3 = most significant) and sign
      extend each into its own 64-bit lane of the destination. */
   breakup64to16s( srcBytes, &s3, &s2, &s1, &s0 );
   putYMMReg( rG, binop( Iop_V128HLtoV256,
                         binop( Iop_64HLtoV128,
                                unop( Iop_16Sto64, mkexpr(s3) ),
                                unop( Iop_16Sto64, mkexpr(s2) ) ),
                         binop( Iop_64HLtoV128,
                                unop( Iop_16Sto64, mkexpr(s1) ),
                                unop( Iop_16Sto64, mkexpr(s0) ) ) ) );
   return delta;
}
   16700 
   16701 
   16702 static Long dis_PMOVZXWQ_128 ( VexAbiInfo* vbi, Prefix pfx,
   16703                                Long delta, Bool isAvx )
   16704 {
   16705    IRTemp addr     = IRTemp_INVALID;
   16706    Int    alen     = 0;
   16707    HChar  dis_buf[50];
   16708    IRTemp srcVec = newTemp(Ity_V128);
   16709    UChar  modrm    = getUChar(delta);
   16710    const HChar* mbV = isAvx ? "v" : "";
   16711    UInt   rG       = gregOfRexRM(pfx, modrm);
   16712 
   16713    if ( epartIsReg( modrm ) ) {
   16714       UInt rE = eregOfRexRM(pfx, modrm);
   16715       assign( srcVec, getXMMReg(rE) );
   16716       delta += 1;
   16717       DIP( "%spmovzxwq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
   16718    } else {
   16719       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   16720       assign( srcVec,
   16721               unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) ) ) );
   16722       delta += alen;
   16723       DIP( "%spmovzxwq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
   16724    }
   16725 
   16726    IRTemp zeroVec = newTemp( Ity_V128 );
   16727    assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );
   16728 
   16729    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   16730       ( rG, binop( Iop_InterleaveLO16x8,
   16731                    mkexpr(zeroVec),
   16732                    binop( Iop_InterleaveLO16x8,
   16733                           mkexpr(zeroVec), mkexpr(srcVec) ) ) );
   16734    return delta;
   16735 }
   16736 
   16737 
/* Handles 256 bit versions of PMOVZXWQ (VPMOVZXWQ): zero extend the
   four lowest 16-bit lanes of the source to four 64-bit lanes of a
   YMM destination.  Source is the low 64 bits of an XMM register,
   or a 64-bit load.  Returns the updated instruction offset. */
static Long dis_PMOVZXWQ_256 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta )
{
   IRTemp addr     = IRTemp_INVALID;
   Int    alen     = 0;
   HChar  dis_buf[50];
   IRTemp srcVec = newTemp(Ity_V128);
   UChar  modrm    = getUChar(delta);
   UInt   rG       = gregOfRexRM(pfx, modrm);

   if ( epartIsReg( modrm ) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcVec, getXMMReg(rE) );
      delta += 1;
      DIP( "vpmovzxwq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) );
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcVec,
              unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
      delta += alen;
      DIP( "vpmovzxwq %s,%s\n", dis_buf, nameYMMReg(rG) );
   }

   IRTemp zeroVec = newTemp( Ity_V128 );
   assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );

   /* Widen 16 -> 32 -> 64 by interleaving with zero words twice;
      the outer HI interleave supplies the upper two result lanes,
      the outer LO interleave the lower two. */
   putYMMReg( rG, binop( Iop_V128HLtoV256,
                         binop( Iop_InterleaveHI16x8,
                                mkexpr(zeroVec),
                                binop( Iop_InterleaveLO16x8,
                                       mkexpr(zeroVec), mkexpr(srcVec) ) ),
                         binop( Iop_InterleaveLO16x8,
                                mkexpr(zeroVec),
                                binop( Iop_InterleaveLO16x8,
                                       mkexpr(zeroVec), mkexpr(srcVec) ) ) ) );
   return delta;
}
   16775 
   16776 
/* Handles 128 bit versions of PMOVZXDQ and PMOVSXDQ.  xIsZ chooses
   zero extension (PMOVZXDQ) over sign extension (PMOVSXDQ); isAvx
   chooses the VEX-encoded form, which also zeroes the upper YMM
   lane of the destination.  Returns the updated instruction
   offset. */
static Long dis_PMOVxXDQ_128 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx, Bool xIsZ )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   IRTemp srcI64 = newTemp(Ity_I64);
   IRTemp srcVec = newTemp(Ity_V128);
   UChar  modrm  = getUChar(delta);
   const HChar* mbV = isAvx ? "v" : "";
   const HChar  how = xIsZ ? 'z' : 's';
   UInt   rG     = gregOfRexRM(pfx, modrm);
   /* Compute both srcI64 -- the value to expand -- and srcVec -- same
      thing in a V128, with arbitrary junk in the top 64 bits.  Use
      one or both of them and let iropt clean up afterwards (as
      usual). */
   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcVec, getXMMReg(rE) );
      assign( srcI64, unop(Iop_V128to64, mkexpr(srcVec)) );
      delta += 1;
      DIP( "%spmov%cxdq %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcI64, loadLE(Ity_I64, mkexpr(addr)) );
      assign( srcVec, unop( Iop_64UtoV128, mkexpr(srcI64)) );
      delta += alen;
      DIP( "%spmov%cxdq %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
   }

   /* Zero extension uses the vector form (interleave with zero
      32-bit lanes); sign extension works on the scalar halves of
      srcI64 instead, via 32Sto64 on each half. */
   IRExpr* res
      = xIsZ /* do math for either zero or sign extend */
        ? binop( Iop_InterleaveLO32x4,
                 IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) )
        : binop( Iop_64HLtoV128,
                 unop( Iop_32Sto64,
                       unop( Iop_64HIto32, mkexpr(srcI64) ) ),
                 unop( Iop_32Sto64,
                       unop( Iop_64to32, mkexpr(srcI64) ) ) );

   (isAvx ? putYMMRegLoAndZU : putXMMReg) ( rG, res );

   return delta;
}
   16822 
   16823 
/* Handles 256 bit versions of PMOVZXDQ and PMOVSXDQ.  xIsZ chooses
   zero extension (VPMOVZXDQ) over sign extension (VPMOVSXDQ).
   Source is an XMM register or a 128-bit load; destination is a
   YMM register.  Returns the updated instruction offset. */
static Long dis_PMOVxXDQ_256 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool xIsZ )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   IRTemp srcVec = newTemp(Ity_V128);
   UChar  modrm  = getUChar(delta);
   UChar  how    = xIsZ ? 'z' : 's';
   UInt   rG     = gregOfRexRM(pfx, modrm);
   /* Unlike the 128-bit case, only srcVec (the full 128-bit source)
      is needed here; all four 32-bit lanes are consumed. */
   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcVec, getXMMReg(rE) );
      delta += 1;
      DIP( "vpmov%cxdq %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) );
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcVec, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      DIP( "vpmov%cxdq %s,%s\n", how, dis_buf, nameYMMReg(rG) );
   }

   /* Zero extension: interleave with zero 32-bit lanes (HI for the
      upper two source lanes, LO for the lower two).  Sign extension:
      split into 32-bit pieces and 32Sto64 each one. */
   IRExpr* res;
   if (xIsZ)
      res = binop( Iop_V128HLtoV256,
                   binop( Iop_InterleaveHI32x4,
                          IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ),
                   binop( Iop_InterleaveLO32x4,
                          IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ) );
   else {
      IRTemp s3, s2, s1, s0;
      s3 = s2 = s1 = s0 = IRTemp_INVALID;
      breakupV128to32s( srcVec, &s3, &s2, &s1, &s0 );
      res = binop( Iop_V128HLtoV256,
                   binop( Iop_64HLtoV128,
                          unop( Iop_32Sto64, mkexpr(s3) ),
                          unop( Iop_32Sto64, mkexpr(s2) ) ),
                   binop( Iop_64HLtoV128,
                          unop( Iop_32Sto64, mkexpr(s1) ),
                          unop( Iop_32Sto64, mkexpr(s0) ) ) );
   }

   putYMMReg ( rG, res );

   return delta;
}
   16875 
   16876 
/* Handles 128 bit versions of PMOVZXBD and PMOVSXBD.  xIsZ chooses
   zero extension (PMOVZXBD) over sign extension (PMOVSXBD); isAvx
   chooses the VEX-encoded form, which also zeroes the upper YMM
   lane of the destination.  Source is the low 32 bits of an XMM
   register, or a 32-bit load. */
static Long dis_PMOVxXBD_128 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx, Bool xIsZ )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   IRTemp srcVec = newTemp(Ity_V128);
   UChar  modrm  = getUChar(delta);
   const HChar* mbV = isAvx ? "v" : "";
   const HChar  how = xIsZ ? 'z' : 's';
   UInt   rG     = gregOfRexRM(pfx, modrm);
   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcVec, getXMMReg(rE) );
      delta += 1;
      DIP( "%spmov%cxbd %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcVec,
              unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) ) ) );
      delta += alen;
      DIP( "%spmov%cxbd %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
   }

   IRTemp zeroVec = newTemp(Ity_V128);
   assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );

   /* Widen 8 -> 16 -> 32 by interleaving with zero bytes twice;
      for the signed variant, shl/sar by 24 then replicates each
      source byte's sign bit across its 32-bit lane. */
   IRExpr* res
      = binop(Iop_InterleaveLO8x16,
              mkexpr(zeroVec),
              binop(Iop_InterleaveLO8x16,
                    mkexpr(zeroVec), mkexpr(srcVec)));
   if (!xIsZ)
      res = binop(Iop_SarN32x4,
                  binop(Iop_ShlN32x4, res, mkU8(24)), mkU8(24));

   (isAvx ? putYMMRegLoAndZU : putXMMReg) ( rG, res );

   return delta;
}
   16918 
   16919 
/* Handles 256 bit versions of PMOVZXBD and PMOVSXBD.  xIsZ chooses
   zero extension (VPMOVZXBD) over sign extension (VPMOVSXBD).
   Source is the low 64 bits of an XMM register, or a 64-bit load;
   destination is a YMM register. */
static Long dis_PMOVxXBD_256 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool xIsZ )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   IRTemp srcVec = newTemp(Ity_V128);
   UChar  modrm  = getUChar(delta);
   UChar  how    = xIsZ ? 'z' : 's';
   UInt   rG     = gregOfRexRM(pfx, modrm);
   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcVec, getXMMReg(rE) );
      delta += 1;
      DIP( "vpmov%cxbd %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) );
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcVec,
              unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
      delta += alen;
      DIP( "vpmov%cxbd %s,%s\n", how, dis_buf, nameYMMReg(rG) );
   }

   IRTemp zeroVec = newTemp(Ity_V128);
   assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );

   /* Widen 8 -> 16 -> 32 by interleaving with zero bytes; the
      outer HI interleave supplies the upper four result lanes, the
      outer LO interleave the lower four.  For the signed variant,
      shl/sar by 24 then propagates each source byte's sign bit. */
   IRExpr* res
      = binop( Iop_V128HLtoV256,
               binop(Iop_InterleaveHI8x16,
                     mkexpr(zeroVec),
                     binop(Iop_InterleaveLO8x16,
                           mkexpr(zeroVec), mkexpr(srcVec)) ),
               binop(Iop_InterleaveLO8x16,
                     mkexpr(zeroVec),
                     binop(Iop_InterleaveLO8x16,
                           mkexpr(zeroVec), mkexpr(srcVec)) ) );
   if (!xIsZ)
      res = binop(Iop_SarN32x8,
                  binop(Iop_ShlN32x8, res, mkU8(24)), mkU8(24));

   putYMMReg ( rG, res );

   return delta;
}
   16965 
   16966 
   16967 /* Handles 128 bit versions of PMOVSXBQ. */
   16968 static Long dis_PMOVSXBQ_128 ( VexAbiInfo* vbi, Prefix pfx,
   16969                                Long delta, Bool isAvx )
   16970 {
   16971    IRTemp addr     = IRTemp_INVALID;
   16972    Int    alen     = 0;
   16973    HChar  dis_buf[50];
   16974    IRTemp srcBytes = newTemp(Ity_I16);
   16975    UChar  modrm    = getUChar(delta);
   16976    const HChar* mbV = isAvx ? "v" : "";
   16977    UInt   rG       = gregOfRexRM(pfx, modrm);
   16978    if ( epartIsReg(modrm) ) {
   16979       UInt rE = eregOfRexRM(pfx, modrm);
   16980       assign( srcBytes, getXMMRegLane16( rE, 0 ) );
   16981       delta += 1;
   16982       DIP( "%spmovsxbq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
   16983    } else {
   16984       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   16985       assign( srcBytes, loadLE( Ity_I16, mkexpr(addr) ) );
   16986       delta += alen;
   16987       DIP( "%spmovsxbq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
   16988    }
   16989 
   16990    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   16991       ( rG, binop( Iop_64HLtoV128,
   16992                    unop( Iop_8Sto64,
   16993                          unop( Iop_16HIto8, mkexpr(srcBytes) ) ),
   16994                    unop( Iop_8Sto64,
   16995                          unop( Iop_16to8, mkexpr(srcBytes) ) ) ) );
   16996    return delta;
   16997 }
   16998 
   16999 
/* Handles 256 bit versions of PMOVSXBQ (VPMOVSXBQ): sign extend the
   four lowest bytes of the source to four 64-bit lanes of a YMM
   destination.  Source is the low 32 bits of an XMM register, or a
   32-bit load. */
static Long dis_PMOVSXBQ_256 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta )
{
   IRTemp addr     = IRTemp_INVALID;
   Int    alen     = 0;
   HChar  dis_buf[50];
   IRTemp srcBytes = newTemp(Ity_I32);
   UChar  modrm    = getUChar(delta);
   UInt   rG       = gregOfRexRM(pfx, modrm);
   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcBytes, getXMMRegLane32( rE, 0 ) );
      delta += 1;
      DIP( "vpmovsxbq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) );
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcBytes, loadLE( Ity_I32, mkexpr(addr) ) );
      delta += alen;
      DIP( "vpmovsxbq %s,%s\n", dis_buf, nameYMMReg(rG) );
   }

   /* Extract each of the four source bytes via 32HIto16/32to16 then
      16HIto8/16to8, and sign extend it into its own 64-bit lane
      (byte 3 ends up in the most significant lane). */
   putYMMReg
      ( rG, binop( Iop_V128HLtoV256,
                   binop( Iop_64HLtoV128,
                          unop( Iop_8Sto64,
                                unop( Iop_16HIto8,
                                      unop( Iop_32HIto16,
                                            mkexpr(srcBytes) ) ) ),
                          unop( Iop_8Sto64,
                                unop( Iop_16to8,
                                      unop( Iop_32HIto16,
                                            mkexpr(srcBytes) ) ) ) ),
                   binop( Iop_64HLtoV128,
                          unop( Iop_8Sto64,
                                unop( Iop_16HIto8,
                                      unop( Iop_32to16,
                                            mkexpr(srcBytes) ) ) ),
                          unop( Iop_8Sto64,
                                unop( Iop_16to8,
                                      unop( Iop_32to16,
                                            mkexpr(srcBytes) ) ) ) ) ) );
   return delta;
}
   17044 
   17045 
/* Handles 128 bit versions of PMOVZXBQ: zero extend the two lowest
   bytes of the source to two 64-bit destination lanes.  Source is
   the low 16 bits of an XMM register, or a 16-bit load. */
static Long dis_PMOVZXBQ_128 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx )
{
   IRTemp addr     = IRTemp_INVALID;
   Int    alen     = 0;
   HChar  dis_buf[50];
   IRTemp srcVec   = newTemp(Ity_V128);
   UChar  modrm    = getUChar(delta);
   const HChar* mbV = isAvx ? "v" : "";
   UInt   rG       = gregOfRexRM(pfx, modrm);
   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcVec, getXMMReg(rE) );
      delta += 1;
      DIP( "%spmovzxbq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcVec,
              unop( Iop_32UtoV128,
                    unop( Iop_16Uto32, loadLE( Ity_I16, mkexpr(addr) ))));
      delta += alen;
      DIP( "%spmovzxbq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
   }

   IRTemp zeroVec = newTemp(Ity_V128);
   assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );

   /* Widen 8 -> 16 -> 32 -> 64 by interleaving with zero bytes
      three times. */
   (isAvx ? putYMMRegLoAndZU : putXMMReg)
      ( rG, binop( Iop_InterleaveLO8x16,
                   mkexpr(zeroVec),
                   binop( Iop_InterleaveLO8x16,
                          mkexpr(zeroVec),
                          binop( Iop_InterleaveLO8x16,
                                 mkexpr(zeroVec), mkexpr(srcVec) ) ) ) );
   return delta;
}
   17083 
   17084 
/* Handles 256 bit versions of PMOVZXBQ (VPMOVZXBQ): zero extend the
   four lowest bytes of the source to four 64-bit lanes of a YMM
   destination.  Source is the low 32 bits of an XMM register, or a
   32-bit load. */
static Long dis_PMOVZXBQ_256 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta )
{
   IRTemp addr     = IRTemp_INVALID;
   Int    alen     = 0;
   HChar  dis_buf[50];
   IRTemp srcVec   = newTemp(Ity_V128);
   UChar  modrm    = getUChar(delta);
   UInt   rG       = gregOfRexRM(pfx, modrm);
   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcVec, getXMMReg(rE) );
      delta += 1;
      DIP( "vpmovzxbq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) );
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcVec,
              unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) )));
      delta += alen;
      DIP( "vpmovzxbq %s,%s\n", dis_buf, nameYMMReg(rG) );
   }

   IRTemp zeroVec = newTemp(Ity_V128);
   assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );

   /* Widen 8 -> 16 -> 32 -> 64 by interleaving with zero bytes
      three times; the outermost HI interleave supplies the upper
      two result lanes, the outermost LO interleave the lower two. */
   putYMMReg
      ( rG, binop( Iop_V128HLtoV256,
                   binop( Iop_InterleaveHI8x16,
                          mkexpr(zeroVec),
                          binop( Iop_InterleaveLO8x16,
                                 mkexpr(zeroVec),
                                 binop( Iop_InterleaveLO8x16,
                                        mkexpr(zeroVec), mkexpr(srcVec) ) ) ),
                   binop( Iop_InterleaveLO8x16,
                          mkexpr(zeroVec),
                          binop( Iop_InterleaveLO8x16,
                                 mkexpr(zeroVec),
                                 binop( Iop_InterleaveLO8x16,
                                        mkexpr(zeroVec), mkexpr(srcVec) ) ) )
                 ) );
   return delta;
}
   17128 
   17129 
   17130 static Long dis_PHMINPOSUW_128 ( VexAbiInfo* vbi, Prefix pfx,
   17131                                  Long delta, Bool isAvx )
   17132 {
   17133    IRTemp addr   = IRTemp_INVALID;
   17134    Int    alen   = 0;
   17135    HChar  dis_buf[50];
   17136    UChar  modrm  = getUChar(delta);
   17137    const HChar* mbV = isAvx ? "v" : "";
   17138    IRTemp sV     = newTemp(Ity_V128);
   17139    IRTemp sHi    = newTemp(Ity_I64);
   17140    IRTemp sLo    = newTemp(Ity_I64);
   17141    IRTemp dLo    = newTemp(Ity_I64);
   17142    UInt   rG     = gregOfRexRM(pfx,modrm);
   17143    if (epartIsReg(modrm)) {
   17144       UInt rE = eregOfRexRM(pfx,modrm);
   17145       assign( sV, getXMMReg(rE) );
   17146       delta += 1;
   17147       DIP("%sphminposuw %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG));
   17148    } else {
   17149       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   17150       if (!isAvx)
   17151          gen_SEGV_if_not_16_aligned(addr);
   17152       assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   17153       delta += alen;
   17154       DIP("%sphminposuw %s,%s\n", mbV, dis_buf, nameXMMReg(rG));
   17155    }
   17156    assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   17157    assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   17158    assign( dLo, mkIRExprCCall(
   17159                    Ity_I64, 0/*regparms*/,
   17160                    "amd64g_calculate_sse_phminposuw",
   17161                    &amd64g_calculate_sse_phminposuw,
   17162                    mkIRExprVec_2( mkexpr(sLo), mkexpr(sHi) )
   17163          ));
   17164    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   17165       (rG, unop(Iop_64UtoV128, mkexpr(dLo)));
   17166    return delta;
   17167 }
   17168 
   17169 
   17170 static Long dis_AESx ( VexAbiInfo* vbi, Prefix pfx,
   17171                        Long delta, Bool isAvx, UChar opc )
   17172 {
   17173    IRTemp addr   = IRTemp_INVALID;
   17174    Int    alen   = 0;
   17175    HChar  dis_buf[50];
   17176    UChar  modrm  = getUChar(delta);
   17177    UInt   rG     = gregOfRexRM(pfx, modrm);
   17178    UInt   regNoL = 0;
   17179    UInt   regNoR = (isAvx && opc != 0xDB) ? getVexNvvvv(pfx) : rG;
   17180 
   17181    /* This is a nasty kludge.  We need to pass 2 x V128 to the
   17182       helper.  Since we can't do that, use a dirty
   17183       helper to compute the results directly from the XMM regs in
   17184       the guest state.  That means for the memory case, we need to
   17185       move the left operand into a pseudo-register (XMM16, let's
   17186       call it). */
   17187    if (epartIsReg(modrm)) {
   17188       regNoL = eregOfRexRM(pfx, modrm);
   17189       delta += 1;
   17190    } else {
   17191       regNoL = 16; /* use XMM16 as an intermediary */
   17192       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   17193       /* alignment check needed ???? */
   17194       stmt( IRStmt_Put( OFFB_YMM16, loadLE(Ity_V128, mkexpr(addr)) ));
   17195       delta += alen;
   17196    }
   17197 
   17198    void*  fn = &amd64g_dirtyhelper_AES;
   17199    const HChar* nm = "amd64g_dirtyhelper_AES";
   17200 
   17201    /* Round up the arguments.  Note that this is a kludge -- the
   17202       use of mkU64 rather than mkIRExpr_HWord implies the
   17203       assumption that the host's word size is 64-bit. */
   17204    UInt gstOffD = ymmGuestRegOffset(rG);
   17205    UInt gstOffL = regNoL == 16 ? OFFB_YMM16 : ymmGuestRegOffset(regNoL);
   17206    UInt gstOffR = ymmGuestRegOffset(regNoR);
   17207    IRExpr*  opc4         = mkU64(opc);
   17208    IRExpr*  gstOffDe     = mkU64(gstOffD);
   17209    IRExpr*  gstOffLe     = mkU64(gstOffL);
   17210    IRExpr*  gstOffRe     = mkU64(gstOffR);
   17211    IRExpr** args
   17212       = mkIRExprVec_5( IRExpr_BBPTR(), opc4, gstOffDe, gstOffLe, gstOffRe );
   17213 
   17214    IRDirty* d    = unsafeIRDirty_0_N( 0/*regparms*/, nm, fn, args );
   17215    /* It's not really a dirty call, but we can't use the clean helper
   17216       mechanism here for the very lame reason that we can't pass 2 x
   17217       V128s by value to a helper.  Hence this roundabout scheme. */
   17218    d->nFxState = 2;
   17219    vex_bzero(&d->fxState, sizeof(d->fxState));
   17220    /* AES{ENC,ENCLAST,DEC,DECLAST} read both registers, and writes
   17221       the second for !isAvx or the third for isAvx.
   17222       AESIMC (0xDB) reads the first register, and writes the second. */
   17223    d->fxState[0].fx     = Ifx_Read;
   17224    d->fxState[0].offset = gstOffL;
   17225    d->fxState[0].size   = sizeof(U128);
   17226    d->fxState[1].offset = gstOffR;
   17227    d->fxState[1].size   = sizeof(U128);
   17228    if (opc == 0xDB)
   17229       d->fxState[1].fx   = Ifx_Write;
   17230    else if (!isAvx || rG == regNoR)
   17231       d->fxState[1].fx   = Ifx_Modify;
   17232    else {
   17233       d->fxState[1].fx     = Ifx_Read;
   17234       d->nFxState++;
   17235       d->fxState[2].fx     = Ifx_Write;
   17236       d->fxState[2].offset = gstOffD;
   17237       d->fxState[2].size   = sizeof(U128);
   17238    }
   17239 
   17240    stmt( IRStmt_Dirty(d) );
   17241    {
   17242       const HChar* opsuf;
   17243       switch (opc) {
   17244          case 0xDC: opsuf = "enc"; break;
   17245          case 0XDD: opsuf = "enclast"; break;
   17246          case 0xDE: opsuf = "dec"; break;
   17247          case 0xDF: opsuf = "declast"; break;
   17248          case 0xDB: opsuf = "imc"; break;
   17249          default: vassert(0);
   17250       }
   17251       DIP("%saes%s %s,%s%s%s\n", isAvx ? "v" : "", opsuf,
   17252           (regNoL == 16 ? dis_buf : nameXMMReg(regNoL)),
   17253           nameXMMReg(regNoR),
   17254           (isAvx && opc != 0xDB) ? "," : "",
   17255           (isAvx && opc != 0xDB) ? nameXMMReg(rG) : "");
   17256    }
   17257    if (isAvx)
   17258       putYMMRegLane128( rG, 1, mkV128(0) );
   17259    return delta;
   17260 }
   17261 
   17262 static Long dis_AESKEYGENASSIST ( VexAbiInfo* vbi, Prefix pfx,
   17263                                   Long delta, Bool isAvx )
   17264 {
   17265    IRTemp addr   = IRTemp_INVALID;
   17266    Int    alen   = 0;
   17267    HChar  dis_buf[50];
   17268    UChar  modrm  = getUChar(delta);
   17269    UInt   regNoL = 0;
   17270    UInt   regNoR = gregOfRexRM(pfx, modrm);
   17271    UChar  imm    = 0;
   17272 
   17273    /* This is a nasty kludge.  See AESENC et al. instructions. */
   17274    modrm = getUChar(delta);
   17275    if (epartIsReg(modrm)) {
   17276       regNoL = eregOfRexRM(pfx, modrm);
   17277       imm = getUChar(delta+1);
   17278       delta += 1+1;
   17279    } else {
   17280       regNoL = 16; /* use XMM16 as an intermediary */
   17281       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   17282       /* alignment check ???? . */
   17283       stmt( IRStmt_Put( OFFB_YMM16, loadLE(Ity_V128, mkexpr(addr)) ));
   17284       imm = getUChar(delta+alen);
   17285       delta += alen+1;
   17286    }
   17287 
   17288    /* Who ya gonna call?  Presumably not Ghostbusters. */
   17289    void*  fn = &amd64g_dirtyhelper_AESKEYGENASSIST;
   17290    const HChar* nm = "amd64g_dirtyhelper_AESKEYGENASSIST";
   17291 
   17292    /* Round up the arguments.  Note that this is a kludge -- the
   17293       use of mkU64 rather than mkIRExpr_HWord implies the
   17294       assumption that the host's word size is 64-bit. */
   17295    UInt gstOffL = regNoL == 16 ? OFFB_YMM16 : ymmGuestRegOffset(regNoL);
   17296    UInt gstOffR = ymmGuestRegOffset(regNoR);
   17297 
   17298    IRExpr*  imme          = mkU64(imm & 0xFF);
   17299    IRExpr*  gstOffLe     = mkU64(gstOffL);
   17300    IRExpr*  gstOffRe     = mkU64(gstOffR);
   17301    IRExpr** args
   17302       = mkIRExprVec_4( IRExpr_BBPTR(), imme, gstOffLe, gstOffRe );
   17303 
   17304    IRDirty* d    = unsafeIRDirty_0_N( 0/*regparms*/, nm, fn, args );
   17305    /* It's not really a dirty call, but we can't use the clean helper
   17306       mechanism here for the very lame reason that we can't pass 2 x
   17307       V128s by value to a helper.  Hence this roundabout scheme. */
   17308    d->nFxState = 2;
   17309    vex_bzero(&d->fxState, sizeof(d->fxState));
   17310    d->fxState[0].fx     = Ifx_Read;
   17311    d->fxState[0].offset = gstOffL;
   17312    d->fxState[0].size   = sizeof(U128);
   17313    d->fxState[1].fx     = Ifx_Write;
   17314    d->fxState[1].offset = gstOffR;
   17315    d->fxState[1].size   = sizeof(U128);
   17316    stmt( IRStmt_Dirty(d) );
   17317 
   17318    DIP("%saeskeygenassist $%x,%s,%s\n", isAvx ? "v" : "", (UInt)imm,
   17319        (regNoL == 16 ? dis_buf : nameXMMReg(regNoL)),
   17320        nameXMMReg(regNoR));
   17321    if (isAvx)
   17322       putYMMRegLane128( regNoR, 1, mkV128(0) );
   17323    return delta;
   17324 }
   17325 
   17326 
/* Top-level decoder for the 66/F2 0F 38 escape space: the SSE4.1 /
   SSE4.2 integer instructions, plus AESNI and CRC32.  On a successful
   decode, *decode_OK is set to True and the delta just past the
   instruction is returned; otherwise *decode_OK is False and deltaIN
   is returned unchanged so the caller can try other decoders. */
__attribute__((noinline))
static
Long dis_ESC_0F38__SSE4 ( Bool* decode_OK,
                          VexAbiInfo* vbi,
                          Prefix pfx, Int sz, Long deltaIN )
{
   IRTemp addr  = IRTemp_INVALID;
   UChar  modrm = 0;
   Int    alen  = 0;
   HChar  dis_buf[50];

   /* Pessimistically assume failure; decode_success flips this. */
   *decode_OK = False;

   Long   delta = deltaIN;
   UChar  opc   = getUChar(delta);
   delta++;
   switch (opc) {

   case 0x10:
   case 0x14:
   case 0x15:
      /* 66 0F 38 10 /r = PBLENDVB xmm1, xmm2/m128  (byte gran)
         66 0F 38 14 /r = BLENDVPS xmm1, xmm2/m128  (float gran)
         66 0F 38 15 /r = BLENDVPD xmm1, xmm2/m128  (double gran)
         Blend at various granularities, with XMM0 (implicit operand)
         providing the controlling mask.
      */
      if (have66noF2noF3(pfx) && sz == 2) {
         modrm = getUChar(delta);

         const HChar* nm    = NULL;
         UInt   gran  = 0;
         IROp   opSAR = Iop_INVALID;
         switch (opc) {
            case 0x10:
               nm = "pblendvb"; gran = 1; opSAR = Iop_SarN8x16;
               break;
            case 0x14:
               nm = "blendvps"; gran = 4; opSAR = Iop_SarN32x4;
               break;
            case 0x15:
               nm = "blendvpd"; gran = 8; opSAR = Iop_SarN64x2;
               break;
         }
         vassert(nm);

         IRTemp vecE = newTemp(Ity_V128);
         IRTemp vecG = newTemp(Ity_V128);
         IRTemp vec0 = newTemp(Ity_V128);

         if ( epartIsReg(modrm) ) {
            assign(vecE, getXMMReg(eregOfRexRM(pfx, modrm)));
            delta += 1;
            DIP( "%s %s,%s\n", nm,
                 nameXMMReg( eregOfRexRM(pfx, modrm) ),
                 nameXMMReg( gregOfRexRM(pfx, modrm) ) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            assign(vecE, loadLE( Ity_V128, mkexpr(addr) ));
            delta += alen;
            DIP( "%s %s,%s\n", nm,
                 dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
         }

         assign(vecG, getXMMReg(gregOfRexRM(pfx, modrm)));
         assign(vec0, getXMMReg(0));

         IRTemp res = math_PBLENDVB_128( vecE, vecG, vec0, gran, opSAR );
         putXMMReg(gregOfRexRM(pfx, modrm), mkexpr(res));

         goto decode_success;
      }
      break;

   case 0x17:
      /* 66 0F 38 17 /r = PTEST xmm1, xmm2/m128
         Logical compare (set ZF and CF from AND/ANDN of the operands) */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         delta = dis_xTESTy_128( vbi, pfx, delta, False/*!isAvx*/, 0 );
         goto decode_success;
      }
      break;

   case 0x20:
      /* 66 0F 38 20 /r = PMOVSXBW xmm1, xmm2/m64
         Packed Move with Sign Extend from Byte to Word (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PMOVxXBW_128( vbi, pfx, delta,
                                   False/*!isAvx*/, False/*!xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x21:
      /* 66 0F 38 21 /r = PMOVSXBD xmm1, xmm2/m32
         Packed Move with Sign Extend from Byte to DWord (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PMOVxXBD_128( vbi, pfx, delta,
                                   False/*!isAvx*/, False/*!xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x22:
      /* 66 0F 38 22 /r = PMOVSXBQ xmm1, xmm2/m16
         Packed Move with Sign Extend from Byte to QWord (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PMOVSXBQ_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;

   case 0x23:
      /* 66 0F 38 23 /r = PMOVSXWD xmm1, xmm2/m64
         Packed Move with Sign Extend from Word to DWord (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PMOVxXWD_128(vbi, pfx, delta,
                                  False/*!isAvx*/, False/*!xIsZ*/);
         goto decode_success;
      }
      break;

   case 0x24:
      /* 66 0F 38 24 /r = PMOVSXWQ xmm1, xmm2/m32
         Packed Move with Sign Extend from Word to QWord (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PMOVSXWQ_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;

   case 0x25:
      /* 66 0F 38 25 /r = PMOVSXDQ xmm1, xmm2/m64
         Packed Move with Sign Extend from Double Word to Quad Word (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
                                   False/*!isAvx*/, False/*!xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x28:
      /* 66 0F 38 28 = PMULDQ -- signed widening multiply of 32-lanes
         0 x 0 to form lower 64-bit half and lanes 2 x 2 to form upper
         64-bit half */
      /* This is a really poor translation -- could be improved if
         performance critical.  It's a copy-paste of PMULUDQ, too. */
      if (have66noF2noF3(pfx) && sz == 2) {
         IRTemp sV = newTemp(Ity_V128);
         IRTemp dV = newTemp(Ity_V128);
         modrm = getUChar(delta);
         UInt rG = gregOfRexRM(pfx,modrm);
         assign( dV, getXMMReg(rG) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( sV, getXMMReg(rE) );
            delta += 1;
            DIP("pmuldq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            delta += alen;
            DIP("pmuldq %s,%s\n", dis_buf, nameXMMReg(rG));
         }

         putXMMReg( rG, mkexpr(math_PMULDQ_128( dV, sV )) );
         goto decode_success;
      }
      break;

   case 0x29:
      /* 66 0F 38 29 = PCMPEQQ
         64x2 equality comparison */
      if (have66noF2noF3(pfx) && sz == 2) {
         /* FIXME: this needs an alignment check */
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpeqq", Iop_CmpEQ64x2, False );
         goto decode_success;
      }
      break;

   case 0x2A:
      /* 66 0F 38 2A = MOVNTDQA
         "non-temporal" "streaming" load
         Handle like MOVDQA but only memory operand is allowed */
      if (have66noF2noF3(pfx) && sz == 2) {
         modrm = getUChar(delta);
         /* Register operand is undefined for MOVNTDQA; fall through
            to decode failure in that case. */
         if (!epartIsReg(modrm)) {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            putXMMReg( gregOfRexRM(pfx,modrm),
                       loadLE(Ity_V128, mkexpr(addr)) );
            DIP("movntdqa %s,%s\n", dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += alen;
            goto decode_success;
         }
      }
      break;

   case 0x2B:
      /* 66 0f 38 2B /r = PACKUSDW xmm1, xmm2/m128
         2x 32x4 S->U saturating narrow from xmm2/m128 to xmm1 */
      if (have66noF2noF3(pfx) && sz == 2) {

         modrm = getUChar(delta);

         IRTemp argL = newTemp(Ity_V128);
         IRTemp argR = newTemp(Ity_V128);

         if ( epartIsReg(modrm) ) {
            assign( argL, getXMMReg( eregOfRexRM(pfx, modrm) ) );
            delta += 1;
            DIP( "packusdw %s,%s\n",
                 nameXMMReg( eregOfRexRM(pfx, modrm) ),
                 nameXMMReg( gregOfRexRM(pfx, modrm) ) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            assign( argL, loadLE( Ity_V128, mkexpr(addr) ));
            delta += alen;
            DIP( "packusdw %s,%s\n",
                 dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
         }

         assign(argR, getXMMReg( gregOfRexRM(pfx, modrm) ));

         putXMMReg( gregOfRexRM(pfx, modrm),
                    binop( Iop_QNarrowBin32Sto16Ux8,
                           mkexpr(argL), mkexpr(argR)) );

         goto decode_success;
      }
      break;

   case 0x30:
      /* 66 0F 38 30 /r = PMOVZXBW xmm1, xmm2/m64
         Packed Move with Zero Extend from Byte to Word (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PMOVxXBW_128( vbi, pfx, delta,
                                   False/*!isAvx*/, True/*xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x31:
      /* 66 0F 38 31 /r = PMOVZXBD xmm1, xmm2/m32
         Packed Move with Zero Extend from Byte to DWord (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PMOVxXBD_128( vbi, pfx, delta,
                                   False/*!isAvx*/, True/*xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x32:
      /* 66 0F 38 32 /r = PMOVZXBQ xmm1, xmm2/m16
         Packed Move with Zero Extend from Byte to QWord (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PMOVZXBQ_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;

   case 0x33:
      /* 66 0F 38 33 /r = PMOVZXWD xmm1, xmm2/m64
         Packed Move with Zero Extend from Word to DWord (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PMOVxXWD_128( vbi, pfx, delta,
                                   False/*!isAvx*/, True/*xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x34:
      /* 66 0F 38 34 /r = PMOVZXWQ xmm1, xmm2/m32
         Packed Move with Zero Extend from Word to QWord (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PMOVZXWQ_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;

   case 0x35:
      /* 66 0F 38 35 /r = PMOVZXDQ xmm1, xmm2/m64
         Packed Move with Zero Extend from DWord to QWord (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
                                   False/*!isAvx*/, True/*xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x37:
      /* 66 0F 38 37 = PCMPGTQ
         64x2 comparison (signed, presumably; the Intel docs don't say :-)
      */
      if (have66noF2noF3(pfx) && sz == 2) {
         /* FIXME: this needs an alignment check */
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpgtq", Iop_CmpGT64Sx2, False );
         goto decode_success;
      }
      break;

   case 0x38:
   case 0x3C:
      /* 66 0F 38 38 /r = PMINSB xmm1, xmm2/m128    8Sx16 (signed) min
         66 0F 38 3C /r = PMAXSB xmm1, xmm2/m128    8Sx16 (signed) max
      */
      if (have66noF2noF3(pfx) && sz == 2) {
         /* FIXME: this needs an alignment check */
         Bool isMAX = opc == 0x3C;
         delta = dis_SSEint_E_to_G(
                    vbi, pfx, delta,
                    isMAX ? "pmaxsb" : "pminsb",
                    isMAX ? Iop_Max8Sx16 : Iop_Min8Sx16,
                    False
                 );
         goto decode_success;
      }
      break;

   case 0x39:
   case 0x3D:
      /* 66 0F 38 39 /r = PMINSD xmm1, xmm2/m128
         Minimum of Packed Signed Double Word Integers (XMM)
         66 0F 38 3D /r = PMAXSD xmm1, xmm2/m128
         Maximum of Packed Signed Double Word Integers (XMM)
      */
      if (have66noF2noF3(pfx) && sz == 2) {
         /* FIXME: this needs an alignment check */
         Bool isMAX = opc == 0x3D;
         delta = dis_SSEint_E_to_G(
                    vbi, pfx, delta,
                    isMAX ? "pmaxsd" : "pminsd",
                    isMAX ? Iop_Max32Sx4 : Iop_Min32Sx4,
                    False
                 );
         goto decode_success;
      }
      break;

   case 0x3A:
   case 0x3E:
      /* 66 0F 38 3A /r = PMINUW xmm1, xmm2/m128
         Minimum of Packed Unsigned Word Integers (XMM)
         66 0F 38 3E /r = PMAXUW xmm1, xmm2/m128
         Maximum of Packed Unsigned Word Integers (XMM)
      */
      if (have66noF2noF3(pfx) && sz == 2) {
         /* FIXME: this needs an alignment check */
         Bool isMAX = opc == 0x3E;
         delta = dis_SSEint_E_to_G(
                    vbi, pfx, delta,
                    isMAX ? "pmaxuw" : "pminuw",
                    isMAX ? Iop_Max16Ux8 : Iop_Min16Ux8,
                    False
                 );
         goto decode_success;
      }
      break;

   case 0x3B:
   case 0x3F:
      /* 66 0F 38 3B /r = PMINUD xmm1, xmm2/m128
         Minimum of Packed Unsigned Doubleword Integers (XMM)
         66 0F 38 3F /r = PMAXUD xmm1, xmm2/m128
         Maximum of Packed Unsigned Doubleword Integers (XMM)
      */
      if (have66noF2noF3(pfx) && sz == 2) {
         /* FIXME: this needs an alignment check */
         Bool isMAX = opc == 0x3F;
         delta = dis_SSEint_E_to_G(
                    vbi, pfx, delta,
                    isMAX ? "pmaxud" : "pminud",
                    isMAX ? Iop_Max32Ux4 : Iop_Min32Ux4,
                    False
                 );
         goto decode_success;
      }
      break;

   case 0x40:
      /* 66 0F 38 40 /r = PMULLD xmm1, xmm2/m128
         32x4 integer multiply from xmm2/m128 to xmm1 */
      if (have66noF2noF3(pfx) && sz == 2) {

         modrm = getUChar(delta);

         IRTemp argL = newTemp(Ity_V128);
         IRTemp argR = newTemp(Ity_V128);

         if ( epartIsReg(modrm) ) {
            assign( argL, getXMMReg( eregOfRexRM(pfx, modrm) ) );
            delta += 1;
            DIP( "pmulld %s,%s\n",
                 nameXMMReg( eregOfRexRM(pfx, modrm) ),
                 nameXMMReg( gregOfRexRM(pfx, modrm) ) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            assign( argL, loadLE( Ity_V128, mkexpr(addr) ));
            delta += alen;
            DIP( "pmulld %s,%s\n",
                 dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
         }

         assign(argR, getXMMReg( gregOfRexRM(pfx, modrm) ));

         putXMMReg( gregOfRexRM(pfx, modrm),
                    binop( Iop_Mul32x4, mkexpr(argL), mkexpr(argR)) );

         goto decode_success;
      }
      break;

   case 0x41:
      /* 66 0F 38 41 /r = PHMINPOSUW xmm1, xmm2/m128
         Packed Horizontal Word Minimum from xmm2/m128 to xmm1 */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PHMINPOSUW_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;

   case 0xDC:
   case 0xDD:
   case 0xDE:
   case 0xDF:
   case 0xDB:
      /* 66 0F 38 DC /r = AESENC xmm1, xmm2/m128
                  DD /r = AESENCLAST xmm1, xmm2/m128
                  DE /r = AESDEC xmm1, xmm2/m128
                  DF /r = AESDECLAST xmm1, xmm2/m128

                  DB /r = AESIMC xmm1, xmm2/m128 */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_AESx( vbi, pfx, delta, False/*!isAvx*/, opc );
         goto decode_success;
      }
      break;

   case 0xF0:
   case 0xF1:
      /* F2 0F 38 F0 /r = CRC32 r/m8, r32 (REX.W ok, 66 not ok)
         F2 0F 38 F1 /r = CRC32 r/m{16,32,64}, r32
         The decoding on this is a bit unusual.
      */
      if (haveF2noF3(pfx)
          && (opc == 0xF1 || (opc == 0xF0 && !have66(pfx)))) {
         modrm = getUChar(delta);

         /* The byte form (F0) ignores the operand-size prefix entirely. */
         if (opc == 0xF0)
            sz = 1;
         else
            vassert(sz == 2 || sz == 4 || sz == 8);

         IRType tyE = szToITy(sz);
         IRTemp valE = newTemp(tyE);

         /* NOTE(review): the printed mnemonic is always "crc32b", even
            for the 16/32/64-bit operand forms -- confirm whether this
            is intentional in the disassembly trace. */
         if (epartIsReg(modrm)) {
            assign(valE, getIRegE(sz, pfx, modrm));
            delta += 1;
            DIP("crc32b %s,%s\n", nameIRegE(sz, pfx, modrm),
                nameIRegG(1==getRexW(pfx) ? 8 : 4, pfx, modrm));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign(valE, loadLE(tyE, mkexpr(addr)));
            delta += alen;
            DIP("crc32b %s,%s\n", dis_buf,
                nameIRegG(1==getRexW(pfx) ? 8 : 4, pfx, modrm));
         }

         /* Somewhat funny getting/putting of the crc32 value, in order
            to ensure that it turns into 64-bit gets and puts.  However,
            mask off the upper 32 bits so as to not get memcheck false
            +ves around the helper call. */
         IRTemp valG0 = newTemp(Ity_I64);
         assign(valG0, binop(Iop_And64, getIRegG(8, pfx, modrm),
                             mkU64(0xFFFFFFFF)));

         const HChar* nm = NULL;
         void*  fn = NULL;
         switch (sz) {
            case 1: nm = "amd64g_calc_crc32b";
                    fn = &amd64g_calc_crc32b; break;
            case 2: nm = "amd64g_calc_crc32w";
                    fn = &amd64g_calc_crc32w; break;
            case 4: nm = "amd64g_calc_crc32l";
                    fn = &amd64g_calc_crc32l; break;
            case 8: nm = "amd64g_calc_crc32q";
                    fn = &amd64g_calc_crc32q; break;
         }
         vassert(nm && fn);
         IRTemp valG1 = newTemp(Ity_I64);
         assign(valG1,
                mkIRExprCCall(Ity_I64, 0/*regparm*/, nm, fn,
                              mkIRExprVec_2(mkexpr(valG0),
                                            widenUto64(mkexpr(valE)))));

         putIRegG(4, pfx, modrm, unop(Iop_64to32, mkexpr(valG1)));
         goto decode_success;
      }
      break;

   default:
      break;

   }

  //decode_failure:
   *decode_OK = False;
   return deltaIN;

  decode_success:
   *decode_OK = True;
   return delta;
}
   17848 
   17849 
   17850 /*------------------------------------------------------------*/
   17851 /*---                                                      ---*/
   17852 /*--- Top-level SSE4: dis_ESC_0F3A__SSE4                   ---*/
   17853 /*---                                                      ---*/
   17854 /*------------------------------------------------------------*/
   17855 
   17856 static Long dis_PEXTRW ( VexAbiInfo* vbi, Prefix pfx,
   17857                          Long delta, Bool isAvx )
   17858 {
   17859    IRTemp addr  = IRTemp_INVALID;
   17860    IRTemp t0    = IRTemp_INVALID;
   17861    IRTemp t1    = IRTemp_INVALID;
   17862    IRTemp t2    = IRTemp_INVALID;
   17863    IRTemp t3    = IRTemp_INVALID;
   17864    UChar  modrm = getUChar(delta);
   17865    Int    alen  = 0;
   17866    HChar  dis_buf[50];
   17867    UInt   rG    = gregOfRexRM(pfx,modrm);
   17868    Int    imm8_20;
   17869    IRTemp xmm_vec = newTemp(Ity_V128);
   17870    IRTemp d16   = newTemp(Ity_I16);
   17871    const HChar* mbV = isAvx ? "v" : "";
   17872 
   17873    vassert(0==getRexW(pfx)); /* ensured by caller */
   17874    assign( xmm_vec, getXMMReg(rG) );
   17875    breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
   17876 
   17877    if ( epartIsReg( modrm ) ) {
   17878       imm8_20 = (Int)(getUChar(delta+1) & 7);
   17879    } else {
   17880       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   17881       imm8_20 = (Int)(getUChar(delta+alen) & 7);
   17882    }
   17883 
   17884    switch (imm8_20) {
   17885       case 0:  assign(d16, unop(Iop_32to16,   mkexpr(t0))); break;
   17886       case 1:  assign(d16, unop(Iop_32HIto16, mkexpr(t0))); break;
   17887       case 2:  assign(d16, unop(Iop_32to16,   mkexpr(t1))); break;
   17888       case 3:  assign(d16, unop(Iop_32HIto16, mkexpr(t1))); break;
   17889       case 4:  assign(d16, unop(Iop_32to16,   mkexpr(t2))); break;
   17890       case 5:  assign(d16, unop(Iop_32HIto16, mkexpr(t2))); break;
   17891       case 6:  assign(d16, unop(Iop_32to16,   mkexpr(t3))); break;
   17892       case 7:  assign(d16, unop(Iop_32HIto16, mkexpr(t3))); break;
   17893       default: vassert(0);
   17894    }
   17895 
   17896    if ( epartIsReg( modrm ) ) {
   17897       UInt rE = eregOfRexRM(pfx,modrm);
   17898       putIReg32( rE, unop(Iop_16Uto32, mkexpr(d16)) );
   17899       delta += 1+1;
   17900       DIP( "%spextrw $%d, %s,%s\n", mbV, imm8_20,
   17901            nameXMMReg( rG ), nameIReg32( rE ) );
   17902    } else {
   17903       storeLE( mkexpr(addr), mkexpr(d16) );
   17904       delta += alen+1;
   17905       DIP( "%spextrw $%d, %s,%s\n", mbV, imm8_20, nameXMMReg( rG ), dis_buf );
   17906    }
   17907    return delta;
   17908 }
   17909 
   17910 
   17911 static Long dis_PEXTRD ( VexAbiInfo* vbi, Prefix pfx,
   17912                          Long delta, Bool isAvx )
   17913 {
   17914    IRTemp addr  = IRTemp_INVALID;
   17915    IRTemp t0    = IRTemp_INVALID;
   17916    IRTemp t1    = IRTemp_INVALID;
   17917    IRTemp t2    = IRTemp_INVALID;
   17918    IRTemp t3    = IRTemp_INVALID;
   17919    UChar  modrm = 0;
   17920    Int    alen  = 0;
   17921    HChar  dis_buf[50];
   17922 
   17923    Int    imm8_10;
   17924    IRTemp xmm_vec   = newTemp(Ity_V128);
   17925    IRTemp src_dword = newTemp(Ity_I32);
   17926    const HChar* mbV = isAvx ? "v" : "";
   17927 
   17928    vassert(0==getRexW(pfx)); /* ensured by caller */
   17929    modrm = getUChar(delta);
   17930    assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
   17931    breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
   17932 
   17933    if ( epartIsReg( modrm ) ) {
   17934       imm8_10 = (Int)(getUChar(delta+1) & 3);
   17935    } else {
   17936       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   17937       imm8_10 = (Int)(getUChar(delta+alen) & 3);
   17938    }
   17939 
   17940    switch ( imm8_10 ) {
   17941       case 0:  assign( src_dword, mkexpr(t0) ); break;
   17942       case 1:  assign( src_dword, mkexpr(t1) ); break;
   17943       case 2:  assign( src_dword, mkexpr(t2) ); break;
   17944       case 3:  assign( src_dword, mkexpr(t3) ); break;
   17945       default: vassert(0);
   17946    }
   17947 
   17948    if ( epartIsReg( modrm ) ) {
   17949       putIReg32( eregOfRexRM(pfx,modrm), mkexpr(src_dword) );
   17950       delta += 1+1;
   17951       DIP( "%spextrd $%d, %s,%s\n", mbV, imm8_10,
   17952            nameXMMReg( gregOfRexRM(pfx, modrm) ),
   17953            nameIReg32( eregOfRexRM(pfx, modrm) ) );
   17954    } else {
   17955       storeLE( mkexpr(addr), mkexpr(src_dword) );
   17956       delta += alen+1;
   17957       DIP( "%spextrd $%d, %s,%s\n", mbV,
   17958            imm8_10, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
   17959    }
   17960    return delta;
   17961 }
   17962 
   17963 
   17964 static Long dis_PEXTRQ ( VexAbiInfo* vbi, Prefix pfx,
   17965                          Long delta, Bool isAvx )
   17966 {
   17967    IRTemp addr  = IRTemp_INVALID;
   17968    UChar  modrm = 0;
   17969    Int    alen  = 0;
   17970    HChar  dis_buf[50];
   17971 
   17972    Int imm8_0;
   17973    IRTemp xmm_vec   = newTemp(Ity_V128);
   17974    IRTemp src_qword = newTemp(Ity_I64);
   17975    const HChar* mbV = isAvx ? "v" : "";
   17976 
   17977    vassert(1==getRexW(pfx)); /* ensured by caller */
   17978    modrm = getUChar(delta);
   17979    assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
   17980 
   17981    if ( epartIsReg( modrm ) ) {
   17982       imm8_0 = (Int)(getUChar(delta+1) & 1);
   17983    } else {
   17984       addr   = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   17985       imm8_0 = (Int)(getUChar(delta+alen) & 1);
   17986    }
   17987 
   17988    switch ( imm8_0 ) {
   17989       case 0:  assign( src_qword, unop(Iop_V128to64,   mkexpr(xmm_vec)) );
   17990                break;
   17991       case 1:  assign( src_qword, unop(Iop_V128HIto64, mkexpr(xmm_vec)) );
   17992                break;
   17993       default: vassert(0);
   17994    }
   17995 
   17996    if ( epartIsReg( modrm ) ) {
   17997       putIReg64( eregOfRexRM(pfx,modrm), mkexpr(src_qword) );
   17998       delta += 1+1;
   17999       DIP( "%spextrq $%d, %s,%s\n", mbV, imm8_0,
   18000            nameXMMReg( gregOfRexRM(pfx, modrm) ),
   18001            nameIReg64( eregOfRexRM(pfx, modrm) ) );
   18002    } else {
   18003       storeLE( mkexpr(addr), mkexpr(src_qword) );
   18004       delta += alen+1;
   18005       DIP( "%spextrq $%d, %s,%s\n", mbV,
   18006            imm8_0, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
   18007    }
   18008    return delta;
   18009 }
   18010 
   18011 static IRExpr* math_CTZ32(IRExpr *exp)
   18012 {
   18013    /* Iop_Ctz32 isn't implemented by the amd64 back end, so use Iop_Ctz64. */
   18014    return unop(Iop_64to32, unop(Iop_Ctz64, unop(Iop_32Uto64, exp)));
   18015 }
   18016 
/* Pure-IR implementation of PCMPISTRI with an immediate byte of 0x38
   or 0x3A: implicit-length (NUL-terminated) unsigned-byte equal-each
   comparison, writing the resulting index to ECX and setting the
   OSZACP flags.  Using inline IR rather than the dirty-helper scheme
   of dis_PCMPxSTRx lets Memcheck track definedness precisely.
   |modrm| and |dis_buf| are accepted but unused here; the caller has
   already printed the instruction.  Returns |delta| unchanged. */
static Long dis_PCMPISTRI_3A ( UChar modrm, UInt regNoL, UInt regNoR,
                               Long delta, UChar opc, UChar imm,
                               HChar dis_buf[])
{
   /* We only handle PCMPISTRI for now */
   vassert((opc & 0x03) == 0x03);
   /* And only an immediate byte of 0x38 or 0x3A */
   vassert((imm & ~0x02) == 0x38);

   /* FIXME: Is this correct when RegNoL == 16 ? */
   IRTemp argL = newTemp(Ity_V128);
   assign(argL, getXMMReg(regNoL));
   IRTemp argR = newTemp(Ity_V128);
   assign(argR, getXMMReg(regNoR));

   /* zmaskL/zmaskR: 16-bit masks with bit i set iff byte i of the
      corresponding operand is zero (the string terminator). */
   IRTemp zmaskL = newTemp(Ity_I32);
   assign(zmaskL, unop(Iop_16Uto32,
                       unop(Iop_GetMSBs8x16,
                            binop(Iop_CmpEQ8x16, mkexpr(argL), mkV128(0)))));
   IRTemp zmaskR = newTemp(Ity_I32);
   assign(zmaskR, unop(Iop_16Uto32,
                       unop(Iop_GetMSBs8x16,
                            binop(Iop_CmpEQ8x16, mkexpr(argR), mkV128(0)))));

   /* We want validL = ~(zmaskL | -zmaskL)

      But this formulation kills memcheck's validity tracking when any
      bits above the first "1" are invalid.  So reformulate as:

      validL = (zmaskL ? (1 << ctz(zmaskL)) : 0) - 1
   */

   IRExpr *ctzL = unop(Iop_32to8, math_CTZ32(mkexpr(zmaskL)));

   /* Generate a bool expression which is zero iff the original is
      zero.  Do this carefully so memcheck can propagate validity bits
      correctly.
    */
   IRTemp zmaskL_zero = newTemp(Ity_I1);
   assign(zmaskL_zero, binop(Iop_ExpCmpNE32, mkexpr(zmaskL), mkU32(0)));

   /* validL: bit i set iff byte i of argL lies strictly before the
      first zero byte, i.e. is part of the string proper. */
   IRTemp validL = newTemp(Ity_I32);
   assign(validL, binop(Iop_Sub32,
                        IRExpr_ITE(mkexpr(zmaskL_zero),
                                   binop(Iop_Shl32, mkU32(1), ctzL),
                                   mkU32(0)),
                        mkU32(1)));

   /* And similarly for validR. */
   IRExpr *ctzR = unop(Iop_32to8, math_CTZ32(mkexpr(zmaskR)));
   IRTemp zmaskR_zero = newTemp(Ity_I1);
   assign(zmaskR_zero, binop(Iop_ExpCmpNE32, mkexpr(zmaskR), mkU32(0)));
   IRTemp validR = newTemp(Ity_I32);
   assign(validR, binop(Iop_Sub32,
                        IRExpr_ITE(mkexpr(zmaskR_zero),
                                   binop(Iop_Shl32, mkU32(1), ctzR),
                                   mkU32(0)),
                        mkU32(1)));

   /* Do the actual comparison: bit i set iff bytes i of argL and
      argR are equal. */
   IRExpr *boolResII = unop(Iop_16Uto32,
                            unop(Iop_GetMSBs8x16,
                                 binop(Iop_CmpEQ8x16, mkexpr(argL),
                                                      mkexpr(argR))));

   /* Compute boolResII & validL & validR (i.e., if both valid, use
      comparison result) */
   IRExpr *intRes1_a = binop(Iop_And32, boolResII,
                             binop(Iop_And32,
                                   mkexpr(validL), mkexpr(validR)));

   /* Compute ~(validL | validR); i.e., if both invalid, force 1. */
   IRExpr *intRes1_b = unop(Iop_Not32, binop(Iop_Or32,
                                             mkexpr(validL), mkexpr(validR)));
   /* Otherwise, zero. */
   IRExpr *intRes1 = binop(Iop_And32, mkU32(0xFFFF),
                           binop(Iop_Or32, intRes1_a, intRes1_b));

   /* The "0x30" in imm=0x3A means "polarity=3" means XOR validL with
      result. */
   IRTemp intRes2 = newTemp(Ity_I32);
   assign(intRes2, binop(Iop_And32, mkU32(0xFFFF),
                         binop(Iop_Xor32, intRes1, mkexpr(validL))));

   /* If the 0x40 bit were set in imm=0x3A, we would return the index
      of the msb.  Since it is clear, we return the index of the
      lsb.  ORing in 0x10000 yields index 16 when intRes2 is zero
      (no hit). */
   IRExpr *newECX = math_CTZ32(binop(Iop_Or32,
                                     mkexpr(intRes2), mkU32(0x10000)));

   /* And that's our RCX. */
   putIReg32(R_RCX, newECX);

   /* Now for the condition codes... */

   /* C == 0 iff intRes2 == 0 */
   IRExpr *c_bit = IRExpr_ITE( binop(Iop_ExpCmpNE32, mkexpr(intRes2),
                                     mkU32(0)),
                               mkU32(1 << AMD64G_CC_SHIFT_C),
                               mkU32(0));
   /* Z == 1 iff any in argL is 0 */
   IRExpr *z_bit = IRExpr_ITE( mkexpr(zmaskL_zero),
                               mkU32(1 << AMD64G_CC_SHIFT_Z),
                               mkU32(0));
   /* S == 1 iff any in argR is 0 */
   IRExpr *s_bit = IRExpr_ITE( mkexpr(zmaskR_zero),
                               mkU32(1 << AMD64G_CC_SHIFT_S),
                               mkU32(0));
   /* O == IntRes2[0] */
   IRExpr *o_bit = binop(Iop_Shl32, binop(Iop_And32, mkexpr(intRes2),
                                          mkU32(0x01)),
                         mkU8(AMD64G_CC_SHIFT_O));

   /* Put them all together, and set the flags via the COPY thunk
      convention: CC_OP_COPY with the flag bits in CC_DEP1. */
   IRTemp cc = newTemp(Ity_I64);
   assign(cc, widenUto64(binop(Iop_Or32,
                               binop(Iop_Or32, c_bit, z_bit),
                               binop(Iop_Or32, s_bit, o_bit))));
   stmt(IRStmt_Put(OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY)));
   stmt(IRStmt_Put(OFFB_CC_DEP1, mkexpr(cc)));
   stmt(IRStmt_Put(OFFB_CC_DEP2, mkU64(0)));
   stmt(IRStmt_Put(OFFB_CC_NDEP, mkU64(0)));

   return delta;
}
   18142 
   18143 /* This can fail, in which case it returns the original (unchanged)
   18144    delta. */
static Long dis_PCMPxSTRx ( VexAbiInfo* vbi, Prefix pfx,
                            Long delta, Bool isAvx, UChar opc )
{
   /* Decode the variant from the low bits of the opcode (matching
      the DIP calls below):
        opc & 2        -> 'i' (implicit length) vs 'e' (explicit length);
        (opc & 1) ^ 1  -> 'm' (mask result in XMM0) vs 'i' (index in RCX). */
   Long   delta0  = delta;
   UInt   isISTRx = opc & 2;
   UInt   isxSTRM = (opc & 1) ^ 1;
   UInt   regNoL  = 0;
   UInt   regNoR  = 0;
   UChar  imm     = 0;
   IRTemp addr    = IRTemp_INVALID;
   Int    alen    = 0;
   HChar  dis_buf[50];

   /* This is a nasty kludge.  We need to pass 2 x V128 to the helper
      (which is clean).  Since we can't do that, use a dirty helper to
      compute the results directly from the XMM regs in the guest
      state.  That means for the memory case, we need to move the left
      operand into a pseudo-register (XMM16, let's call it). */
   UChar modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      regNoL = eregOfRexRM(pfx, modrm);
      regNoR = gregOfRexRM(pfx, modrm);
      imm = getUChar(delta+1);
      delta += 1+1;
   } else {
      regNoL = 16; /* use XMM16 as an intermediary */
      regNoR = gregOfRexRM(pfx, modrm);
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
      /* No alignment check; I guess that makes sense, given that
         these insns are for dealing with C style strings. */
      stmt( IRStmt_Put( OFFB_YMM16, loadLE(Ity_V128, mkexpr(addr)) ));
      imm = getUChar(delta+alen);
      delta += alen+1;
   }

   /* Print the insn here, since dis_PCMPISTRI_3A doesn't do so
      itself. */
   if (regNoL == 16) {
      DIP("%spcmp%cstr%c $%x,%s,%s\n",
          isAvx ? "v" : "", isISTRx ? 'i' : 'e', isxSTRM ? 'm' : 'i',
          (UInt)imm, dis_buf, nameXMMReg(regNoR));
   } else {
      DIP("%spcmp%cstr%c $%x,%s,%s\n",
          isAvx ? "v" : "", isISTRx ? 'i' : 'e', isxSTRM ? 'm' : 'i',
          (UInt)imm, nameXMMReg(regNoL), nameXMMReg(regNoR));
   }

   /* Handle special case(s): PCMPISTRI $0x3A has a pure-IR
      implementation which is kinder to Memcheck. */
   if (imm == 0x3A && isISTRx && !isxSTRM) {
      return dis_PCMPISTRI_3A ( modrm, regNoL, regNoR, delta,
                                opc, imm, dis_buf);
   }

   /* Now we know the XMM reg numbers for the operands, and the
      immediate byte.  Is it one we can actually handle? Throw out any
      cases for which the helper function has not been verified. */
   switch (imm) {
      case 0x00: case 0x02: case 0x08: case 0x0A: case 0x0C: case 0x0E:
      case 0x12: case 0x14: case 0x1A:
      case 0x30: case 0x34: case 0x38: case 0x3A:
      case 0x40: case 0x44: case 0x46: case 0x4A:
         break;
      // the 16-bit character versions of the above
      case 0x01: case 0x03: case 0x09: case 0x0B: case 0x0D:
      case 0x13:            case 0x1B:
                            case 0x39: case 0x3B:
                 case 0x45:            case 0x4B:
         break;
      default:
         return delta0; /*FAIL*/
   }

   /* Who ya gonna call?  Presumably not Ghostbusters. */
   void*  fn = &amd64g_dirtyhelper_PCMPxSTRx;
   const HChar* nm = "amd64g_dirtyhelper_PCMPxSTRx";

   /* Round up the arguments.  Note that this is a kludge -- the use
      of mkU64 rather than mkIRExpr_HWord implies the assumption that
      the host's word size is 64-bit. */
   UInt gstOffL = regNoL == 16 ? OFFB_YMM16 : ymmGuestRegOffset(regNoL);
   UInt gstOffR = ymmGuestRegOffset(regNoR);

   IRExpr*  opc4_and_imm = mkU64((opc << 8) | (imm & 0xFF));
   IRExpr*  gstOffLe     = mkU64(gstOffL);
   IRExpr*  gstOffRe     = mkU64(gstOffR);
   /* Explicit-length variants read the string lengths from RDX/RAX;
      implicit-length variants pass zeroes instead. */
   IRExpr*  edxIN        = isISTRx ? mkU64(0) : getIRegRDX(8);
   IRExpr*  eaxIN        = isISTRx ? mkU64(0) : getIRegRAX(8);
   IRExpr** args
      = mkIRExprVec_6( IRExpr_BBPTR(),
                       opc4_and_imm, gstOffLe, gstOffRe, edxIN, eaxIN );

   IRTemp   resT = newTemp(Ity_I64);
   IRDirty* d    = unsafeIRDirty_1_N( resT, 0/*regparms*/, nm, fn, args );
   /* It's not really a dirty call, but we can't use the clean helper
      mechanism here for the very lame reason that we can't pass 2 x
      V128s by value to a helper.  Hence this roundabout scheme. */
   d->nFxState = 2;
   vex_bzero(&d->fxState, sizeof(d->fxState));
   d->fxState[0].fx     = Ifx_Read;
   d->fxState[0].offset = gstOffL;
   d->fxState[0].size   = sizeof(U128);
   d->fxState[1].fx     = Ifx_Read;
   d->fxState[1].offset = gstOffR;
   d->fxState[1].size   = sizeof(U128);
   if (isxSTRM) {
      /* Declare that the helper writes XMM0. */
      d->nFxState = 3;
      d->fxState[2].fx     = Ifx_Write;
      d->fxState[2].offset = ymmGuestRegOffset(0);
      d->fxState[2].size   = sizeof(U128);
   }

   stmt( IRStmt_Dirty(d) );

   /* Now resT[15:0] holds the new OSZACP values, so the condition
      codes must be updated. And for a xSTRI case, resT[31:16] holds
      the new ECX value, so stash that too. */
   if (!isxSTRM) {
      putIReg64(R_RCX, binop(Iop_And64,
                             binop(Iop_Shr64, mkexpr(resT), mkU8(16)),
                             mkU64(0xFFFF)));
   }

   /* Zap the upper half of the dest reg as per AVX conventions. */
   if (isxSTRM && isAvx)
      putYMMRegLane128(/*YMM*/0, 1, mkV128(0));

   /* Set the flags via the COPY thunk convention: CC_OP_COPY with
      the flag bits in CC_DEP1. */
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop(Iop_And64, mkexpr(resT), mkU64(0xFFFF))
   ));
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

   return delta;
}
   18282 
   18283 
   18284 static IRTemp math_PINSRB_128 ( IRTemp v128, IRTemp u8, UInt imm8 )
   18285 {
   18286    vassert(imm8 >= 0 && imm8 <= 15);
   18287 
   18288    // Create a V128 value which has the selected byte in the
   18289    // specified lane, and zeroes everywhere else.
   18290    IRTemp tmp128    = newTemp(Ity_V128);
   18291    IRTemp halfshift = newTemp(Ity_I64);
   18292    assign(halfshift, binop(Iop_Shl64,
   18293                            unop(Iop_8Uto64, mkexpr(u8)),
   18294                            mkU8(8 * (imm8 & 7))));
   18295    if (imm8 < 8) {
   18296       assign(tmp128, binop(Iop_64HLtoV128, mkU64(0), mkexpr(halfshift)));
   18297    } else {
   18298       assign(tmp128, binop(Iop_64HLtoV128, mkexpr(halfshift), mkU64(0)));
   18299    }
   18300 
   18301    UShort mask = ~(1 << imm8);
   18302    IRTemp res  = newTemp(Ity_V128);
   18303    assign( res, binop(Iop_OrV128,
   18304                       mkexpr(tmp128),
   18305                       binop(Iop_AndV128, mkexpr(v128), mkV128(mask))) );
   18306    return res;
   18307 }
   18308 
   18309 
   18310 static IRTemp math_PINSRD_128 ( IRTemp v128, IRTemp u32, UInt imm8 )
   18311 {
   18312    IRTemp z32 = newTemp(Ity_I32);
   18313    assign(z32, mkU32(0));
   18314 
   18315    /* Surround u32 with zeroes as per imm, giving us something we can
   18316       OR into a suitably masked-out v128.*/
   18317    IRTemp withZs = newTemp(Ity_V128);
   18318    UShort mask = 0;
   18319    switch (imm8) {
   18320       case 3:  mask = 0x0FFF;
   18321                assign(withZs, mkV128from32s(u32, z32, z32, z32));
   18322                break;
   18323       case 2:  mask = 0xF0FF;
   18324                assign(withZs, mkV128from32s(z32, u32, z32, z32));
   18325                break;
   18326       case 1:  mask = 0xFF0F;
   18327                assign(withZs, mkV128from32s(z32, z32, u32, z32));
   18328                break;
   18329       case 0:  mask = 0xFFF0;
   18330                assign(withZs, mkV128from32s(z32, z32, z32, u32));
   18331                break;
   18332       default: vassert(0);
   18333    }
   18334 
   18335    IRTemp res = newTemp(Ity_V128);
   18336    assign(res, binop( Iop_OrV128,
   18337                       mkexpr(withZs),
   18338                       binop( Iop_AndV128, mkexpr(v128), mkV128(mask) ) ) );
   18339    return res;
   18340 }
   18341 
   18342 
   18343 static IRTemp math_PINSRQ_128 ( IRTemp v128, IRTemp u64, UInt imm8 )
   18344 {
   18345    /* Surround u64 with zeroes as per imm, giving us something we can
   18346       OR into a suitably masked-out v128.*/
   18347    IRTemp withZs = newTemp(Ity_V128);
   18348    UShort mask = 0;
   18349    if (imm8 == 0) {
   18350       mask = 0xFF00;
   18351       assign(withZs, binop(Iop_64HLtoV128, mkU64(0), mkexpr(u64)));
   18352    } else {
   18353       vassert(imm8 == 1);
   18354       mask = 0x00FF;
   18355       assign( withZs, binop(Iop_64HLtoV128, mkexpr(u64), mkU64(0)));
   18356    }
   18357 
   18358    IRTemp res = newTemp(Ity_V128);
   18359    assign( res, binop( Iop_OrV128,
   18360                        mkexpr(withZs),
   18361                        binop( Iop_AndV128, mkexpr(v128), mkV128(mask) ) ) );
   18362    return res;
   18363 }
   18364 
   18365 
   18366 static IRTemp math_INSERTPS ( IRTemp dstV, IRTemp toInsertD, UInt imm8 )
   18367 {
   18368    const IRTemp inval = IRTemp_INVALID;
   18369    IRTemp dstDs[4] = { inval, inval, inval, inval };
   18370    breakupV128to32s( dstV, &dstDs[3], &dstDs[2], &dstDs[1], &dstDs[0] );
   18371 
   18372    vassert(imm8 <= 255);
   18373    dstDs[(imm8 >> 4) & 3] = toInsertD; /* "imm8_count_d" */
   18374 
   18375    UInt imm8_zmask = (imm8 & 15);
   18376    IRTemp zero_32 = newTemp(Ity_I32);
   18377    assign( zero_32, mkU32(0) );
   18378    IRTemp resV = newTemp(Ity_V128);
   18379    assign( resV, mkV128from32s(
   18380                     ((imm8_zmask & 8) == 8) ? zero_32 : dstDs[3],
   18381                     ((imm8_zmask & 4) == 4) ? zero_32 : dstDs[2],
   18382                     ((imm8_zmask & 2) == 2) ? zero_32 : dstDs[1],
   18383                     ((imm8_zmask & 1) == 1) ? zero_32 : dstDs[0]) );
   18384    return resV;
   18385 }
   18386 
   18387 
   18388 static Long dis_PEXTRB_128_GtoE ( VexAbiInfo* vbi, Prefix pfx,
   18389                                   Long delta, Bool isAvx )
   18390 {
   18391    IRTemp addr     = IRTemp_INVALID;
   18392    Int    alen     = 0;
   18393    HChar  dis_buf[50];
   18394    IRTemp xmm_vec  = newTemp(Ity_V128);
   18395    IRTemp sel_lane = newTemp(Ity_I32);
   18396    IRTemp shr_lane = newTemp(Ity_I32);
   18397    const HChar* mbV = isAvx ? "v" : "";
   18398    UChar  modrm    = getUChar(delta);
   18399    IRTemp t3, t2, t1, t0;
   18400    Int    imm8;
   18401    assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
   18402    t3 = t2 = t1 = t0 = IRTemp_INVALID;
   18403    breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
   18404 
   18405    if ( epartIsReg( modrm ) ) {
   18406       imm8 = (Int)getUChar(delta+1);
   18407    } else {
   18408       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   18409       imm8 = (Int)getUChar(delta+alen);
   18410    }
   18411    switch ( (imm8 >> 2) & 3 ) {
   18412       case 0:  assign( sel_lane, mkexpr(t0) ); break;
   18413       case 1:  assign( sel_lane, mkexpr(t1) ); break;
   18414       case 2:  assign( sel_lane, mkexpr(t2) ); break;
   18415       case 3:  assign( sel_lane, mkexpr(t3) ); break;
   18416       default: vassert(0);
   18417    }
   18418    assign( shr_lane,
   18419            binop( Iop_Shr32, mkexpr(sel_lane), mkU8(((imm8 & 3)*8)) ) );
   18420 
   18421    if ( epartIsReg( modrm ) ) {
   18422       putIReg64( eregOfRexRM(pfx,modrm),
   18423                  unop( Iop_32Uto64,
   18424                        binop(Iop_And32, mkexpr(shr_lane), mkU32(255)) ) );
   18425       delta += 1+1;
   18426       DIP( "%spextrb $%d, %s,%s\n", mbV, imm8,
   18427            nameXMMReg( gregOfRexRM(pfx, modrm) ),
   18428            nameIReg64( eregOfRexRM(pfx, modrm) ) );
   18429    } else {
   18430       storeLE( mkexpr(addr), unop(Iop_32to8, mkexpr(shr_lane) ) );
   18431       delta += alen+1;
   18432       DIP( "%spextrb $%d,%s,%s\n", mbV,
   18433            imm8, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
   18434    }
   18435 
   18436    return delta;
   18437 }
   18438 
   18439 
/* Compute the result of DPPD on 128-bit operands: multiply the two
   F64 lanes of |dst_vec| and |src_vec| elementwise, keep only the
   products selected by imm8[5:4], sum them, and place the sum in the
   result lanes selected by imm8[1:0] (other lanes become zero).
   Rounding mode is faked (XXXROUNDINGFIXME). */
static IRTemp math_DPPD_128 ( IRTemp src_vec, IRTemp dst_vec, UInt imm8 )
{
   vassert(imm8 < 256);
   /* Byte-granularity AndV128 masks: each mask bit covers one byte
      of the V128, so 0x00FF keeps the low 64-bit lane and 0xFF00 the
      high one. */
   UShort imm8_perms[4] = { 0x0000, 0x00FF, 0xFF00, 0xFFFF };
   IRTemp and_vec = newTemp(Ity_V128);
   IRTemp sum_vec = newTemp(Ity_V128);
   IRTemp rm      = newTemp(Ity_I32);
   assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   /* Elementwise multiply, then mask off unselected products. */
   assign( and_vec, binop( Iop_AndV128,
                           triop( Iop_Mul64Fx2,
                                  mkexpr(rm),
                                  mkexpr(dst_vec), mkexpr(src_vec) ),
                           mkV128( imm8_perms[ ((imm8 >> 4) & 3) ] ) ) );

   /* Horizontal add: lane 0 of sum_vec gets (hi product + lo
      product); Add64F0x2 only adds the low lanes. */
   assign( sum_vec, binop( Iop_Add64F0x2,
                           binop( Iop_InterleaveHI64x2,
                                  mkexpr(and_vec), mkexpr(and_vec) ),
                           binop( Iop_InterleaveLO64x2,
                                  mkexpr(and_vec), mkexpr(and_vec) ) ) );
   /* Duplicate the sum into both lanes, then keep only the result
      lanes requested by imm8[1:0]. */
   IRTemp res = newTemp(Ity_V128);
   assign(res, binop( Iop_AndV128,
                      binop( Iop_InterleaveLO64x2,
                             mkexpr(sum_vec), mkexpr(sum_vec) ),
                      mkV128( imm8_perms[ (imm8 & 3) ] ) ) );
   return res;
}
   18466 
   18467 
/* Compute the result of DPPS on 128-bit operands: multiply the four
   F32 lanes of |dst_vec| and |src_vec| elementwise, keep only the
   products selected by imm8[7:4], sum them, and place the sum in the
   result lanes selected by imm8[3:0] (other lanes become zero).
   Rounding mode is faked (XXXROUNDINGFIXME). */
static IRTemp math_DPPS_128 ( IRTemp src_vec, IRTemp dst_vec, UInt imm8 )
{
   vassert(imm8 < 256);
   IRTemp tmp_prod_vec = newTemp(Ity_V128);
   IRTemp prod_vec     = newTemp(Ity_V128);
   IRTemp sum_vec      = newTemp(Ity_V128);
   IRTemp rm           = newTemp(Ity_I32);
   IRTemp v3, v2, v1, v0;
   v3 = v2 = v1 = v0   = IRTemp_INVALID;
   /* One byte-granularity AndV128 mask per nibble value; each nibble
      of the mask covers one 32-bit lane. */
   UShort imm8_perms[16] = { 0x0000, 0x000F, 0x00F0, 0x00FF, 0x0F00,
                             0x0F0F, 0x0FF0, 0x0FFF, 0xF000, 0xF00F,
                             0xF0F0, 0xF0FF, 0xFF00, 0xFF0F, 0xFFF0,
                             0xFFFF };

   assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   /* Elementwise multiply, keeping only the products selected by
      imm8[7:4]. */
   assign( tmp_prod_vec,
           binop( Iop_AndV128,
                  triop( Iop_Mul32Fx4,
                         mkexpr(rm), mkexpr(dst_vec), mkexpr(src_vec) ),
                  mkV128( imm8_perms[((imm8 >> 4)& 15)] ) ) );
   breakupV128to32s( tmp_prod_vec, &v3, &v2, &v1, &v0 );
   /* Note lanes 2 and 1 are swapped here; since the two
      interleave/add stages below end up summing all four lanes into
      every lane, the final result is unaffected by the swap. */
   assign( prod_vec, mkV128from32s( v3, v1, v2, v0 ) );

   /* First horizontal-add stage: each lane pair is summed. */
   assign( sum_vec, triop( Iop_Add32Fx4,
                           mkexpr(rm),
                           binop( Iop_InterleaveHI32x4,
                                  mkexpr(prod_vec), mkexpr(prod_vec) ),
                           binop( Iop_InterleaveLO32x4,
                                  mkexpr(prod_vec), mkexpr(prod_vec) ) ) );

   /* Second stage: every lane now holds the grand total; keep only
      the result lanes requested by imm8[3:0]. */
   IRTemp res = newTemp(Ity_V128);
   assign( res, binop( Iop_AndV128,
                       triop( Iop_Add32Fx4,
                              mkexpr(rm),
                              binop( Iop_InterleaveHI32x4,
                                     mkexpr(sum_vec), mkexpr(sum_vec) ),
                              binop( Iop_InterleaveLO32x4,
                                     mkexpr(sum_vec), mkexpr(sum_vec) ) ),
                       mkV128( imm8_perms[ (imm8 & 15) ] ) ) );
   return res;
}
   18509 
   18510 
/* Compute the result of MPSADBW on 128-bit operands via two calls to
   the clean helper amd64g_calc_mpsadbw, one per 64-bit half of the
   result.  imm8[1:0] select the source dword (src_mask) and imm8[2]
   the destination 11-byte window (dst_mask). */
static IRTemp math_MPSADBW_128 ( IRTemp dst_vec, IRTemp src_vec, UInt imm8 )
{
   /* Mask out bits of the operands we don't need.  This isn't
      strictly necessary, but it does ensure Memcheck doesn't
      give us any false uninitialised value errors as a
      result. */
   UShort src_mask[4] = { 0x000F, 0x00F0, 0x0F00, 0xF000 };
   UShort dst_mask[2] = { 0x07FF, 0x7FF0 };

   IRTemp src_maskV = newTemp(Ity_V128);
   IRTemp dst_maskV = newTemp(Ity_V128);
   assign(src_maskV, mkV128( src_mask[ imm8 & 3 ] ));
   assign(dst_maskV, mkV128( dst_mask[ (imm8 >> 2) & 1 ] ));

   IRTemp src_masked = newTemp(Ity_V128);
   IRTemp dst_masked = newTemp(Ity_V128);
   assign(src_masked, binop(Iop_AndV128, mkexpr(src_vec), mkexpr(src_maskV)));
   assign(dst_masked, binop(Iop_AndV128, mkexpr(dst_vec), mkexpr(dst_maskV)));

   /* Generate 4 64 bit values that we can hand to a clean helper */
   IRTemp sHi = newTemp(Ity_I64);
   IRTemp sLo = newTemp(Ity_I64);
   assign( sHi, unop(Iop_V128HIto64, mkexpr(src_masked)) );
   assign( sLo, unop(Iop_V128to64,   mkexpr(src_masked)) );

   IRTemp dHi = newTemp(Ity_I64);
   IRTemp dLo = newTemp(Ity_I64);
   assign( dHi, unop(Iop_V128HIto64, mkexpr(dst_masked)) );
   assign( dLo, unop(Iop_V128to64,   mkexpr(dst_masked)) );

   /* Compute halves of the result separately */
   IRTemp resHi = newTemp(Ity_I64);
   IRTemp resLo = newTemp(Ity_I64);

   /* The 0x80 bit in the last argument presumably tells the helper
      to compute the high half of the result -- verify against
      amd64g_calc_mpsadbw. */
   IRExpr** argsHi
      = mkIRExprVec_5( mkexpr(sHi), mkexpr(sLo), mkexpr(dHi), mkexpr(dLo),
                       mkU64( 0x80 | (imm8 & 7) ));
   IRExpr** argsLo
      = mkIRExprVec_5( mkexpr(sHi), mkexpr(sLo), mkexpr(dHi), mkexpr(dLo),
                       mkU64( 0x00 | (imm8 & 7) ));

   assign(resHi, mkIRExprCCall( Ity_I64, 0/*regparm*/,
                                "amd64g_calc_mpsadbw",
                                &amd64g_calc_mpsadbw, argsHi ));
   assign(resLo, mkIRExprCCall( Ity_I64, 0/*regparm*/,
                                "amd64g_calc_mpsadbw",
                                &amd64g_calc_mpsadbw, argsLo ));

   IRTemp res = newTemp(Ity_V128);
   assign(res, binop(Iop_64HLtoV128, mkexpr(resHi), mkexpr(resLo)));
   return res;
}
   18563 
   18564 static Long dis_EXTRACTPS ( VexAbiInfo* vbi, Prefix pfx,
   18565                             Long delta, Bool isAvx )
   18566 {
   18567    IRTemp addr       = IRTemp_INVALID;
   18568    Int    alen       = 0;
   18569    HChar  dis_buf[50];
   18570    UChar  modrm      = getUChar(delta);
   18571    Int imm8_10;
   18572    IRTemp xmm_vec    = newTemp(Ity_V128);
   18573    IRTemp src_dword  = newTemp(Ity_I32);
   18574    UInt   rG         = gregOfRexRM(pfx,modrm);
   18575    IRTemp t3, t2, t1, t0;
   18576    t3 = t2 = t1 = t0 = IRTemp_INVALID;
   18577 
   18578    assign( xmm_vec, getXMMReg( rG ) );
   18579    breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
   18580 
   18581    if ( epartIsReg( modrm ) ) {
   18582       imm8_10 = (Int)(getUChar(delta+1) & 3);
   18583    } else {
   18584       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   18585       imm8_10 = (Int)(getUChar(delta+alen) & 3);
   18586    }
   18587 
   18588    switch ( imm8_10 ) {
   18589       case 0:  assign( src_dword, mkexpr(t0) ); break;
   18590       case 1:  assign( src_dword, mkexpr(t1) ); break;
   18591       case 2:  assign( src_dword, mkexpr(t2) ); break;
   18592       case 3:  assign( src_dword, mkexpr(t3) ); break;
   18593       default: vassert(0);
   18594    }
   18595 
   18596    if ( epartIsReg( modrm ) ) {
   18597       UInt rE = eregOfRexRM(pfx,modrm);
   18598       putIReg32( rE, mkexpr(src_dword) );
   18599       delta += 1+1;
   18600       DIP( "%sextractps $%d, %s,%s\n", isAvx ? "v" : "", imm8_10,
   18601            nameXMMReg( rG ), nameIReg32( rE ) );
   18602    } else {
   18603       storeLE( mkexpr(addr), mkexpr(src_dword) );
   18604       delta += alen+1;
   18605       DIP( "%sextractps $%d, %s,%s\n", isAvx ? "v" : "", imm8_10,
   18606            nameXMMReg( rG ), dis_buf );
   18607    }
   18608 
   18609    return delta;
   18610 }
   18611 
   18612 
   18613 static IRTemp math_PCLMULQDQ( IRTemp dV, IRTemp sV, UInt imm8 )
   18614 {
   18615    IRTemp t0 = newTemp(Ity_I64);
   18616    IRTemp t1 = newTemp(Ity_I64);
   18617    assign(t0, unop((imm8&1)? Iop_V128HIto64 : Iop_V128to64,
   18618               mkexpr(dV)));
   18619    assign(t1, unop((imm8&16) ? Iop_V128HIto64 : Iop_V128to64,
   18620               mkexpr(sV)));
   18621 
   18622    IRTemp t2 = newTemp(Ity_I64);
   18623    IRTemp t3 = newTemp(Ity_I64);
   18624 
   18625    IRExpr** args;
   18626 
   18627    args = mkIRExprVec_3(mkexpr(t0), mkexpr(t1), mkU64(0));
   18628    assign(t2, mkIRExprCCall(Ity_I64,0, "amd64g_calculate_pclmul",
   18629                             &amd64g_calculate_pclmul, args));
   18630    args = mkIRExprVec_3(mkexpr(t0), mkexpr(t1), mkU64(1));
   18631    assign(t3, mkIRExprCCall(Ity_I64,0, "amd64g_calculate_pclmul",
   18632                             &amd64g_calculate_pclmul, args));
   18633 
   18634    IRTemp res     = newTemp(Ity_V128);
   18635    assign(res, binop(Iop_64HLtoV128, mkexpr(t3), mkexpr(t2)));
   18636    return res;
   18637 }
   18638 
   18639 
   18640 __attribute__((noinline))
   18641 static
   18642 Long dis_ESC_0F3A__SSE4 ( Bool* decode_OK,
   18643                           VexAbiInfo* vbi,
   18644                           Prefix pfx, Int sz, Long deltaIN )
   18645 {
   18646    IRTemp addr  = IRTemp_INVALID;
   18647    UChar  modrm = 0;
   18648    Int    alen  = 0;
   18649    HChar  dis_buf[50];
   18650 
   18651    *decode_OK = False;
   18652 
   18653    Long   delta = deltaIN;
   18654    UChar  opc   = getUChar(delta);
   18655    delta++;
   18656    switch (opc) {
   18657 
   18658    case 0x08:
   18659       /* 66 0F 3A 08 /r ib = ROUNDPS imm8, xmm2/m128, xmm1 */
   18660       if (have66noF2noF3(pfx) && sz == 2) {
   18661 
   18662          IRTemp src0 = newTemp(Ity_F32);
   18663          IRTemp src1 = newTemp(Ity_F32);
   18664          IRTemp src2 = newTemp(Ity_F32);
   18665          IRTemp src3 = newTemp(Ity_F32);
   18666          IRTemp res0 = newTemp(Ity_F32);
   18667          IRTemp res1 = newTemp(Ity_F32);
   18668          IRTemp res2 = newTemp(Ity_F32);
   18669          IRTemp res3 = newTemp(Ity_F32);
   18670          IRTemp rm   = newTemp(Ity_I32);
   18671          Int    imm  = 0;
   18672 
   18673          modrm = getUChar(delta);
   18674 
   18675          if (epartIsReg(modrm)) {
   18676             assign( src0,
   18677                     getXMMRegLane32F( eregOfRexRM(pfx, modrm), 0 ) );
   18678             assign( src1,
   18679                     getXMMRegLane32F( eregOfRexRM(pfx, modrm), 1 ) );
   18680             assign( src2,
   18681                     getXMMRegLane32F( eregOfRexRM(pfx, modrm), 2 ) );
   18682             assign( src3,
   18683                     getXMMRegLane32F( eregOfRexRM(pfx, modrm), 3 ) );
   18684             imm = getUChar(delta+1);
   18685             if (imm & ~15) goto decode_failure;
   18686             delta += 1+1;
   18687             DIP( "roundps $%d,%s,%s\n",
   18688                  imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
   18689                       nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   18690          } else {
   18691             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   18692             gen_SEGV_if_not_16_aligned(addr);
   18693             assign( src0, loadLE(Ity_F32,
   18694                                  binop(Iop_Add64, mkexpr(addr), mkU64(0) )));
   18695             assign( src1, loadLE(Ity_F32,
   18696                                  binop(Iop_Add64, mkexpr(addr), mkU64(4) )));
   18697             assign( src2, loadLE(Ity_F32,
   18698                                  binop(Iop_Add64, mkexpr(addr), mkU64(8) )));
   18699             assign( src3, loadLE(Ity_F32,
   18700                                  binop(Iop_Add64, mkexpr(addr), mkU64(12) )));
   18701             imm = getUChar(delta+alen);
   18702             if (imm & ~15) goto decode_failure;
   18703             delta += alen+1;
   18704             DIP( "roundps $%d,%s,%s\n",
   18705                  imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   18706          }
   18707 
   18708          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   18709             that encoding is the same as the encoding for IRRoundingMode,
   18710             we can use that value directly in the IR as a rounding
   18711             mode. */
   18712          assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
   18713 
   18714          assign(res0, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src0)) );
   18715          assign(res1, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src1)) );
   18716          assign(res2, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src2)) );
   18717          assign(res3, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src3)) );
   18718 
   18719          putXMMRegLane32F( gregOfRexRM(pfx, modrm), 0, mkexpr(res0) );
   18720          putXMMRegLane32F( gregOfRexRM(pfx, modrm), 1, mkexpr(res1) );
   18721          putXMMRegLane32F( gregOfRexRM(pfx, modrm), 2, mkexpr(res2) );
   18722          putXMMRegLane32F( gregOfRexRM(pfx, modrm), 3, mkexpr(res3) );
   18723 
   18724          goto decode_success;
   18725       }
   18726       break;
   18727 
   18728    case 0x09:
   18729       /* 66 0F 3A 09 /r ib = ROUNDPD imm8, xmm2/m128, xmm1 */
   18730       if (have66noF2noF3(pfx) && sz == 2) {
   18731 
   18732          IRTemp src0 = newTemp(Ity_F64);
   18733          IRTemp src1 = newTemp(Ity_F64);
   18734          IRTemp res0 = newTemp(Ity_F64);
   18735          IRTemp res1 = newTemp(Ity_F64);
   18736          IRTemp rm   = newTemp(Ity_I32);
   18737          Int    imm  = 0;
   18738 
   18739          modrm = getUChar(delta);
   18740 
   18741          if (epartIsReg(modrm)) {
   18742             assign( src0,
   18743                     getXMMRegLane64F( eregOfRexRM(pfx, modrm), 0 ) );
   18744             assign( src1,
   18745                     getXMMRegLane64F( eregOfRexRM(pfx, modrm), 1 ) );
   18746             imm = getUChar(delta+1);
   18747             if (imm & ~15) goto decode_failure;
   18748             delta += 1+1;
   18749             DIP( "roundpd $%d,%s,%s\n",
   18750                  imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
   18751                       nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   18752          } else {
   18753             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   18754             gen_SEGV_if_not_16_aligned(addr);
   18755             assign( src0, loadLE(Ity_F64,
   18756                                  binop(Iop_Add64, mkexpr(addr), mkU64(0) )));
   18757             assign( src1, loadLE(Ity_F64,
   18758                                  binop(Iop_Add64, mkexpr(addr), mkU64(8) )));
   18759             imm = getUChar(delta+alen);
   18760             if (imm & ~15) goto decode_failure;
   18761             delta += alen+1;
   18762             DIP( "roundpd $%d,%s,%s\n",
   18763                  imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   18764          }
   18765 
   18766          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   18767             that encoding is the same as the encoding for IRRoundingMode,
   18768             we can use that value directly in the IR as a rounding
   18769             mode. */
   18770          assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
   18771 
   18772          assign(res0, binop(Iop_RoundF64toInt, mkexpr(rm), mkexpr(src0)) );
   18773          assign(res1, binop(Iop_RoundF64toInt, mkexpr(rm), mkexpr(src1)) );
   18774 
   18775          putXMMRegLane64F( gregOfRexRM(pfx, modrm), 0, mkexpr(res0) );
   18776          putXMMRegLane64F( gregOfRexRM(pfx, modrm), 1, mkexpr(res1) );
   18777 
   18778          goto decode_success;
   18779       }
   18780       break;
   18781 
   18782    case 0x0A:
   18783    case 0x0B:
   18784       /* 66 0F 3A 0A /r ib = ROUNDSS imm8, xmm2/m32, xmm1
   18785          66 0F 3A 0B /r ib = ROUNDSD imm8, xmm2/m64, xmm1
   18786       */
   18787       if (have66noF2noF3(pfx) && sz == 2) {
   18788 
   18789          Bool   isD = opc == 0x0B;
   18790          IRTemp src = newTemp(isD ? Ity_F64 : Ity_F32);
   18791          IRTemp res = newTemp(isD ? Ity_F64 : Ity_F32);
   18792          Int    imm = 0;
   18793 
   18794          modrm = getUChar(delta);
   18795 
   18796          if (epartIsReg(modrm)) {
   18797             assign( src,
   18798                     isD ? getXMMRegLane64F( eregOfRexRM(pfx, modrm), 0 )
   18799                         : getXMMRegLane32F( eregOfRexRM(pfx, modrm), 0 ) );
   18800             imm = getUChar(delta+1);
   18801             if (imm & ~15) goto decode_failure;
   18802             delta += 1+1;
   18803             DIP( "rounds%c $%d,%s,%s\n",
   18804                  isD ? 'd' : 's',
   18805                  imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
   18806                       nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   18807          } else {
   18808             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   18809             assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
   18810             imm = getUChar(delta+alen);
   18811             if (imm & ~15) goto decode_failure;
   18812             delta += alen+1;
   18813             DIP( "rounds%c $%d,%s,%s\n",
   18814                  isD ? 'd' : 's',
   18815                  imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   18816          }
   18817 
   18818          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   18819             that encoding is the same as the encoding for IRRoundingMode,
   18820             we can use that value directly in the IR as a rounding
   18821             mode. */
   18822          assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
   18823                            (imm & 4) ? get_sse_roundingmode()
   18824                                      : mkU32(imm & 3),
   18825                            mkexpr(src)) );
   18826 
   18827          if (isD)
   18828             putXMMRegLane64F( gregOfRexRM(pfx, modrm), 0, mkexpr(res) );
   18829          else
   18830             putXMMRegLane32F( gregOfRexRM(pfx, modrm), 0, mkexpr(res) );
   18831 
   18832          goto decode_success;
   18833       }
   18834       break;
   18835 
   18836    case 0x0C:
   18837       /* 66 0F 3A 0C /r ib = BLENDPS xmm1, xmm2/m128, imm8
   18838          Blend Packed Single Precision Floating-Point Values (XMM) */
   18839       if (have66noF2noF3(pfx) && sz == 2) {
   18840 
   18841          Int imm8;
   18842          IRTemp dst_vec = newTemp(Ity_V128);
   18843          IRTemp src_vec = newTemp(Ity_V128);
   18844 
   18845          modrm = getUChar(delta);
   18846 
   18847          assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
   18848 
   18849          if ( epartIsReg( modrm ) ) {
   18850             imm8 = (Int)getUChar(delta+1);
   18851             assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
   18852             delta += 1+1;
   18853             DIP( "blendps $%d, %s,%s\n", imm8,
   18854                  nameXMMReg( eregOfRexRM(pfx, modrm) ),
   18855                  nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   18856          } else {
   18857             addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   18858                              1/* imm8 is 1 byte after the amode */ );
   18859             gen_SEGV_if_not_16_aligned( addr );
   18860             assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
   18861             imm8 = (Int)getUChar(delta+alen);
   18862             delta += alen+1;
   18863             DIP( "blendpd $%d, %s,%s\n",
   18864                  imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   18865          }
   18866 
   18867          putXMMReg( gregOfRexRM(pfx, modrm),
   18868                     mkexpr( math_BLENDPS_128( src_vec, dst_vec, imm8) ) );
   18869          goto decode_success;
   18870       }
   18871       break;
   18872 
   18873    case 0x0D:
   18874       /* 66 0F 3A 0D /r ib = BLENDPD xmm1, xmm2/m128, imm8
   18875          Blend Packed Double Precision Floating-Point Values (XMM) */
   18876       if (have66noF2noF3(pfx) && sz == 2) {
   18877 
   18878          Int imm8;
   18879          IRTemp dst_vec = newTemp(Ity_V128);
   18880          IRTemp src_vec = newTemp(Ity_V128);
   18881 
   18882          modrm = getUChar(delta);
   18883          assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
   18884 
   18885          if ( epartIsReg( modrm ) ) {
   18886             imm8 = (Int)getUChar(delta+1);
   18887             assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
   18888             delta += 1+1;
   18889             DIP( "blendpd $%d, %s,%s\n", imm8,
   18890                  nameXMMReg( eregOfRexRM(pfx, modrm) ),
   18891                  nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   18892          } else {
   18893             addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   18894                              1/* imm8 is 1 byte after the amode */ );
   18895             gen_SEGV_if_not_16_aligned( addr );
   18896             assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
   18897             imm8 = (Int)getUChar(delta+alen);
   18898             delta += alen+1;
   18899             DIP( "blendpd $%d, %s,%s\n",
   18900                  imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   18901          }
   18902 
   18903          putXMMReg( gregOfRexRM(pfx, modrm),
   18904                     mkexpr( math_BLENDPD_128( src_vec, dst_vec, imm8) ) );
   18905          goto decode_success;
   18906       }
   18907       break;
   18908 
   18909    case 0x0E:
   18910       /* 66 0F 3A 0E /r ib = PBLENDW xmm1, xmm2/m128, imm8
   18911          Blend Packed Words (XMM) */
   18912       if (have66noF2noF3(pfx) && sz == 2) {
   18913 
   18914          Int imm8;
   18915          IRTemp dst_vec = newTemp(Ity_V128);
   18916          IRTemp src_vec = newTemp(Ity_V128);
   18917 
   18918          modrm = getUChar(delta);
   18919 
   18920          assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
   18921 
   18922          if ( epartIsReg( modrm ) ) {
   18923             imm8 = (Int)getUChar(delta+1);
   18924             assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
   18925             delta += 1+1;
   18926             DIP( "pblendw $%d, %s,%s\n", imm8,
   18927                  nameXMMReg( eregOfRexRM(pfx, modrm) ),
   18928                  nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   18929          } else {
   18930             addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   18931                              1/* imm8 is 1 byte after the amode */ );
   18932             gen_SEGV_if_not_16_aligned( addr );
   18933             assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
   18934             imm8 = (Int)getUChar(delta+alen);
   18935             delta += alen+1;
   18936             DIP( "pblendw $%d, %s,%s\n",
   18937                  imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
   18938          }
   18939 
   18940          putXMMReg( gregOfRexRM(pfx, modrm),
   18941                     mkexpr( math_PBLENDW_128( src_vec, dst_vec, imm8) ) );
   18942          goto decode_success;
   18943       }
   18944       break;
   18945 
   18946    case 0x14:
   18947       /* 66 0F 3A 14 /r ib = PEXTRB r/m16, xmm, imm8
   18948          Extract Byte from xmm, store in mem or zero-extend + store in gen.reg.
   18949          (XMM) */
   18950       if (have66noF2noF3(pfx) && sz == 2) {
   18951          delta = dis_PEXTRB_128_GtoE( vbi, pfx, delta, False/*!isAvx*/ );
   18952          goto decode_success;
   18953       }
   18954       break;
   18955 
   18956    case 0x15:
   18957       /* 66 0F 3A 15 /r ib = PEXTRW r/m16, xmm, imm8
   18958          Extract Word from xmm, store in mem or zero-extend + store in gen.reg.
   18959          (XMM) */
   18960       if (have66noF2noF3(pfx) && sz == 2) {
   18961          delta = dis_PEXTRW( vbi, pfx, delta, False/*!isAvx*/ );
   18962          goto decode_success;
   18963       }
   18964       break;
   18965 
   18966    case 0x16:
   18967       /* 66 no-REX.W 0F 3A 16 /r ib = PEXTRD reg/mem32, xmm2, imm8
   18968          Extract Doubleword int from xmm reg and store in gen.reg or mem. (XMM)
   18969          Note that this insn has the same opcodes as PEXTRQ, but
   18970          here the REX.W bit is _not_ present */
   18971       if (have66noF2noF3(pfx)
   18972           && sz == 2 /* REX.W is _not_ present */) {
   18973          delta = dis_PEXTRD( vbi, pfx, delta, False/*!isAvx*/ );
   18974          goto decode_success;
   18975       }
   18976       /* 66 REX.W 0F 3A 16 /r ib = PEXTRQ reg/mem64, xmm2, imm8
   18977          Extract Quadword int from xmm reg and store in gen.reg or mem. (XMM)
   18978          Note that this insn has the same opcodes as PEXTRD, but
   18979          here the REX.W bit is present */
   18980       if (have66noF2noF3(pfx)
   18981           && sz == 8 /* REX.W is present */) {
   18982          delta = dis_PEXTRQ( vbi, pfx, delta, False/*!isAvx*/);
   18983          goto decode_success;
   18984       }
   18985       break;
   18986 
   18987    case 0x17:
   18988       /* 66 0F 3A 17 /r ib = EXTRACTPS reg/mem32, xmm2, imm8 Extract
   18989          float from xmm reg and store in gen.reg or mem.  This is
   18990          identical to PEXTRD, except that REX.W appears to be ignored.
   18991       */
   18992       if (have66noF2noF3(pfx)
   18993           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   18994          delta = dis_EXTRACTPS( vbi, pfx, delta, False/*!isAvx*/ );
   18995          goto decode_success;
   18996       }
   18997       break;
   18998 
   18999    case 0x20:
   19000       /* 66 0F 3A 20 /r ib = PINSRB xmm1, r32/m8, imm8
   19001          Extract byte from r32/m8 and insert into xmm1 */
   19002       if (have66noF2noF3(pfx) && sz == 2) {
   19003          Int    imm8;
   19004          IRTemp new8 = newTemp(Ity_I8);
   19005          modrm = getUChar(delta);
   19006          UInt rG = gregOfRexRM(pfx, modrm);
   19007          if ( epartIsReg( modrm ) ) {
   19008             UInt rE = eregOfRexRM(pfx,modrm);
   19009             imm8 = (Int)(getUChar(delta+1) & 0xF);
   19010             assign( new8, unop(Iop_32to8, getIReg32(rE)) );
   19011             delta += 1+1;
   19012             DIP( "pinsrb $%d,%s,%s\n", imm8,
   19013                  nameIReg32(rE), nameXMMReg(rG) );
   19014          } else {
   19015             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   19016             imm8 = (Int)(getUChar(delta+alen) & 0xF);
   19017             assign( new8, loadLE( Ity_I8, mkexpr(addr) ) );
   19018             delta += alen+1;
   19019             DIP( "pinsrb $%d,%s,%s\n",
   19020                  imm8, dis_buf, nameXMMReg(rG) );
   19021          }
   19022          IRTemp src_vec = newTemp(Ity_V128);
   19023          assign(src_vec, getXMMReg( gregOfRexRM(pfx, modrm) ));
   19024          IRTemp res = math_PINSRB_128( src_vec, new8, imm8 );
   19025          putXMMReg( rG, mkexpr(res) );
   19026          goto decode_success;
   19027       }
   19028       break;
   19029 
   19030    case 0x21:
   19031       /* 66 0F 3A 21 /r ib = INSERTPS imm8, xmm2/m32, xmm1
   19032          Insert Packed Single Precision Floating-Point Value (XMM) */
   19033       if (have66noF2noF3(pfx) && sz == 2) {
   19034          UInt   imm8;
   19035          IRTemp d2ins = newTemp(Ity_I32); /* comes from the E part */
   19036          const IRTemp inval = IRTemp_INVALID;
   19037 
   19038          modrm = getUChar(delta);
   19039          UInt rG = gregOfRexRM(pfx, modrm);
   19040 
   19041          if ( epartIsReg( modrm ) ) {
   19042             UInt   rE = eregOfRexRM(pfx, modrm);
   19043             IRTemp vE = newTemp(Ity_V128);
   19044             assign( vE, getXMMReg(rE) );
   19045             IRTemp dsE[4] = { inval, inval, inval, inval };
   19046             breakupV128to32s( vE, &dsE[3], &dsE[2], &dsE[1], &dsE[0] );
   19047             imm8 = getUChar(delta+1);
   19048             d2ins = dsE[(imm8 >> 6) & 3]; /* "imm8_count_s" */
   19049             delta += 1+1;
   19050             DIP( "insertps $%u, %s,%s\n",
   19051                  imm8, nameXMMReg(rE), nameXMMReg(rG) );
   19052          } else {
   19053             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   19054             assign( d2ins, loadLE( Ity_I32, mkexpr(addr) ) );
   19055             imm8 = getUChar(delta+alen);
   19056             delta += alen+1;
   19057             DIP( "insertps $%u, %s,%s\n",
   19058                  imm8, dis_buf, nameXMMReg(rG) );
   19059          }
   19060 
   19061          IRTemp vG = newTemp(Ity_V128);
   19062          assign( vG, getXMMReg(rG) );
   19063 
   19064          putXMMReg( rG, mkexpr(math_INSERTPS( vG, d2ins, imm8 )) );
   19065          goto decode_success;
   19066       }
   19067       break;
   19068 
   19069    case 0x22:
   19070       /* 66 no-REX.W 0F 3A 22 /r ib = PINSRD xmm1, r/m32, imm8
   19071          Extract Doubleword int from gen.reg/mem32 and insert into xmm1 */
   19072       if (have66noF2noF3(pfx)
   19073           && sz == 2 /* REX.W is NOT present */) {
   19074          Int    imm8_10;
   19075          IRTemp src_u32 = newTemp(Ity_I32);
   19076          modrm = getUChar(delta);
   19077          UInt rG = gregOfRexRM(pfx, modrm);
   19078 
   19079          if ( epartIsReg( modrm ) ) {
   19080             UInt rE = eregOfRexRM(pfx,modrm);
   19081             imm8_10 = (Int)(getUChar(delta+1) & 3);
   19082             assign( src_u32, getIReg32( rE ) );
   19083             delta += 1+1;
   19084             DIP( "pinsrd $%d, %s,%s\n",
   19085                  imm8_10, nameIReg32(rE), nameXMMReg(rG) );
   19086          } else {
   19087             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   19088             imm8_10 = (Int)(getUChar(delta+alen) & 3);
   19089             assign( src_u32, loadLE( Ity_I32, mkexpr(addr) ) );
   19090             delta += alen+1;
   19091             DIP( "pinsrd $%d, %s,%s\n",
   19092                  imm8_10, dis_buf, nameXMMReg(rG) );
   19093          }
   19094 
   19095          IRTemp src_vec = newTemp(Ity_V128);
   19096          assign(src_vec, getXMMReg( rG ));
   19097          IRTemp res_vec = math_PINSRD_128( src_vec, src_u32, imm8_10 );
   19098          putXMMReg( rG, mkexpr(res_vec) );
   19099          goto decode_success;
   19100       }
   19101       /* 66 REX.W 0F 3A 22 /r ib = PINSRQ xmm1, r/m64, imm8
   19102          Extract Quadword int from gen.reg/mem64 and insert into xmm1 */
   19103       if (have66noF2noF3(pfx)
   19104           && sz == 8 /* REX.W is present */) {
   19105          Int imm8_0;
   19106          IRTemp src_u64 = newTemp(Ity_I64);
   19107          modrm = getUChar(delta);
   19108          UInt rG = gregOfRexRM(pfx, modrm);
   19109 
   19110          if ( epartIsReg( modrm ) ) {
   19111             UInt rE = eregOfRexRM(pfx,modrm);
   19112             imm8_0 = (Int)(getUChar(delta+1) & 1);
   19113             assign( src_u64, getIReg64( rE ) );
   19114             delta += 1+1;
   19115             DIP( "pinsrq $%d, %s,%s\n",
   19116                  imm8_0, nameIReg64(rE), nameXMMReg(rG) );
   19117          } else {
   19118             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   19119             imm8_0 = (Int)(getUChar(delta+alen) & 1);
   19120             assign( src_u64, loadLE( Ity_I64, mkexpr(addr) ) );
   19121             delta += alen+1;
   19122             DIP( "pinsrq $%d, %s,%s\n",
   19123                  imm8_0, dis_buf, nameXMMReg(rG) );
   19124          }
   19125 
   19126          IRTemp src_vec = newTemp(Ity_V128);
   19127          assign(src_vec, getXMMReg( rG ));
   19128          IRTemp res_vec = math_PINSRQ_128( src_vec, src_u64, imm8_0 );
   19129          putXMMReg( rG, mkexpr(res_vec) );
   19130          goto decode_success;
   19131       }
   19132       break;
   19133 
   19134    case 0x40:
   19135       /* 66 0F 3A 40 /r ib = DPPS xmm1, xmm2/m128, imm8
   19136          Dot Product of Packed Single Precision Floating-Point Values (XMM) */
   19137       if (have66noF2noF3(pfx) && sz == 2) {
   19138          modrm = getUChar(delta);
   19139          Int    imm8;
   19140          IRTemp src_vec = newTemp(Ity_V128);
   19141          IRTemp dst_vec = newTemp(Ity_V128);
   19142          UInt   rG      = gregOfRexRM(pfx, modrm);
   19143          assign( dst_vec, getXMMReg( rG ) );
   19144          if ( epartIsReg( modrm ) ) {
   19145             UInt rE = eregOfRexRM(pfx, modrm);
   19146             imm8 = (Int)getUChar(delta+1);
   19147             assign( src_vec, getXMMReg(rE) );
   19148             delta += 1+1;
   19149             DIP( "dpps $%d, %s,%s\n",
   19150                  imm8, nameXMMReg(rE), nameXMMReg(rG) );
   19151          } else {
   19152             addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   19153                              1/* imm8 is 1 byte after the amode */ );
   19154             gen_SEGV_if_not_16_aligned( addr );
   19155             assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
   19156             imm8 = (Int)getUChar(delta+alen);
   19157             delta += alen+1;
   19158             DIP( "dpps $%d, %s,%s\n",
   19159                  imm8, dis_buf, nameXMMReg(rG) );
   19160          }
   19161          IRTemp res = math_DPPS_128( src_vec, dst_vec, imm8 );
   19162          putXMMReg( rG, mkexpr(res) );
   19163          goto decode_success;
   19164       }
   19165       break;
   19166 
   19167    case 0x41:
   19168       /* 66 0F 3A 41 /r ib = DPPD xmm1, xmm2/m128, imm8
   19169          Dot Product of Packed Double Precision Floating-Point Values (XMM) */
   19170       if (have66noF2noF3(pfx) && sz == 2) {
   19171          modrm = getUChar(delta);
   19172          Int    imm8;
   19173          IRTemp src_vec = newTemp(Ity_V128);
   19174          IRTemp dst_vec = newTemp(Ity_V128);
   19175          UInt   rG      = gregOfRexRM(pfx, modrm);
   19176          assign( dst_vec, getXMMReg( rG ) );
   19177          if ( epartIsReg( modrm ) ) {
   19178             UInt rE = eregOfRexRM(pfx, modrm);
   19179             imm8 = (Int)getUChar(delta+1);
   19180             assign( src_vec, getXMMReg(rE) );
   19181             delta += 1+1;
   19182             DIP( "dppd $%d, %s,%s\n",
   19183                  imm8, nameXMMReg(rE), nameXMMReg(rG) );
   19184          } else {
   19185             addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   19186                              1/* imm8 is 1 byte after the amode */ );
   19187             gen_SEGV_if_not_16_aligned( addr );
   19188             assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
   19189             imm8 = (Int)getUChar(delta+alen);
   19190             delta += alen+1;
   19191             DIP( "dppd $%d, %s,%s\n",
   19192                  imm8, dis_buf, nameXMMReg(rG) );
   19193          }
   19194          IRTemp res = math_DPPD_128( src_vec, dst_vec, imm8 );
   19195          putXMMReg( rG, mkexpr(res) );
   19196          goto decode_success;
   19197       }
   19198       break;
   19199 
   19200    case 0x42:
   19201       /* 66 0F 3A 42 /r ib = MPSADBW xmm1, xmm2/m128, imm8
   19202          Multiple Packed Sums of Absolule Difference (XMM) */
   19203       if (have66noF2noF3(pfx) && sz == 2) {
   19204          Int    imm8;
   19205          IRTemp src_vec = newTemp(Ity_V128);
   19206          IRTemp dst_vec = newTemp(Ity_V128);
   19207          modrm          = getUChar(delta);
   19208          UInt   rG      = gregOfRexRM(pfx, modrm);
   19209 
   19210          assign( dst_vec, getXMMReg(rG) );
   19211 
   19212          if ( epartIsReg( modrm ) ) {
   19213             UInt rE = eregOfRexRM(pfx, modrm);
   19214 
   19215             imm8 = (Int)getUChar(delta+1);
   19216             assign( src_vec, getXMMReg(rE) );
   19217             delta += 1+1;
   19218             DIP( "mpsadbw $%d, %s,%s\n", imm8,
   19219                  nameXMMReg(rE), nameXMMReg(rG) );
   19220          } else {
   19221             addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   19222                              1/* imm8 is 1 byte after the amode */ );
   19223             gen_SEGV_if_not_16_aligned( addr );
   19224             assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
   19225             imm8 = (Int)getUChar(delta+alen);
   19226             delta += alen+1;
   19227             DIP( "mpsadbw $%d, %s,%s\n", imm8, dis_buf, nameXMMReg(rG) );
   19228          }
   19229 
   19230          putXMMReg( rG, mkexpr( math_MPSADBW_128(dst_vec, src_vec, imm8) ) );
   19231          goto decode_success;
   19232       }
   19233       break;
   19234 
   19235    case 0x44:
   19236       /* 66 0F 3A 44 /r ib = PCLMULQDQ xmm1, xmm2/m128, imm8
   19237        * Carry-less multiplication of selected XMM quadwords into XMM
   19238        * registers (a.k.a multiplication of polynomials over GF(2))
   19239        */
   19240       if (have66noF2noF3(pfx) && sz == 2) {
   19241 
   19242          Int imm8;
   19243          IRTemp svec = newTemp(Ity_V128);
   19244          IRTemp dvec = newTemp(Ity_V128);
   19245          modrm       = getUChar(delta);
   19246          UInt   rG   = gregOfRexRM(pfx, modrm);
   19247 
   19248          assign( dvec, getXMMReg(rG) );
   19249 
   19250          if ( epartIsReg( modrm ) ) {
   19251             UInt rE = eregOfRexRM(pfx, modrm);
   19252             imm8 = (Int)getUChar(delta+1);
   19253             assign( svec, getXMMReg(rE) );
   19254             delta += 1+1;
   19255             DIP( "pclmulqdq $%d, %s,%s\n", imm8,
   19256                  nameXMMReg(rE), nameXMMReg(rG) );
   19257          } else {
   19258             addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   19259                              1/* imm8 is 1 byte after the amode */ );
   19260             gen_SEGV_if_not_16_aligned( addr );
   19261             assign( svec, loadLE( Ity_V128, mkexpr(addr) ) );
   19262             imm8 = (Int)getUChar(delta+alen);
   19263             delta += alen+1;
   19264             DIP( "pclmulqdq $%d, %s,%s\n",
   19265                  imm8, dis_buf, nameXMMReg(rG) );
   19266          }
   19267 
   19268          putXMMReg( rG, mkexpr( math_PCLMULQDQ(dvec, svec, imm8) ) );
   19269          goto decode_success;
   19270       }
   19271       break;
   19272 
   19273    case 0x60:
   19274    case 0x61:
   19275    case 0x62:
   19276    case 0x63:
   19277       /* 66 0F 3A 63 /r ib = PCMPISTRI imm8, xmm2/m128, xmm1
   19278          66 0F 3A 62 /r ib = PCMPISTRM imm8, xmm2/m128, xmm1
   19279          66 0F 3A 61 /r ib = PCMPESTRI imm8, xmm2/m128, xmm1
   19280          66 0F 3A 60 /r ib = PCMPESTRM imm8, xmm2/m128, xmm1
   19281          (selected special cases that actually occur in glibc,
   19282           not by any means a complete implementation.)
   19283       */
   19284       if (have66noF2noF3(pfx) && sz == 2) {
   19285          Long delta0 = delta;
   19286          delta = dis_PCMPxSTRx( vbi, pfx, delta, False/*!isAvx*/, opc );
   19287          if (delta > delta0) goto decode_success;
   19288          /* else fall through; dis_PCMPxSTRx failed to decode it */
   19289       }
   19290       break;
   19291 
   19292    case 0xDF:
   19293       /* 66 0F 3A DF /r ib = AESKEYGENASSIST imm8, xmm2/m128, xmm1 */
   19294       if (have66noF2noF3(pfx) && sz == 2) {
   19295          delta = dis_AESKEYGENASSIST( vbi, pfx, delta, False/*!isAvx*/ );
   19296          goto decode_success;
   19297       }
   19298       break;
   19299 
   19300    default:
   19301       break;
   19302 
   19303    }
   19304 
   19305   decode_failure:
   19306    *decode_OK = False;
   19307    return deltaIN;
   19308 
   19309   decode_success:
   19310    *decode_OK = True;
   19311    return delta;
   19312 }
   19313 
   19314 
   19315 /*------------------------------------------------------------*/
   19316 /*---                                                      ---*/
   19317 /*--- Top-level post-escape decoders: dis_ESC_NONE         ---*/
   19318 /*---                                                      ---*/
   19319 /*------------------------------------------------------------*/
   19320 
   19321 __attribute__((noinline))
   19322 static
   19323 Long dis_ESC_NONE (
   19324         /*MB_OUT*/DisResult* dres,
   19325         /*MB_OUT*/Bool*      expect_CAS,
   19326         Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
   19327         Bool         resteerCisOk,
   19328         void*        callback_opaque,
   19329         VexArchInfo* archinfo,
   19330         VexAbiInfo*  vbi,
   19331         Prefix pfx, Int sz, Long deltaIN
   19332      )
   19333 {
   19334    Long   d64   = 0;
   19335    UChar  abyte = 0;
   19336    IRTemp addr  = IRTemp_INVALID;
   19337    IRTemp t1    = IRTemp_INVALID;
   19338    IRTemp t2    = IRTemp_INVALID;
   19339    IRTemp t3    = IRTemp_INVALID;
   19340    IRTemp t4    = IRTemp_INVALID;
   19341    IRTemp t5    = IRTemp_INVALID;
   19342    IRType ty    = Ity_INVALID;
   19343    UChar  modrm = 0;
   19344    Int    am_sz = 0;
   19345    Int    d_sz  = 0;
   19346    Int    alen  = 0;
   19347    HChar  dis_buf[50];
   19348 
   19349    Long   delta = deltaIN;
   19350    UChar  opc   = getUChar(delta); delta++;
   19351 
   19352    /* delta now points at the modrm byte.  In most of the cases that
   19353       follow, neither the F2 nor F3 prefixes are allowed.  However,
   19354       for some basic arithmetic operations we have to allow F2/XACQ or
   19355       F3/XREL in the case where the destination is memory and the LOCK
   19356       prefix is also present.  Do this check by looking at the modrm
   19357       byte but not advancing delta over it. */
   19358    /* By default, F2 and F3 are not allowed, so let's start off with
   19359       that setting. */
   19360    Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
   19361    { UChar tmp_modrm = getUChar(delta);
   19362      switch (opc) {
   19363         case 0x00: /* ADD Gb,Eb */  case 0x01: /* ADD Gv,Ev */
   19364         case 0x08: /* OR  Gb,Eb */  case 0x09: /* OR  Gv,Ev */
   19365         case 0x10: /* ADC Gb,Eb */  case 0x11: /* ADC Gv,Ev */
   19366         case 0x18: /* SBB Gb,Eb */  case 0x19: /* SBB Gv,Ev */
   19367         case 0x20: /* AND Gb,Eb */  case 0x21: /* AND Gv,Ev */
   19368         case 0x28: /* SUB Gb,Eb */  case 0x29: /* SUB Gv,Ev */
   19369         case 0x30: /* XOR Gb,Eb */  case 0x31: /* XOR Gv,Ev */
   19370            if (!epartIsReg(tmp_modrm)
   19371                && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
   19372               /* dst is mem, and we have F2 or F3 but not both */
   19373               validF2orF3 = True;
   19374            }
   19375            break;
   19376         default:
   19377            break;
   19378      }
   19379    }
   19380 
   19381    /* Now, in the switch below, for the opc values examined by the
   19382       switch above, use validF2orF3 rather than looking at pfx
   19383       directly. */
   19384    switch (opc) {
   19385 
   19386    case 0x00: /* ADD Gb,Eb */
   19387       if (!validF2orF3) goto decode_failure;
   19388       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Add8, True, 1, delta, "add" );
   19389       return delta;
   19390    case 0x01: /* ADD Gv,Ev */
   19391       if (!validF2orF3) goto decode_failure;
   19392       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Add8, True, sz, delta, "add" );
   19393       return delta;
   19394 
   19395    case 0x02: /* ADD Eb,Gb */
   19396       if (haveF2orF3(pfx)) goto decode_failure;
   19397       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Add8, True, 1, delta, "add" );
   19398       return delta;
   19399    case 0x03: /* ADD Ev,Gv */
   19400       if (haveF2orF3(pfx)) goto decode_failure;
   19401       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Add8, True, sz, delta, "add" );
   19402       return delta;
   19403 
   19404    case 0x04: /* ADD Ib, AL */
   19405       if (haveF2orF3(pfx)) goto decode_failure;
   19406       delta = dis_op_imm_A( 1, False, Iop_Add8, True, delta, "add" );
   19407       return delta;
   19408    case 0x05: /* ADD Iv, eAX */
   19409       if (haveF2orF3(pfx)) goto decode_failure;
   19410       delta = dis_op_imm_A(sz, False, Iop_Add8, True, delta, "add" );
   19411       return delta;
   19412 
   19413    case 0x08: /* OR Gb,Eb */
   19414       if (!validF2orF3) goto decode_failure;
   19415       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Or8, True, 1, delta, "or" );
   19416       return delta;
   19417    case 0x09: /* OR Gv,Ev */
   19418       if (!validF2orF3) goto decode_failure;
   19419       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Or8, True, sz, delta, "or" );
   19420       return delta;
   19421 
   19422    case 0x0A: /* OR Eb,Gb */
   19423       if (haveF2orF3(pfx)) goto decode_failure;
   19424       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Or8, True, 1, delta, "or" );
   19425       return delta;
   19426    case 0x0B: /* OR Ev,Gv */
   19427       if (haveF2orF3(pfx)) goto decode_failure;
   19428       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Or8, True, sz, delta, "or" );
   19429       return delta;
   19430 
   19431    case 0x0C: /* OR Ib, AL */
   19432       if (haveF2orF3(pfx)) goto decode_failure;
   19433       delta = dis_op_imm_A( 1, False, Iop_Or8, True, delta, "or" );
   19434       return delta;
   19435    case 0x0D: /* OR Iv, eAX */
   19436       if (haveF2orF3(pfx)) goto decode_failure;
   19437       delta = dis_op_imm_A( sz, False, Iop_Or8, True, delta, "or" );
   19438       return delta;
   19439 
   19440    case 0x10: /* ADC Gb,Eb */
   19441       if (!validF2orF3) goto decode_failure;
   19442       delta = dis_op2_G_E ( vbi, pfx, True, Iop_Add8, True, 1, delta, "adc" );
   19443       return delta;
   19444    case 0x11: /* ADC Gv,Ev */
   19445       if (!validF2orF3) goto decode_failure;
   19446       delta = dis_op2_G_E ( vbi, pfx, True, Iop_Add8, True, sz, delta, "adc" );
   19447       return delta;
   19448 
   19449    case 0x12: /* ADC Eb,Gb */
   19450       if (haveF2orF3(pfx)) goto decode_failure;
   19451       delta = dis_op2_E_G ( vbi, pfx, True, Iop_Add8, True, 1, delta, "adc" );
   19452       return delta;
   19453    case 0x13: /* ADC Ev,Gv */
   19454       if (haveF2orF3(pfx)) goto decode_failure;
   19455       delta = dis_op2_E_G ( vbi, pfx, True, Iop_Add8, True, sz, delta, "adc" );
   19456       return delta;
   19457 
   19458    case 0x14: /* ADC Ib, AL */
   19459       if (haveF2orF3(pfx)) goto decode_failure;
   19460       delta = dis_op_imm_A( 1, True, Iop_Add8, True, delta, "adc" );
   19461       return delta;
   19462    case 0x15: /* ADC Iv, eAX */
   19463       if (haveF2orF3(pfx)) goto decode_failure;
   19464       delta = dis_op_imm_A( sz, True, Iop_Add8, True, delta, "adc" );
   19465       return delta;
   19466 
   19467    case 0x18: /* SBB Gb,Eb */
   19468       if (!validF2orF3) goto decode_failure;
   19469       delta = dis_op2_G_E ( vbi, pfx, True, Iop_Sub8, True, 1, delta, "sbb" );
   19470       return delta;
   19471    case 0x19: /* SBB Gv,Ev */
   19472       if (!validF2orF3) goto decode_failure;
   19473       delta = dis_op2_G_E ( vbi, pfx, True, Iop_Sub8, True, sz, delta, "sbb" );
   19474       return delta;
   19475 
   19476    case 0x1A: /* SBB Eb,Gb */
   19477       if (haveF2orF3(pfx)) goto decode_failure;
   19478       delta = dis_op2_E_G ( vbi, pfx, True, Iop_Sub8, True, 1, delta, "sbb" );
   19479       return delta;
   19480    case 0x1B: /* SBB Ev,Gv */
   19481       if (haveF2orF3(pfx)) goto decode_failure;
   19482       delta = dis_op2_E_G ( vbi, pfx, True, Iop_Sub8, True, sz, delta, "sbb" );
   19483       return delta;
   19484 
   19485    case 0x1C: /* SBB Ib, AL */
   19486       if (haveF2orF3(pfx)) goto decode_failure;
   19487       delta = dis_op_imm_A( 1, True, Iop_Sub8, True, delta, "sbb" );
   19488       return delta;
   19489    case 0x1D: /* SBB Iv, eAX */
   19490       if (haveF2orF3(pfx)) goto decode_failure;
   19491       delta = dis_op_imm_A( sz, True, Iop_Sub8, True, delta, "sbb" );
   19492       return delta;
   19493 
   19494    case 0x20: /* AND Gb,Eb */
   19495       if (!validF2orF3) goto decode_failure;
   19496       delta = dis_op2_G_E ( vbi, pfx, False, Iop_And8, True, 1, delta, "and" );
   19497       return delta;
   19498    case 0x21: /* AND Gv,Ev */
   19499       if (!validF2orF3) goto decode_failure;
   19500       delta = dis_op2_G_E ( vbi, pfx, False, Iop_And8, True, sz, delta, "and" );
   19501       return delta;
   19502 
   19503    case 0x22: /* AND Eb,Gb */
   19504       if (haveF2orF3(pfx)) goto decode_failure;
   19505       delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, True, 1, delta, "and" );
   19506       return delta;
   19507    case 0x23: /* AND Ev,Gv */
   19508       if (haveF2orF3(pfx)) goto decode_failure;
   19509       delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, True, sz, delta, "and" );
   19510       return delta;
   19511 
   19512    case 0x24: /* AND Ib, AL */
   19513       if (haveF2orF3(pfx)) goto decode_failure;
   19514       delta = dis_op_imm_A( 1, False, Iop_And8, True, delta, "and" );
   19515       return delta;
   19516    case 0x25: /* AND Iv, eAX */
   19517       if (haveF2orF3(pfx)) goto decode_failure;
   19518       delta = dis_op_imm_A( sz, False, Iop_And8, True, delta, "and" );
   19519       return delta;
   19520 
   19521    case 0x28: /* SUB Gb,Eb */
   19522       if (!validF2orF3) goto decode_failure;
   19523       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, True, 1, delta, "sub" );
   19524       return delta;
   19525    case 0x29: /* SUB Gv,Ev */
   19526       if (!validF2orF3) goto decode_failure;
   19527       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, True, sz, delta, "sub" );
   19528       return delta;
   19529 
   19530    case 0x2A: /* SUB Eb,Gb */
   19531       if (haveF2orF3(pfx)) goto decode_failure;
   19532       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, True, 1, delta, "sub" );
   19533       return delta;
   19534    case 0x2B: /* SUB Ev,Gv */
   19535       if (haveF2orF3(pfx)) goto decode_failure;
   19536       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, True, sz, delta, "sub" );
   19537       return delta;
   19538 
   19539    case 0x2C: /* SUB Ib, AL */
   19540       if (haveF2orF3(pfx)) goto decode_failure;
   19541       delta = dis_op_imm_A(1, False, Iop_Sub8, True, delta, "sub" );
   19542       return delta;
   19543    case 0x2D: /* SUB Iv, eAX */
   19544       if (haveF2orF3(pfx)) goto decode_failure;
   19545       delta = dis_op_imm_A( sz, False, Iop_Sub8, True, delta, "sub" );
   19546       return delta;
   19547 
   19548    case 0x30: /* XOR Gb,Eb */
   19549       if (!validF2orF3) goto decode_failure;
   19550       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Xor8, True, 1, delta, "xor" );
   19551       return delta;
   19552    case 0x31: /* XOR Gv,Ev */
   19553       if (!validF2orF3) goto decode_failure;
   19554       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Xor8, True, sz, delta, "xor" );
   19555       return delta;
   19556 
   19557    case 0x32: /* XOR Eb,Gb */
   19558       if (haveF2orF3(pfx)) goto decode_failure;
   19559       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Xor8, True, 1, delta, "xor" );
   19560       return delta;
   19561    case 0x33: /* XOR Ev,Gv */
   19562       if (haveF2orF3(pfx)) goto decode_failure;
   19563       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Xor8, True, sz, delta, "xor" );
   19564       return delta;
   19565 
   19566    case 0x34: /* XOR Ib, AL */
   19567       if (haveF2orF3(pfx)) goto decode_failure;
   19568       delta = dis_op_imm_A( 1, False, Iop_Xor8, True, delta, "xor" );
   19569       return delta;
   19570    case 0x35: /* XOR Iv, eAX */
   19571       if (haveF2orF3(pfx)) goto decode_failure;
   19572       delta = dis_op_imm_A( sz, False, Iop_Xor8, True, delta, "xor" );
   19573       return delta;
   19574 
   19575    case 0x38: /* CMP Gb,Eb */
   19576       if (haveF2orF3(pfx)) goto decode_failure;
   19577       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, False, 1, delta, "cmp" );
   19578       return delta;
   19579    case 0x39: /* CMP Gv,Ev */
   19580       if (haveF2orF3(pfx)) goto decode_failure;
   19581       delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, False, sz, delta, "cmp" );
   19582       return delta;
   19583 
   19584    case 0x3A: /* CMP Eb,Gb */
   19585       if (haveF2orF3(pfx)) goto decode_failure;
   19586       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, False, 1, delta, "cmp" );
   19587       return delta;
   19588    case 0x3B: /* CMP Ev,Gv */
   19589       if (haveF2orF3(pfx)) goto decode_failure;
   19590       delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, False, sz, delta, "cmp" );
   19591       return delta;
   19592 
   19593    case 0x3C: /* CMP Ib, AL */
   19594       if (haveF2orF3(pfx)) goto decode_failure;
   19595       delta = dis_op_imm_A( 1, False, Iop_Sub8, False, delta, "cmp" );
   19596       return delta;
   19597    case 0x3D: /* CMP Iv, eAX */
   19598       if (haveF2orF3(pfx)) goto decode_failure;
   19599       delta = dis_op_imm_A( sz, False, Iop_Sub8, False, delta, "cmp" );
   19600       return delta;
   19601 
   19602    case 0x50: /* PUSH eAX */
   19603    case 0x51: /* PUSH eCX */
   19604    case 0x52: /* PUSH eDX */
   19605    case 0x53: /* PUSH eBX */
   19606    case 0x55: /* PUSH eBP */
   19607    case 0x56: /* PUSH eSI */
   19608    case 0x57: /* PUSH eDI */
   19609    case 0x54: /* PUSH eSP */
   19610       /* This is the Right Way, in that the value to be pushed is
   19611          established before %rsp is changed, so that pushq %rsp
   19612          correctly pushes the old value. */
   19613       if (haveF2orF3(pfx)) goto decode_failure;
   19614       vassert(sz == 2 || sz == 4 || sz == 8);
   19615       if (sz == 4)
   19616          sz = 8; /* there is no encoding for 32-bit push in 64-bit mode */
   19617       ty = sz==2 ? Ity_I16 : Ity_I64;
   19618       t1 = newTemp(ty);
   19619       t2 = newTemp(Ity_I64);
   19620       assign(t1, getIRegRexB(sz, pfx, opc-0x50));
   19621       assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(sz)));
   19622       putIReg64(R_RSP, mkexpr(t2) );
   19623       storeLE(mkexpr(t2),mkexpr(t1));
   19624       DIP("push%c %s\n", nameISize(sz), nameIRegRexB(sz,pfx,opc-0x50));
   19625       return delta;
   19626 
   19627    case 0x58: /* POP eAX */
   19628    case 0x59: /* POP eCX */
   19629    case 0x5A: /* POP eDX */
   19630    case 0x5B: /* POP eBX */
   19631    case 0x5D: /* POP eBP */
   19632    case 0x5E: /* POP eSI */
   19633    case 0x5F: /* POP eDI */
   19634    case 0x5C: /* POP eSP */
   19635       if (haveF2orF3(pfx)) goto decode_failure;
   19636       vassert(sz == 2 || sz == 4 || sz == 8);
   19637       if (sz == 4)
   19638          sz = 8; /* there is no encoding for 32-bit pop in 64-bit mode */
   19639       t1 = newTemp(szToITy(sz));
   19640       t2 = newTemp(Ity_I64);
   19641       assign(t2, getIReg64(R_RSP));
   19642       assign(t1, loadLE(szToITy(sz),mkexpr(t2)));
   19643       putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t2), mkU64(sz)));
   19644       putIRegRexB(sz, pfx, opc-0x58, mkexpr(t1));
   19645       DIP("pop%c %s\n", nameISize(sz), nameIRegRexB(sz,pfx,opc-0x58));
   19646       return delta;
   19647 
   19648    case 0x63: /* MOVSX */
   19649       if (haveF2orF3(pfx)) goto decode_failure;
   19650       if (haveREX(pfx) && 1==getRexW(pfx)) {
   19651          vassert(sz == 8);
   19652          /* movsx r/m32 to r64 */
   19653          modrm = getUChar(delta);
   19654          if (epartIsReg(modrm)) {
   19655             delta++;
   19656             putIRegG(8, pfx, modrm,
   19657                              unop(Iop_32Sto64,
   19658                                   getIRegE(4, pfx, modrm)));
   19659             DIP("movslq %s,%s\n",
   19660                 nameIRegE(4, pfx, modrm),
   19661                 nameIRegG(8, pfx, modrm));
   19662             return delta;
   19663          } else {
   19664             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   19665             delta += alen;
   19666             putIRegG(8, pfx, modrm,
   19667                              unop(Iop_32Sto64,
   19668                                   loadLE(Ity_I32, mkexpr(addr))));
   19669             DIP("movslq %s,%s\n", dis_buf,
   19670                 nameIRegG(8, pfx, modrm));
   19671             return delta;
   19672          }
   19673       } else {
   19674          goto decode_failure;
   19675       }
   19676 
   19677    case 0x68: /* PUSH Iv */
   19678       if (haveF2orF3(pfx)) goto decode_failure;
   19679       /* Note, sz==4 is not possible in 64-bit mode.  Hence ... */
   19680       if (sz == 4) sz = 8;
   19681       d64 = getSDisp(imin(4,sz),delta);
   19682       delta += imin(4,sz);
   19683       goto do_push_I;
   19684 
   19685    case 0x69: /* IMUL Iv, Ev, Gv */
   19686       if (haveF2orF3(pfx)) goto decode_failure;
   19687       delta = dis_imul_I_E_G ( vbi, pfx, sz, delta, sz );
   19688       return delta;
   19689 
   19690    case 0x6A: /* PUSH Ib, sign-extended to sz */
   19691       if (haveF2orF3(pfx)) goto decode_failure;
   19692       /* Note, sz==4 is not possible in 64-bit mode.  Hence ... */
   19693       if (sz == 4) sz = 8;
   19694       d64 = getSDisp8(delta); delta += 1;
   19695       goto do_push_I;
   19696    do_push_I:
   19697       ty = szToITy(sz);
   19698       t1 = newTemp(Ity_I64);
   19699       t2 = newTemp(ty);
   19700       assign( t1, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
   19701       putIReg64(R_RSP, mkexpr(t1) );
   19702       /* stop mkU16 asserting if d64 is a negative 16-bit number
   19703          (bug #132813) */
   19704       if (ty == Ity_I16)
   19705          d64 &= 0xFFFF;
   19706       storeLE( mkexpr(t1), mkU(ty,d64) );
   19707       DIP("push%c $%lld\n", nameISize(sz), (Long)d64);
   19708       return delta;
   19709 
   19710    case 0x6B: /* IMUL Ib, Ev, Gv */
   19711       delta = dis_imul_I_E_G ( vbi, pfx, sz, delta, 1 );
   19712       return delta;
   19713 
   19714    case 0x70:
   19715    case 0x71:
   19716    case 0x72:   /* JBb/JNAEb (jump below) */
   19717    case 0x73:   /* JNBb/JAEb (jump not below) */
   19718    case 0x74:   /* JZb/JEb (jump zero) */
   19719    case 0x75:   /* JNZb/JNEb (jump not zero) */
   19720    case 0x76:   /* JBEb/JNAb (jump below or equal) */
   19721    case 0x77:   /* JNBEb/JAb (jump not below or equal) */
   19722    case 0x78:   /* JSb (jump negative) */
   19723    case 0x79:   /* JNSb (jump not negative) */
   19724    case 0x7A:   /* JP (jump parity even) */
   19725    case 0x7B:   /* JNP/JPO (jump parity odd) */
   19726    case 0x7C:   /* JLb/JNGEb (jump less) */
   19727    case 0x7D:   /* JGEb/JNLb (jump greater or equal) */
   19728    case 0x7E:   /* JLEb/JNGb (jump less or equal) */
   19729    case 0x7F: { /* JGb/JNLEb (jump greater) */
   19730       Long   jmpDelta;
   19731       const HChar* comment  = "";
   19732       if (haveF3(pfx)) goto decode_failure;
   19733       if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
   19734       jmpDelta = getSDisp8(delta);
   19735       vassert(-128 <= jmpDelta && jmpDelta < 128);
   19736       d64 = (guest_RIP_bbstart+delta+1) + jmpDelta;
   19737       delta++;
   19738       if (resteerCisOk
   19739           && vex_control.guest_chase_cond
   19740           && (Addr64)d64 != (Addr64)guest_RIP_bbstart
   19741           && jmpDelta < 0
   19742           && resteerOkFn( callback_opaque, d64) ) {
   19743          /* Speculation: assume this backward branch is taken.  So we
   19744             need to emit a side-exit to the insn following this one,
   19745             on the negation of the condition, and continue at the
   19746             branch target address (d64).  If we wind up back at the
   19747             first instruction of the trace, just stop; it's better to
   19748             let the IR loop unroller handle that case. */
   19749          stmt( IRStmt_Exit(
   19750                   mk_amd64g_calculate_condition(
   19751                      (AMD64Condcode)(1 ^ (opc - 0x70))),
   19752                   Ijk_Boring,
   19753                   IRConst_U64(guest_RIP_bbstart+delta),
   19754                   OFFB_RIP ) );
   19755          dres->whatNext   = Dis_ResteerC;
   19756          dres->continueAt = d64;
   19757          comment = "(assumed taken)";
   19758       }
   19759       else
   19760       if (resteerCisOk
   19761           && vex_control.guest_chase_cond
   19762           && (Addr64)d64 != (Addr64)guest_RIP_bbstart
   19763           && jmpDelta >= 0
   19764           && resteerOkFn( callback_opaque, guest_RIP_bbstart+delta ) ) {
   19765          /* Speculation: assume this forward branch is not taken.  So
   19766             we need to emit a side-exit to d64 (the dest) and continue
   19767             disassembling at the insn immediately following this
   19768             one. */
   19769          stmt( IRStmt_Exit(
   19770                   mk_amd64g_calculate_condition((AMD64Condcode)(opc - 0x70)),
   19771                   Ijk_Boring,
   19772                   IRConst_U64(d64),
   19773                   OFFB_RIP ) );
   19774          dres->whatNext   = Dis_ResteerC;
   19775          dres->continueAt = guest_RIP_bbstart+delta;
   19776          comment = "(assumed not taken)";
   19777       }
   19778       else {
   19779          /* Conservative default translation - end the block at this
   19780             point. */
   19781          jcc_01( dres, (AMD64Condcode)(opc - 0x70),
   19782                  guest_RIP_bbstart+delta, d64 );
   19783          vassert(dres->whatNext == Dis_StopHere);
   19784       }
   19785       DIP("j%s-8 0x%llx %s\n", name_AMD64Condcode(opc - 0x70), d64, comment);
   19786       return delta;
   19787    }
   19788 
   19789    case 0x80: /* Grp1 Ib,Eb */
   19790       modrm = getUChar(delta);
   19791       /* Disallow F2/XACQ and F3/XREL for the non-mem case.  Allow
   19792          just one for the mem case and also require LOCK in this case.
   19793          Note that this erroneously allows XACQ/XREL on CMP since we
   19794          don't check the subopcode here.  No big deal. */
   19795       if (epartIsReg(modrm) && haveF2orF3(pfx))
   19796          goto decode_failure;
   19797       if (!epartIsReg(modrm) && haveF2andF3(pfx))
   19798          goto decode_failure;
   19799       if (!epartIsReg(modrm) && haveF2orF3(pfx) && !haveLOCK(pfx))
   19800          goto decode_failure;
   19801       am_sz = lengthAMode(pfx,delta);
   19802       sz    = 1;
   19803       d_sz  = 1;
   19804       d64   = getSDisp8(delta + am_sz);
   19805       delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
   19806       return delta;
   19807 
   19808    case 0x81: /* Grp1 Iv,Ev */
   19809       modrm = getUChar(delta);
   19810       /* Same comment as for case 0x80 just above. */
   19811       if (epartIsReg(modrm) && haveF2orF3(pfx))
   19812          goto decode_failure;
   19813       if (!epartIsReg(modrm) && haveF2andF3(pfx))
   19814          goto decode_failure;
   19815       if (!epartIsReg(modrm) && haveF2orF3(pfx) && !haveLOCK(pfx))
   19816          goto decode_failure;
   19817       am_sz = lengthAMode(pfx,delta);
   19818       d_sz  = imin(sz,4);
   19819       d64   = getSDisp(d_sz, delta + am_sz);
   19820       delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
   19821       return delta;
   19822 
   19823    case 0x83: /* Grp1 Ib,Ev */
   19824       if (haveF2orF3(pfx)) goto decode_failure;
   19825       modrm = getUChar(delta);
   19826       am_sz = lengthAMode(pfx,delta);
   19827       d_sz  = 1;
   19828       d64   = getSDisp8(delta + am_sz);
   19829       delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
   19830       return delta;
   19831 
   19832    case 0x84: /* TEST Eb,Gb */
   19833       if (haveF2orF3(pfx)) goto decode_failure;
   19834       delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, False, 1, delta, "test" );
   19835       return delta;
   19836 
   19837    case 0x85: /* TEST Ev,Gv */
   19838       if (haveF2orF3(pfx)) goto decode_failure;
   19839       delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, False, sz, delta, "test" );
   19840       return delta;
   19841 
   19842    /* XCHG reg,mem automatically asserts LOCK# even without a LOCK
   19843       prefix.  Therefore, generate CAS regardless of the presence or
   19844       otherwise of a LOCK prefix. */
   19845    case 0x86: /* XCHG Gb,Eb */
   19846       sz = 1;
   19847       /* Fall through ... */
   19848    case 0x87: /* XCHG Gv,Ev */
   19849       modrm = getUChar(delta);
   19850       /* Check whether F2 or F3 are allowable.  For the mem case, one
   19851          or the other but not both are.  We don't care about the
   19852          presence of LOCK in this case -- XCHG is unusual in this
   19853          respect. */
   19854       if (haveF2orF3(pfx)) {
   19855          if (epartIsReg(modrm)) {
   19856             goto decode_failure;
   19857          } else {
   19858             if (haveF2andF3(pfx))
   19859                goto decode_failure;
   19860          }
   19861       }
   19862       ty = szToITy(sz);
   19863       t1 = newTemp(ty); t2 = newTemp(ty);
   19864       if (epartIsReg(modrm)) {
   19865          assign(t1, getIRegE(sz, pfx, modrm));
   19866          assign(t2, getIRegG(sz, pfx, modrm));
   19867          putIRegG(sz, pfx, modrm, mkexpr(t1));
   19868          putIRegE(sz, pfx, modrm, mkexpr(t2));
   19869          delta++;
   19870          DIP("xchg%c %s, %s\n",
   19871              nameISize(sz), nameIRegG(sz, pfx, modrm),
   19872                             nameIRegE(sz, pfx, modrm));
   19873       } else {
   19874          *expect_CAS = True;
   19875          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   19876          assign( t1, loadLE(ty, mkexpr(addr)) );
   19877          assign( t2, getIRegG(sz, pfx, modrm) );
   19878          casLE( mkexpr(addr),
   19879                 mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
   19880          putIRegG( sz, pfx, modrm, mkexpr(t1) );
   19881          delta += alen;
   19882          DIP("xchg%c %s, %s\n", nameISize(sz),
   19883                                 nameIRegG(sz, pfx, modrm), dis_buf);
   19884       }
   19885       return delta;
   19886 
   19887    case 0x88: { /* MOV Gb,Eb */
   19888       /* We let dis_mov_G_E decide whether F3(XRELEASE) is allowable. */
   19889       Bool ok = True;
   19890       delta = dis_mov_G_E(vbi, pfx, 1, delta, &ok);
   19891       if (!ok) goto decode_failure;
   19892       return delta;
   19893    }
   19894 
   19895    case 0x89: { /* MOV Gv,Ev */
   19896       /* We let dis_mov_G_E decide whether F3(XRELEASE) is allowable. */
   19897       Bool ok = True;
   19898       delta = dis_mov_G_E(vbi, pfx, sz, delta, &ok);
   19899       if (!ok) goto decode_failure;
   19900       return delta;
   19901    }
   19902 
   19903    case 0x8A: /* MOV Eb,Gb */
   19904       if (haveF2orF3(pfx)) goto decode_failure;
   19905       delta = dis_mov_E_G(vbi, pfx, 1, delta);
   19906       return delta;
   19907 
   19908    case 0x8B: /* MOV Ev,Gv */
   19909       if (haveF2orF3(pfx)) goto decode_failure;
   19910       delta = dis_mov_E_G(vbi, pfx, sz, delta);
   19911       return delta;
   19912 
   19913    case 0x8D: /* LEA M,Gv */
   19914       if (haveF2orF3(pfx)) goto decode_failure;
   19915       if (sz != 4 && sz != 8)
   19916          goto decode_failure;
   19917       modrm = getUChar(delta);
   19918       if (epartIsReg(modrm))
   19919          goto decode_failure;
   19920       /* NOTE!  this is the one place where a segment override prefix
   19921          has no effect on the address calculation.  Therefore we clear
   19922          any segment override bits in pfx. */
   19923       addr = disAMode ( &alen, vbi, clearSegBits(pfx), delta, dis_buf, 0 );
   19924       delta += alen;
   19925       /* This is a hack.  But it isn't clear that really doing the
   19926          calculation at 32 bits is really worth it.  Hence for leal,
   19927          do the full 64-bit calculation and then truncate it. */
   19928       putIRegG( sz, pfx, modrm,
   19929                          sz == 4
   19930                             ? unop(Iop_64to32, mkexpr(addr))
   19931                             : mkexpr(addr)
   19932               );
   19933       DIP("lea%c %s, %s\n", nameISize(sz), dis_buf,
   19934                             nameIRegG(sz,pfx,modrm));
   19935       return delta;
   19936 
   19937    case 0x8F: { /* POPQ m64 / POPW m16 */
   19938       Int   len;
   19939       UChar rm;
   19940       /* There is no encoding for 32-bit pop in 64-bit mode.
   19941          So sz==4 actually means sz==8. */
   19942       if (haveF2orF3(pfx)) goto decode_failure;
   19943       vassert(sz == 2 || sz == 4
   19944               || /* tolerate redundant REX.W, see #210481 */ sz == 8);
   19945       if (sz == 4) sz = 8;
   19946       if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists
   19947 
   19948       rm = getUChar(delta);
   19949 
   19950       /* make sure this instruction is correct POP */
   19951       if (epartIsReg(rm) || gregLO3ofRM(rm) != 0)
   19952          goto decode_failure;
   19953       /* and has correct size */
   19954       vassert(sz == 8);
   19955 
   19956       t1 = newTemp(Ity_I64);
   19957       t3 = newTemp(Ity_I64);
   19958       assign( t1, getIReg64(R_RSP) );
   19959       assign( t3, loadLE(Ity_I64, mkexpr(t1)) );
   19960 
   19961       /* Increase RSP; must be done before the STORE.  Intel manual
   19962          says: If the RSP register is used as a base register for
   19963          addressing a destination operand in memory, the POP
   19964          instruction computes the effective address of the operand
   19965          after it increments the RSP register.  */
   19966       putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t1), mkU64(sz)) );
   19967 
   19968       addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
   19969       storeLE( mkexpr(addr), mkexpr(t3) );
   19970 
   19971       DIP("popl %s\n", dis_buf);
   19972 
   19973       delta += len;
   19974       return delta;
   19975    }
   19976 
   19977    case 0x90: /* XCHG eAX,eAX */
   19978       /* detect and handle F3 90 (rep nop) specially */
   19979       if (!have66(pfx) && !haveF2(pfx) && haveF3(pfx)) {
   19980          DIP("rep nop (P4 pause)\n");
   19981          /* "observe" the hint.  The Vex client needs to be careful not
   19982             to cause very long delays as a result, though. */
   19983          jmp_lit(dres, Ijk_Yield, guest_RIP_bbstart+delta);
   19984          vassert(dres->whatNext == Dis_StopHere);
   19985          return delta;
   19986       }
   19987       /* detect and handle NOPs specially */
   19988       if (/* F2/F3 probably change meaning completely */
   19989           !haveF2orF3(pfx)
   19990           /* If REX.B is 1, we're not exchanging rAX with itself */
   19991           && getRexB(pfx)==0 ) {
   19992          DIP("nop\n");
   19993          return delta;
   19994       }
   19995       /* else fall through to normal case. */
   19996    case 0x91: /* XCHG rAX,rCX */
   19997    case 0x92: /* XCHG rAX,rDX */
   19998    case 0x93: /* XCHG rAX,rBX */
   19999    case 0x94: /* XCHG rAX,rSP */
   20000    case 0x95: /* XCHG rAX,rBP */
   20001    case 0x96: /* XCHG rAX,rSI */
   20002    case 0x97: /* XCHG rAX,rDI */
   20003       /* guard against mutancy */
   20004       if (haveF2orF3(pfx)) goto decode_failure;
   20005       codegen_xchg_rAX_Reg ( pfx, sz, opc - 0x90 );
   20006       return delta;
   20007 
   20008    case 0x98: /* CBW */
   20009       if (haveF2orF3(pfx)) goto decode_failure;
   20010       if (sz == 8) {
   20011          putIRegRAX( 8, unop(Iop_32Sto64, getIRegRAX(4)) );
   20012          DIP(/*"cdqe\n"*/"cltq");
   20013          return delta;
   20014       }
   20015       if (sz == 4) {
   20016          putIRegRAX( 4, unop(Iop_16Sto32, getIRegRAX(2)) );
   20017          DIP("cwtl\n");
   20018          return delta;
   20019       }
   20020       if (sz == 2) {
   20021          putIRegRAX( 2, unop(Iop_8Sto16, getIRegRAX(1)) );
   20022          DIP("cbw\n");
   20023          return delta;
   20024       }
   20025       goto decode_failure;
   20026 
   case 0x99: /* CWD/CDQ/CQO */
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 2 || sz == 4 || sz == 8);
      ty = szToITy(sz);
      /* Sign-extend rAX into rDX by arithmetically shifting rAX right
         by (width-1): rDX becomes all-zeros or all-ones copies of
         rAX's sign bit. */
      putIRegRDX( sz,
                  binop(mkSizedOp(ty,Iop_Sar8),
                        getIRegRAX(sz),
                        mkU8(sz == 2 ? 15 : (sz == 4 ? 31 : 63))) );
      DIP(sz == 2 ? "cwd\n"
                  : (sz == 4 ? /*"cdq\n"*/ "cltd\n"
                             : "cqo\n"));
      return delta;

   case 0x9B: /* FWAIT (X87 insn) */
      /* No FP exception state is modelled here, so the wait is a
         no-op: just print it and move on. */
      /* ignore? */
      DIP("fwait\n");
      return delta;
   20044 
   case 0x9C: /* PUSHF */ {
      /* Note.  There is no encoding for a 32-bit pushf in 64-bit
         mode.  So sz==4 actually means sz==8. */
      /* 24 July 06: has also been seen with a redundant REX prefix,
         so must also allow sz==8. */
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 2 || sz == 4 || sz == 8);
      if (sz == 4) sz = 8;
      if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists

      /* Decrement RSP first; t1 is the address the flags word will be
         stored at. */
      t1 = newTemp(Ity_I64);
      assign( t1, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
      putIReg64(R_RSP, mkexpr(t1) );

      /* t2 = the OSZACP flags, recovered from the lazy CC thunk. */
      t2 = newTemp(Ity_I64);
      assign( t2, mk_amd64g_calculate_rflags_all() );

      /* Patch in the D flag.  This can simply be a copy of bit 10 of
         baseBlock[OFFB_DFLAG]. */
      t3 = newTemp(Ity_I64);
      assign( t3, binop(Iop_Or64,
                        mkexpr(t2),
                        binop(Iop_And64,
                              IRExpr_Get(OFFB_DFLAG,Ity_I64),
                              mkU64(1<<10)))
            );

      /* And patch in the ID flag.  OFFB_IDFLAG holds 0 or 1, so shift
         it up to bit position 21 before masking it in. */
      t4 = newTemp(Ity_I64);
      assign( t4, binop(Iop_Or64,
                        mkexpr(t3),
                        binop(Iop_And64,
                              binop(Iop_Shl64, IRExpr_Get(OFFB_IDFLAG,Ity_I64),
                                               mkU8(21)),
                              mkU64(1<<21)))
            );

      /* And patch in the AC flag too (bit 18, same scheme as ID). */
      t5 = newTemp(Ity_I64);
      assign( t5, binop(Iop_Or64,
                        mkexpr(t4),
                        binop(Iop_And64,
                              binop(Iop_Shl64, IRExpr_Get(OFFB_ACFLAG,Ity_I64),
                                               mkU8(18)),
                              mkU64(1<<18)))
            );

      /* if sz==2, the stored value needs to be narrowed. */
      if (sz == 2)
        storeLE( mkexpr(t1), unop(Iop_32to16,
                             unop(Iop_64to32,mkexpr(t5))) );
      else
        storeLE( mkexpr(t1), mkexpr(t5) );

      DIP("pushf%c\n", nameISize(sz));
      return delta;
   }
   20102 
   case 0x9D: /* POPF */
      /* Note.  There is no encoding for a 32-bit popf in 64-bit mode.
         So sz==4 actually means sz==8. */
      if (haveF2orF3(pfx)) goto decode_failure;
      /* NOTE(review): unlike PUSHF and POP above, this vassert does
         not tolerate sz==8 (redundant REX.W, cf. #210481) — confirm
         whether that asymmetry is intentional. */
      vassert(sz == 2 || sz == 4);
      if (sz == 4) sz = 8;
      if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists
      /* t2 = old RSP; t1 = the popped flags word, zero-widened. */
      t1 = newTemp(Ity_I64); t2 = newTemp(Ity_I64);
      assign(t2, getIReg64(R_RSP));
      assign(t1, widenUto64(loadLE(szToITy(sz),mkexpr(t2))));
      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t2), mkU64(sz)));
      /* t1 is the flag word.  Mask out everything except OSZACP and
         set the flags thunk to AMD64G_CC_OP_COPY. */
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1,
                        binop(Iop_And64,
                              mkexpr(t1),
                              mkU64( AMD64G_CC_MASK_C | AMD64G_CC_MASK_P
                                     | AMD64G_CC_MASK_A | AMD64G_CC_MASK_Z
                                     | AMD64G_CC_MASK_S| AMD64G_CC_MASK_O )
                             )
                       )
          );

      /* Also need to set the D flag, which is held in bit 10 of t1.
         If zero, put 1 in OFFB_DFLAG, else -1 in OFFB_DFLAG. */
      stmt( IRStmt_Put(
               OFFB_DFLAG,
               IRExpr_ITE(
                  unop(Iop_64to1,
                       binop(Iop_And64,
                             binop(Iop_Shr64, mkexpr(t1), mkU8(10)),
                             mkU64(1))),
                  mkU64(0xFFFFFFFFFFFFFFFFULL),
                  mkU64(1)))
          );

      /* And set the ID flag (bit 21 of t1, stored as 0 or 1). */
      stmt( IRStmt_Put(
               OFFB_IDFLAG,
               IRExpr_ITE(
                  unop(Iop_64to1,
                       binop(Iop_And64,
                             binop(Iop_Shr64, mkexpr(t1), mkU8(21)),
                             mkU64(1))),
                  mkU64(1),
                  mkU64(0)))
          );

      /* And set the AC flag too (bit 18 of t1, stored as 0 or 1). */
      stmt( IRStmt_Put(
               OFFB_ACFLAG,
               IRExpr_ITE(
                  unop(Iop_64to1,
                       binop(Iop_And64,
                             binop(Iop_Shr64, mkexpr(t1), mkU8(18)),
                             mkU64(1))),
                  mkU64(1),
                  mkU64(0)))
          );

      DIP("popf%c\n", nameISize(sz));
      return delta;

   case 0x9E: /* SAHF: AH -> low byte of rflags */
      codegen_SAHF();
      DIP("sahf\n");
      return delta;

   case 0x9F: /* LAHF: low byte of rflags -> AH */
      codegen_LAHF();
      DIP("lahf\n");
      return delta;
   20177 
   case 0xA0: /* MOV Ob,AL */
      if (have66orF2orF3(pfx)) goto decode_failure;
      sz = 1;
      /* Fall through ... */
   case 0xA1: /* MOV Ov,eAX */
      if (sz != 8 && sz != 4 && sz != 2 && sz != 1)
         goto decode_failure;
      /* These forms carry a full 8-byte absolute address immediately
         after the opcode. */
      d64 = getDisp64(delta);
      delta += 8;
      ty = szToITy(sz);
      addr = newTemp(Ity_I64);
      /* handleAddrOverrides applies any segment-override/address-size
         adjustment to the literal address. */
      assign( addr, handleAddrOverrides(vbi, pfx, mkU64(d64)) );
      putIRegRAX(sz, loadLE( ty, mkexpr(addr) ));
      DIP("mov%c %s0x%llx, %s\n", nameISize(sz),
                                  segRegTxt(pfx), d64,
                                  nameIRegRAX(sz));
      return delta;

   case 0xA2: /* MOV AL,Ob */
      if (have66orF2orF3(pfx)) goto decode_failure;
      sz = 1;
      /* Fall through ... */
   case 0xA3: /* MOV eAX,Ov */
      if (sz != 8 && sz != 4 && sz != 2 && sz != 1)
         goto decode_failure;
      /* Store direction: rAX -> [moffs].  Same 8-byte absolute
         address encoding as 0xA0/0xA1. */
      d64 = getDisp64(delta);
      delta += 8;
      ty = szToITy(sz);
      addr = newTemp(Ity_I64);
      assign( addr, handleAddrOverrides(vbi, pfx, mkU64(d64)) );
      storeLE( mkexpr(addr), getIRegRAX(sz) );
      DIP("mov%c %s, %s0x%llx\n", nameISize(sz), nameIRegRAX(sz),
                                  segRegTxt(pfx), d64);
      return delta;
   20212 
   case 0xA4:
   case 0xA5:
      /* F3 A4: rep movsb */
      if (haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xA4)
            sz = 1;
         /* dis_REP_op generates the whole count/condition loop and
            ends the block itself. */
         dis_REP_op ( dres, AMD64CondAlways, dis_MOVS, sz,
                      guest_RIP_curr_instr,
                      guest_RIP_bbstart+delta, "rep movs", pfx );
        dres->whatNext = Dis_StopHere;
        return delta;
      }
      /* A4: movsb */
      if (!haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xA4)
            sz = 1;
         dis_string_op( dis_MOVS, sz, "movs", pfx );
         return delta;
      }
      /* F2 (repne movs) has no defined meaning; reject it. */
      goto decode_failure;

   case 0xA6:
   case 0xA7:
      /* F3 A6/A7: repe cmps/rep cmps{w,l,q} */
      if (haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xA6)
            sz = 1;
         dis_REP_op ( dres, AMD64CondZ, dis_CMPS, sz,
                      guest_RIP_curr_instr,
                      guest_RIP_bbstart+delta, "repe cmps", pfx );
         dres->whatNext = Dis_StopHere;
         return delta;
      }
      /* Plain and F2 (repne) cmps forms are not handled here. */
      goto decode_failure;

   case 0xAA:
   case 0xAB:
      /* F3 AA/AB: rep stosb/rep stos{w,l,q} */
      if (haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xAA)
            sz = 1;
         dis_REP_op ( dres, AMD64CondAlways, dis_STOS, sz,
                      guest_RIP_curr_instr,
                      guest_RIP_bbstart+delta, "rep stos", pfx );
         vassert(dres->whatNext == Dis_StopHere);
         return delta;
      }
      /* AA/AB: stosb/stos{w,l,q} */
      if (!haveF3(pfx) && !haveF2(pfx)) {
         if (opc == 0xAA)
            sz = 1;
         dis_string_op( dis_STOS, sz, "stos", pfx );
         return delta;
      }
      goto decode_failure;

   case 0xA8: /* TEST Ib, AL */
      if (haveF2orF3(pfx)) goto decode_failure;
      /* TEST = AND with the result discarded (False -> don't keep). */
      delta = dis_op_imm_A( 1, False, Iop_And8, False, delta, "test" );
      return delta;
   case 0xA9: /* TEST Iv, eAX */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( sz, False, Iop_And8, False, delta, "test" );
      return delta;
   20277 
   case 0xAC: /* LODS, no REP prefix */
   case 0xAD:
      /* NOTE(review): unlike the other string ops here, this path
         does not reject F2/F3 prefixes — confirm whether rep lods
         should be handled or faulted. */
      dis_string_op( dis_LODS, ( opc == 0xAC ? 1 : sz ), "lods", pfx );
      return delta;

   case 0xAE:
   case 0xAF:
      /* F2 AE/AF: repne scasb/repne scas{w,l,q} */
      if (haveF2(pfx) && !haveF3(pfx)) {
         if (opc == 0xAE)
            sz = 1;
         dis_REP_op ( dres, AMD64CondNZ, dis_SCAS, sz,
                      guest_RIP_curr_instr,
                      guest_RIP_bbstart+delta, "repne scas", pfx );
         vassert(dres->whatNext == Dis_StopHere);
         return delta;
      }
      /* F3 AE/AF: repe scasb/repe scas{w,l,q} */
      if (!haveF2(pfx) && haveF3(pfx)) {
         if (opc == 0xAE)
            sz = 1;
         dis_REP_op ( dres, AMD64CondZ, dis_SCAS, sz,
                      guest_RIP_curr_instr,
                      guest_RIP_bbstart+delta, "repe scas", pfx );
         vassert(dres->whatNext == Dis_StopHere);
         return delta;
      }
      /* AE/AF: scasb/scas{w,l,q} */
      if (!haveF2(pfx) && !haveF3(pfx)) {
         if (opc == 0xAE)
            sz = 1;
         dis_string_op( dis_SCAS, sz, "scas", pfx );
         return delta;
      }
      /* Both F2 and F3 present: contradictory, reject. */
      goto decode_failure;
   20313 
   /* XXXX be careful here with moves to AH/BH/CH/DH */
   case 0xB0: /* MOV imm,AL */
   case 0xB1: /* MOV imm,CL */
   case 0xB2: /* MOV imm,DL */
   case 0xB3: /* MOV imm,BL */
   case 0xB4: /* MOV imm,AH */
   case 0xB5: /* MOV imm,CH */
   case 0xB6: /* MOV imm,DH */
   case 0xB7: /* MOV imm,BH */
      if (haveF2orF3(pfx)) goto decode_failure;
      /* One immediate byte; opc-0xB0 selects the destination byte
         register, with REX.B (via pfx) in the RexB helpers. */
      d64 = getUChar(delta);
      delta += 1;
      putIRegRexB(1, pfx, opc-0xB0, mkU8(d64));
      DIP("movb $%lld,%s\n", d64, nameIRegRexB(1,pfx,opc-0xB0));
      return delta;

   case 0xB8: /* MOV imm,eAX */
   case 0xB9: /* MOV imm,eCX */
   case 0xBA: /* MOV imm,eDX */
   case 0xBB: /* MOV imm,eBX */
   case 0xBC: /* MOV imm,eSP */
   case 0xBD: /* MOV imm,eBP */
   case 0xBE: /* MOV imm,eSI */
   case 0xBF: /* MOV imm,eDI */
      /* This is the one-and-only place where 64-bit literals are
         allowed in the instruction stream. */
      if (haveF2orF3(pfx)) goto decode_failure;
      if (sz == 8) {
         /* movabsq: full 8-byte immediate. */
         d64 = getDisp64(delta);
         delta += 8;
         putIRegRexB(8, pfx, opc-0xB8, mkU64(d64));
         DIP("movabsq $%lld,%s\n", (Long)d64,
                                   nameIRegRexB(8,pfx,opc-0xB8));
      } else {
         /* 1/2/4-byte immediate, sign-read then masked back down to
            the operand width. */
         d64 = getSDisp(imin(4,sz),delta);
         delta += imin(4,sz);
         putIRegRexB(sz, pfx, opc-0xB8,
                         mkU(szToITy(sz), d64 & mkSizeMask(sz)));
         DIP("mov%c $%lld,%s\n", nameISize(sz),
                                 (Long)d64,
                                 nameIRegRexB(sz,pfx,opc-0xB8));
      }
      return delta;
   20357 
   case 0xC0: { /* Grp2 Ib,Eb: shift/rotate Eb by imm8 */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      /* The imm8 sits after the (possibly multi-byte) amode. */
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 1;
      d64   = getUChar(delta + am_sz);
      sz    = 1;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         mkU8(d64 & 0xFF), NULL, &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }

   case 0xC1: { /* Grp2 Ib,Ev: shift/rotate Ev by imm8 */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 1;
      d64   = getUChar(delta + am_sz);
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         mkU8(d64 & 0xFF), NULL, &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }

   case 0xC2: /* RET imm16: pop return address, then add imm16 to RSP */
      if (have66orF3(pfx)) goto decode_failure;
      if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
      d64 = getUDisp16(delta);
      delta += 2;
      dis_ret(dres, vbi, d64);
      DIP("ret $%lld\n", d64);
      return delta;

   case 0xC3: /* RET */
      if (have66(pfx)) goto decode_failure;
      /* F3 is acceptable on AMD. */
      if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
      dis_ret(dres, vbi, 0);
      DIP(haveF3(pfx) ? "rep ; ret\n" : "ret\n");
      return delta;
   20401 
   20402    case 0xC6: /* C6 /0 = MOV Ib,Eb */
   20403       sz = 1;
   20404       goto maybe_do_Mov_I_E;
   20405    case 0xC7: /* C7 /0 = MOV Iv,Ev */
   20406       goto maybe_do_Mov_I_E;
   20407    maybe_do_Mov_I_E:
   20408       modrm = getUChar(delta);
   20409       if (gregLO3ofRM(modrm) == 0) {
   20410          if (epartIsReg(modrm)) {
   20411             /* Neither F2 nor F3 are allowable. */
   20412             if (haveF2orF3(pfx)) goto decode_failure;
   20413             delta++; /* mod/rm byte */
   20414             d64 = getSDisp(imin(4,sz),delta);
   20415             delta += imin(4,sz);
   20416             putIRegE(sz, pfx, modrm,
   20417                          mkU(szToITy(sz), d64 & mkSizeMask(sz)));
   20418             DIP("mov%c $%lld, %s\n", nameISize(sz),
   20419                                      (Long)d64,
   20420                                      nameIRegE(sz,pfx,modrm));
   20421          } else {
   20422             if (haveF2(pfx)) goto decode_failure;
   20423             /* F3(XRELEASE) is allowable here */
   20424             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
   20425                               /*xtra*/imin(4,sz) );
   20426             delta += alen;
   20427             d64 = getSDisp(imin(4,sz),delta);
   20428             delta += imin(4,sz);
   20429             storeLE(mkexpr(addr),
   20430                     mkU(szToITy(sz), d64 & mkSizeMask(sz)));
   20431             DIP("mov%c $%lld, %s\n", nameISize(sz), (Long)d64, dis_buf);
   20432          }
   20433          return delta;
   20434       }
   20435       /* BEGIN HACKY SUPPORT FOR xbegin */
   20436       if (opc == 0xC7 && modrm == 0xF8 && !have66orF2orF3(pfx) && sz == 4
   20437           && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
   20438          delta++; /* mod/rm byte */
   20439          d64 = getSDisp(4,delta);
   20440          delta += 4;
   20441          guest_RIP_next_mustcheck = True;
   20442          guest_RIP_next_assumed   = guest_RIP_bbstart + delta;
   20443          Addr64 failAddr = guest_RIP_bbstart + delta + d64;
   20444          /* EAX contains the failure status code.  Bit 3 is "Set if an
   20445             internal buffer overflowed", which seems like the
   20446             least-bogus choice we can make here. */
   20447          putIRegRAX(4, mkU32(1<<3));
   20448          /* And jump to the fail address. */
   20449          jmp_lit(dres, Ijk_Boring, failAddr);
   20450          vassert(dres->whatNext == Dis_StopHere);
   20451          DIP("xbeginq 0x%llx\n", failAddr);
   20452          return delta;
   20453       }
   20454       /* END HACKY SUPPORT FOR xbegin */
   20455       /* BEGIN HACKY SUPPORT FOR xabort */
   20456       if (opc == 0xC6 && modrm == 0xF8 && !have66orF2orF3(pfx) && sz == 1
   20457           && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
   20458          delta++; /* mod/rm byte */
   20459          abyte = getUChar(delta); delta++;
   20460          /* There is never a real transaction in progress, so do nothing. */
   20461          DIP("xabort $%d", (Int)abyte);
   20462          return delta;
   20463       }
   20464       /* END HACKY SUPPORT FOR xabort */
   20465       goto decode_failure;
   20466 
   case 0xC8: /* ENTER */
      /* Same comments re operand size as for LEAVE below apply.
         Also, only handles the case "enter $imm16, $0"; other cases
         for the second operand (nesting depth) are not handled. */
      if (sz != 4)
         goto decode_failure;
      d64 = getUDisp16(delta);
      delta += 2;
      vassert(d64 >= 0 && d64 <= 0xFFFF);
      /* Second operand byte (nesting level) must be zero. */
      if (getUChar(delta) != 0)
         goto decode_failure;
      delta++;
      /* Intel docs seem to suggest:
           push rbp
           temp = rsp
           rbp = temp
           rsp = rsp - imm16
      */
      t1 = newTemp(Ity_I64);
      assign(t1, getIReg64(R_RBP));
      t2 = newTemp(Ity_I64);
      assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
      putIReg64(R_RSP, mkexpr(t2));
      storeLE(mkexpr(t2), mkexpr(t1));      /* push old rbp */
      putIReg64(R_RBP, mkexpr(t2));         /* rbp = new rsp */
      if (d64 > 0) {
         putIReg64(R_RSP, binop(Iop_Sub64, mkexpr(t2), mkU64(d64)));
      }
      DIP("enter $%u, $0\n", (UInt)d64);
      return delta;

   case 0xC9: /* LEAVE */
      /* In 64-bit mode this defaults to a 64-bit operand size.  There
         is no way to encode a 32-bit variant.  Hence sz==4 but we do
         it as if sz=8. */
      if (sz != 4)
         goto decode_failure;
      t1 = newTemp(Ity_I64);
      t2 = newTemp(Ity_I64);
      assign(t1, getIReg64(R_RBP));
      /* First PUT RSP looks redundant, but need it because RSP must
         always be up-to-date for Memcheck to work... */
      putIReg64(R_RSP, mkexpr(t1));
      assign(t2, loadLE(Ity_I64,mkexpr(t1)));
      putIReg64(R_RBP, mkexpr(t2));
      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t1), mkU64(8)) );
      DIP("leave\n");
      return delta;

   case 0xCC: /* INT 3 */
      /* Deliver SIGTRAP at the address of the *next* instruction. */
      jmp_lit(dres, Ijk_SigTRAP, guest_RIP_bbstart + delta);
      vassert(dres->whatNext == Dis_StopHere);
      DIP("int $0x3\n");
      return delta;
   20521 
   case 0xD0: { /* Grp2 1,Eb: shift/rotate Eb by 1 */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 0;      /* no immediate byte: the count is the constant 1 */
      d64   = 1;
      sz    = 1;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         mkU8(d64), NULL, &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }

   case 0xD1: { /* Grp2 1,Ev: shift/rotate Ev by 1 */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 0;
      d64   = 1;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         mkU8(d64), NULL, &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }

   case 0xD2: { /* Grp2 CL,Eb: shift/rotate Eb by %cl */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 0;
      sz    = 1;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         getIRegCL(), "%cl", &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }

   case 0xD3: { /* Grp2 CL,Ev: shift/rotate Ev by %cl */
      Bool decode_OK = True;
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      am_sz = lengthAMode(pfx,delta);
      d_sz  = 0;
      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
                         getIRegCL(), "%cl", &decode_OK );
      if (!decode_OK) goto decode_failure;
      return delta;
   }
   20573 
   case 0xD8: /* X87 instructions */
   case 0xD9:
   case 0xDA:
   case 0xDB:
   case 0xDC:
   case 0xDD:
   case 0xDE:
   case 0xDF: {
      Bool redundantREXWok = False;

      if (haveF2orF3(pfx))
         goto decode_failure;

      /* kludge to tolerate redundant rex.w prefixes (should do this
         properly one day) */
      /* mono 1.1.18.1 produces 48 D9 FA, which is rex.w fsqrt */
      if ( (opc == 0xD9 && getUChar(delta+0) == 0xFA)/*fsqrt*/ )
         redundantREXWok = True;

      /* Decide whether the operand size is acceptable: plain (4) is
         always fine; REX.W (8) only for the fsqrt special-case above;
         0x66 (2) only for a few 0xDD forms that the HotSpot JVM
         emits. */
      Bool size_OK = False;
      if ( sz == 4 )
         size_OK = True;
      else if ( sz == 8 )
         size_OK = redundantREXWok;
      else if ( sz == 2 ) {
         int mod_rm = getUChar(delta+0);
         int reg = gregLO3ofRM(mod_rm);
         /* The HotSpot JVM uses these */
         if ( (opc == 0xDD) && (reg == 0 /* FLDL   */ ||
                                reg == 4 /* FNSAVE */ ||
                                reg == 6 /* FRSTOR */ ) )
            size_OK = True;
      }
      /* AMD manual says 0x66 size override is ignored, except where
         it is meaningful */
      if (!size_OK)
         goto decode_failure;

      /* Hand the whole instruction off to the FPU decoder. */
      Bool decode_OK = False;
      delta = dis_FPU ( &decode_OK, vbi, pfx, delta );
      if (!decode_OK)
         goto decode_failure;

      return delta;
   }
   20619 
   case 0xE0: /* LOOPNE disp8: decrement count, jump if count != 0 && ZF==0 */
   case 0xE1: /* LOOPE  disp8: decrement count, jump if count != 0 && ZF==1 */
   case 0xE2: /* LOOP   disp8: decrement count, jump if count != 0 */
    { /* The docs say this uses rCX as a count depending on the
         address size override, not the operand one. */
      IRExpr* zbit  = NULL;
      IRExpr* count = NULL;
      IRExpr* cond  = NULL;
      const HChar* xtra = NULL;

      if (have66orF2orF3(pfx) || 1==getRexW(pfx)) goto decode_failure;
      /* So at this point we've rejected any variants which appear to
         be governed by the usual operand-size modifiers.  Hence only
         the address size prefix can have an effect.  It changes the
         size from 64 (default) to 32. */
      /* d64 = branch target = next-insn address + signed disp8. */
      d64 = guest_RIP_bbstart+delta+1 + getSDisp8(delta);
      delta++;
      if (haveASO(pfx)) {
         /* 64to32 of 64-bit get is merely a get-put improvement
            trick. */
         putIReg32(R_RCX, binop(Iop_Sub32,
                                unop(Iop_64to32, getIReg64(R_RCX)),
                                mkU32(1)));
      } else {
         putIReg64(R_RCX, binop(Iop_Sub64, getIReg64(R_RCX), mkU64(1)));
      }

      /* This is correct, both for 32- and 64-bit versions.  If we're
         doing a 32-bit dec and the result is zero then the default
         zero extension rule will cause the upper 32 bits to be zero
         too.  Hence a 64-bit check against zero is OK. */
      count = getIReg64(R_RCX);
      cond = binop(Iop_CmpNE64, count, mkU64(0));
      switch (opc) {
         case 0xE2:
            xtra = "";
            break;
         case 0xE1:
            xtra = "e";
            zbit = mk_amd64g_calculate_condition( AMD64CondZ );
            cond = mkAnd1(cond, zbit);
            break;
         case 0xE0:
            xtra = "ne";
            zbit = mk_amd64g_calculate_condition( AMD64CondNZ );
            cond = mkAnd1(cond, zbit);
            break;
         default:
            vassert(0);
      }
      /* Conditional side-exit to the loop target; fall through
         otherwise. */
      stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U64(d64), OFFB_RIP) );

      DIP("loop%s%s 0x%llx\n", xtra, haveASO(pfx) ? "l" : "", d64);
      return delta;
    }

   case 0xE3:
      /* JRCXZ or JECXZ, depending address size override. */
      if (have66orF2orF3(pfx)) goto decode_failure;
      d64 = (guest_RIP_bbstart+delta+1) + getSDisp8(delta);
      delta++;
      if (haveASO(pfx)) {
         /* 32-bit: test only ECX against zero. */
         stmt( IRStmt_Exit( binop(Iop_CmpEQ64,
                                  unop(Iop_32Uto64, getIReg32(R_RCX)),
                                  mkU64(0)),
                            Ijk_Boring,
                            IRConst_U64(d64),
                            OFFB_RIP
             ));
         DIP("jecxz 0x%llx\n", d64);
      } else {
         /* 64-bit: test the whole of RCX. */
         stmt( IRStmt_Exit( binop(Iop_CmpEQ64,
                                  getIReg64(R_RCX),
                                  mkU64(0)),
                            Ijk_Boring,
                            IRConst_U64(d64),
                            OFFB_RIP
               ));
         DIP("jrcxz 0x%llx\n", d64);
      }
      return delta;
   20703 
   20704    case 0xE4: /* IN imm8, AL */
   20705       sz = 1;
   20706       t1 = newTemp(Ity_I64);
   20707       abyte = getUChar(delta); delta++;
   20708       assign(t1, mkU64( abyte & 0xFF ));
   20709       DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIRegRAX(sz));
   20710       goto do_IN;
   20711    case 0xE5: /* IN imm8, eAX */
   20712       if (!(sz == 2 || sz == 4)) goto decode_failure;
   20713       t1 = newTemp(Ity_I64);
   20714       abyte = getUChar(delta); delta++;
   20715       assign(t1, mkU64( abyte & 0xFF ));
   20716       DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIRegRAX(sz));
   20717       goto do_IN;
   20718    case 0xEC: /* IN %DX, AL */
   20719       sz = 1;
   20720       t1 = newTemp(Ity_I64);
   20721       assign(t1, unop(Iop_16Uto64, getIRegRDX(2)));
   20722       DIP("in%c %s,%s\n", nameISize(sz), nameIRegRDX(2),
   20723                                          nameIRegRAX(sz));
   20724       goto do_IN;
   20725    case 0xED: /* IN %DX, eAX */
   20726       if (!(sz == 2 || sz == 4)) goto decode_failure;
   20727       t1 = newTemp(Ity_I64);
   20728       assign(t1, unop(Iop_16Uto64, getIRegRDX(2)));
   20729       DIP("in%c %s,%s\n", nameISize(sz), nameIRegRDX(2),
   20730                                          nameIRegRAX(sz));
   20731       goto do_IN;
   20732    do_IN: {
   20733       /* At this point, sz indicates the width, and t1 is a 64-bit
   20734          value giving port number. */
   20735       IRDirty* d;
   20736       if (haveF2orF3(pfx)) goto decode_failure;
   20737       vassert(sz == 1 || sz == 2 || sz == 4);
   20738       ty = szToITy(sz);
   20739       t2 = newTemp(Ity_I64);
   20740       d = unsafeIRDirty_1_N(
   20741              t2,
   20742              0/*regparms*/,
   20743              "amd64g_dirtyhelper_IN",
   20744              &amd64g_dirtyhelper_IN,
   20745              mkIRExprVec_2( mkexpr(t1), mkU64(sz) )
   20746           );
   20747       /* do the call, dumping the result in t2. */
   20748       stmt( IRStmt_Dirty(d) );
   20749       putIRegRAX(sz, narrowTo( ty, mkexpr(t2) ) );
   20750       return delta;
   20751    }
   20752 
   20753    case 0xE6: /* OUT AL, imm8 */
   20754       sz = 1;
   20755       t1 = newTemp(Ity_I64);
   20756       abyte = getUChar(delta); delta++;
   20757       assign( t1, mkU64( abyte & 0xFF ) );
   20758       DIP("out%c %s,$%d\n", nameISize(sz), nameIRegRAX(sz), (Int)abyte);
   20759       goto do_OUT;
   20760    case 0xE7: /* OUT eAX, imm8 */
   20761       if (!(sz == 2 || sz == 4)) goto decode_failure;
   20762       t1 = newTemp(Ity_I64);
   20763       abyte = getUChar(delta); delta++;
   20764       assign( t1, mkU64( abyte & 0xFF ) );
   20765       DIP("out%c %s,$%d\n", nameISize(sz), nameIRegRAX(sz), (Int)abyte);
   20766       goto do_OUT;
   20767    case 0xEE: /* OUT AL, %DX */
   20768       sz = 1;
   20769       t1 = newTemp(Ity_I64);
   20770       assign( t1, unop(Iop_16Uto64, getIRegRDX(2)) );
   20771       DIP("out%c %s,%s\n", nameISize(sz), nameIRegRAX(sz),
   20772                                           nameIRegRDX(2));
   20773       goto do_OUT;
   20774    case 0xEF: /* OUT eAX, %DX */
   20775       if (!(sz == 2 || sz == 4)) goto decode_failure;
   20776       t1 = newTemp(Ity_I64);
   20777       assign( t1, unop(Iop_16Uto64, getIRegRDX(2)) );
   20778       DIP("out%c %s,%s\n", nameISize(sz), nameIRegRAX(sz),
   20779                                           nameIRegRDX(2));
   20780       goto do_OUT;
   20781    do_OUT: {
   20782       /* At this point, sz indicates the width, and t1 is a 64-bit
   20783          value giving port number. */
   20784       IRDirty* d;
   20785       if (haveF2orF3(pfx)) goto decode_failure;
   20786       vassert(sz == 1 || sz == 2 || sz == 4);
   20787       ty = szToITy(sz);
   20788       d = unsafeIRDirty_0_N(
   20789              0/*regparms*/,
   20790              "amd64g_dirtyhelper_OUT",
   20791              &amd64g_dirtyhelper_OUT,
   20792              mkIRExprVec_3( mkexpr(t1),
   20793                             widenUto64( getIRegRAX(sz) ),
   20794                             mkU64(sz) )
   20795           );
   20796       stmt( IRStmt_Dirty(d) );
   20797       return delta;
   20798    }
   20799 
   20800    case 0xE8: /* CALL J4 */
   20801       if (haveF3(pfx)) goto decode_failure;
   20802       if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
   20803       d64 = getSDisp32(delta); delta += 4;
   20804       d64 += (guest_RIP_bbstart+delta);
   20805       /* (guest_RIP_bbstart+delta) == return-to addr, d64 == call-to addr */
   20806       t1 = newTemp(Ity_I64);
   20807       assign(t1, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
   20808       putIReg64(R_RSP, mkexpr(t1));
   20809       storeLE( mkexpr(t1), mkU64(guest_RIP_bbstart+delta));
   20810       t2 = newTemp(Ity_I64);
   20811       assign(t2, mkU64((Addr64)d64));
   20812       make_redzone_AbiHint(vbi, t1, t2/*nia*/, "call-d32");
   20813       if (resteerOkFn( callback_opaque, (Addr64)d64) ) {
   20814          /* follow into the call target. */
   20815          dres->whatNext   = Dis_ResteerU;
   20816          dres->continueAt = d64;
   20817       } else {
   20818          jmp_lit(dres, Ijk_Call, d64);
   20819          vassert(dres->whatNext == Dis_StopHere);
   20820       }
   20821       DIP("call 0x%llx\n",d64);
   20822       return delta;
   20823 
   20824    case 0xE9: /* Jv (jump, 16/32 offset) */
   20825       if (haveF3(pfx)) goto decode_failure;
   20826       if (sz != 4)
   20827          goto decode_failure; /* JRS added 2004 July 11 */
   20828       if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
   20829       d64 = (guest_RIP_bbstart+delta+sz) + getSDisp(sz,delta);
   20830       delta += sz;
   20831       if (resteerOkFn(callback_opaque,d64)) {
   20832          dres->whatNext   = Dis_ResteerU;
   20833          dres->continueAt = d64;
   20834       } else {
   20835          jmp_lit(dres, Ijk_Boring, d64);
   20836          vassert(dres->whatNext == Dis_StopHere);
   20837       }
   20838       DIP("jmp 0x%llx\n", d64);
   20839       return delta;
   20840 
   20841    case 0xEB: /* Jb (jump, byte offset) */
   20842       if (haveF3(pfx)) goto decode_failure;
   20843       if (sz != 4)
   20844          goto decode_failure; /* JRS added 2004 July 11 */
   20845       if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
   20846       d64 = (guest_RIP_bbstart+delta+1) + getSDisp8(delta);
   20847       delta++;
   20848       if (resteerOkFn(callback_opaque,d64)) {
   20849          dres->whatNext   = Dis_ResteerU;
   20850          dres->continueAt = d64;
   20851       } else {
   20852          jmp_lit(dres, Ijk_Boring, d64);
   20853          vassert(dres->whatNext == Dis_StopHere);
   20854       }
   20855       DIP("jmp-8 0x%llx\n", d64);
   20856       return delta;
   20857 
   20858    case 0xF5: /* CMC */
   20859    case 0xF8: /* CLC */
   20860    case 0xF9: /* STC */
   20861       t1 = newTemp(Ity_I64);
   20862       t2 = newTemp(Ity_I64);
   20863       assign( t1, mk_amd64g_calculate_rflags_all() );
   20864       switch (opc) {
   20865          case 0xF5:
   20866             assign( t2, binop(Iop_Xor64, mkexpr(t1),
   20867                                          mkU64(AMD64G_CC_MASK_C)));
   20868             DIP("cmc\n");
   20869             break;
   20870          case 0xF8:
   20871             assign( t2, binop(Iop_And64, mkexpr(t1),
   20872                                          mkU64(~AMD64G_CC_MASK_C)));
   20873             DIP("clc\n");
   20874             break;
   20875          case 0xF9:
   20876             assign( t2, binop(Iop_Or64, mkexpr(t1),
   20877                                         mkU64(AMD64G_CC_MASK_C)));
   20878             DIP("stc\n");
   20879             break;
   20880          default:
   20881             vpanic("disInstr(x64)(cmc/clc/stc)");
   20882       }
   20883       stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   20884       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   20885       stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t2) ));
   20886       /* Set NDEP even though it isn't used.  This makes redundant-PUT
   20887          elimination of previous stores to this field work better. */
   20888       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   20889       return delta;
   20890 
   20891    case 0xF6: { /* Grp3 Eb */
   20892       Bool decode_OK = True;
   20893       /* RM'd: if (haveF2orF3(pfx)) goto decode_failure; */
   20894       /* We now let dis_Grp3 itself decide if F2 and/or F3 are valid */
   20895       delta = dis_Grp3 ( vbi, pfx, 1, delta, &decode_OK );
   20896       if (!decode_OK) goto decode_failure;
   20897       return delta;
   20898    }
   20899 
   20900    case 0xF7: { /* Grp3 Ev */
   20901       Bool decode_OK = True;
   20902       /* RM'd: if (haveF2orF3(pfx)) goto decode_failure; */
   20903       /* We now let dis_Grp3 itself decide if F2 and/or F3 are valid */
   20904       delta = dis_Grp3 ( vbi, pfx, sz, delta, &decode_OK );
   20905       if (!decode_OK) goto decode_failure;
   20906       return delta;
   20907    }
   20908 
   20909    case 0xFC: /* CLD */
   20910       if (haveF2orF3(pfx)) goto decode_failure;
   20911       stmt( IRStmt_Put( OFFB_DFLAG, mkU64(1)) );
   20912       DIP("cld\n");
   20913       return delta;
   20914 
   20915    case 0xFD: /* STD */
   20916       if (haveF2orF3(pfx)) goto decode_failure;
   20917       stmt( IRStmt_Put( OFFB_DFLAG, mkU64(-1ULL)) );
   20918       DIP("std\n");
   20919       return delta;
   20920 
   20921    case 0xFE: { /* Grp4 Eb */
   20922       Bool decode_OK = True;
   20923       /* RM'd: if (haveF2orF3(pfx)) goto decode_failure; */
   20924       /* We now let dis_Grp4 itself decide if F2 and/or F3 are valid */
   20925       delta = dis_Grp4 ( vbi, pfx, delta, &decode_OK );
   20926       if (!decode_OK) goto decode_failure;
   20927       return delta;
   20928    }
   20929 
   20930    case 0xFF: { /* Grp5 Ev */
   20931       Bool decode_OK = True;
   20932       /* RM'd: if (haveF2orF3(pfx)) goto decode_failure; */
   20933       /* We now let dis_Grp5 itself decide if F2 and/or F3 are valid */
   20934       delta = dis_Grp5 ( vbi, pfx, sz, delta, dres, &decode_OK );
   20935       if (!decode_OK) goto decode_failure;
   20936       return delta;
   20937    }
   20938 
   20939    default:
   20940       break;
   20941 
   20942    }
   20943 
   20944   decode_failure:
   20945    return deltaIN; /* fail */
   20946 }
   20947 
   20948 
   20949 /*------------------------------------------------------------*/
   20950 /*---                                                      ---*/
   20951 /*--- Top-level post-escape decoders: dis_ESC_0F           ---*/
   20952 /*---                                                      ---*/
   20953 /*------------------------------------------------------------*/
   20954 
   20955 static IRTemp math_BSWAP ( IRTemp t1, IRType ty )
   20956 {
   20957    IRTemp t2 = newTemp(ty);
   20958    if (ty == Ity_I64) {
   20959       IRTemp m8  = newTemp(Ity_I64);
   20960       IRTemp s8  = newTemp(Ity_I64);
   20961       IRTemp m16 = newTemp(Ity_I64);
   20962       IRTemp s16 = newTemp(Ity_I64);
   20963       IRTemp m32 = newTemp(Ity_I64);
   20964       assign( m8, mkU64(0xFF00FF00FF00FF00ULL) );
   20965       assign( s8,
   20966               binop(Iop_Or64,
   20967                     binop(Iop_Shr64,
   20968                           binop(Iop_And64,mkexpr(t1),mkexpr(m8)),
   20969                           mkU8(8)),
   20970                     binop(Iop_And64,
   20971                           binop(Iop_Shl64,mkexpr(t1),mkU8(8)),
   20972                           mkexpr(m8))
   20973                    )
   20974             );
   20975 
   20976       assign( m16, mkU64(0xFFFF0000FFFF0000ULL) );
   20977       assign( s16,
   20978               binop(Iop_Or64,
   20979                     binop(Iop_Shr64,
   20980                           binop(Iop_And64,mkexpr(s8),mkexpr(m16)),
   20981                           mkU8(16)),
   20982                     binop(Iop_And64,
   20983                           binop(Iop_Shl64,mkexpr(s8),mkU8(16)),
   20984                           mkexpr(m16))
   20985                    )
   20986             );
   20987 
   20988       assign( m32, mkU64(0xFFFFFFFF00000000ULL) );
   20989       assign( t2,
   20990               binop(Iop_Or64,
   20991                     binop(Iop_Shr64,
   20992                           binop(Iop_And64,mkexpr(s16),mkexpr(m32)),
   20993                           mkU8(32)),
   20994                     binop(Iop_And64,
   20995                           binop(Iop_Shl64,mkexpr(s16),mkU8(32)),
   20996                           mkexpr(m32))
   20997                    )
   20998             );
   20999       return t2;
   21000    }
   21001    if (ty == Ity_I32) {
   21002       assign( t2,
   21003          binop(
   21004             Iop_Or32,
   21005             binop(Iop_Shl32, mkexpr(t1), mkU8(24)),
   21006             binop(
   21007                Iop_Or32,
   21008                binop(Iop_And32, binop(Iop_Shl32, mkexpr(t1), mkU8(8)),
   21009                                 mkU32(0x00FF0000)),
   21010                binop(Iop_Or32,
   21011                      binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(8)),
   21012                                       mkU32(0x0000FF00)),
   21013                      binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(24)),
   21014                                       mkU32(0x000000FF) )
   21015             )))
   21016       );
   21017       return t2;
   21018    }
   21019    if (ty == Ity_I16) {
   21020       assign(t2,
   21021              binop(Iop_Or16,
   21022                    binop(Iop_Shl16, mkexpr(t1), mkU8(8)),
   21023                    binop(Iop_Shr16, mkexpr(t1), mkU8(8)) ));
   21024       return t2;
   21025    }
   21026    vassert(0);
   21027    /*NOTREACHED*/
   21028    return IRTemp_INVALID;
   21029 }
   21030 
   21031 
   21032 __attribute__((noinline))
   21033 static
   21034 Long dis_ESC_0F (
   21035         /*MB_OUT*/DisResult* dres,
   21036         /*MB_OUT*/Bool*      expect_CAS,
   21037         Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
   21038         Bool         resteerCisOk,
   21039         void*        callback_opaque,
   21040         VexArchInfo* archinfo,
   21041         VexAbiInfo*  vbi,
   21042         Prefix pfx, Int sz, Long deltaIN
   21043      )
   21044 {
   21045    Long   d64   = 0;
   21046    IRTemp addr  = IRTemp_INVALID;
   21047    IRTemp t1    = IRTemp_INVALID;
   21048    IRTemp t2    = IRTemp_INVALID;
   21049    UChar  modrm = 0;
   21050    Int    am_sz = 0;
   21051    Int    alen  = 0;
   21052    HChar  dis_buf[50];
   21053 
   21054    /* In the first switch, look for ordinary integer insns. */
   21055    Long   delta = deltaIN;
   21056    UChar  opc   = getUChar(delta);
   21057    delta++;
   21058    switch (opc) { /* first switch */
   21059 
   21060    case 0x01:
   21061    {
   21062       modrm = getUChar(delta);
   21063       /* 0F 01 /0 -- SGDT */
   21064       /* 0F 01 /1 -- SIDT */
   21065       if (!epartIsReg(modrm)
   21066           && (gregLO3ofRM(modrm) == 0 || gregLO3ofRM(modrm) == 1)) {
   21067          /* This is really revolting, but ... since each processor
   21068             (core) only has one IDT and one GDT, just let the guest
   21069             see it (pass-through semantics).  I can't see any way to
   21070             construct a faked-up value, so don't bother to try. */
   21071          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   21072          delta += alen;
   21073          switch (gregLO3ofRM(modrm)) {
   21074             case 0: DIP("sgdt %s\n", dis_buf); break;
   21075             case 1: DIP("sidt %s\n", dis_buf); break;
   21076             default: vassert(0); /*NOTREACHED*/
   21077          }
   21078          IRDirty* d = unsafeIRDirty_0_N (
   21079                           0/*regparms*/,
   21080                           "amd64g_dirtyhelper_SxDT",
   21081                           &amd64g_dirtyhelper_SxDT,
   21082                           mkIRExprVec_2( mkexpr(addr),
   21083                                          mkU64(gregLO3ofRM(modrm)) )
   21084                       );
   21085          /* declare we're writing memory */
   21086          d->mFx   = Ifx_Write;
   21087          d->mAddr = mkexpr(addr);
   21088          d->mSize = 6;
   21089          stmt( IRStmt_Dirty(d) );
   21090          return delta;
   21091       }
   21092       /* 0F 01 D0 = XGETBV */
   21093       if (modrm == 0xD0 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
   21094          delta += 1;
   21095          DIP("xgetbv\n");
   21096          /* Fault (SEGV) if ECX isn't zero.  Intel docs say #GP and I
   21097             am not sure if that translates in to SEGV or to something
   21098             else, in user space. */
   21099          t1 = newTemp(Ity_I32);
   21100          assign( t1, getIReg32(R_RCX) );
   21101          stmt( IRStmt_Exit(binop(Iop_CmpNE32, mkexpr(t1), mkU32(0)),
   21102                            Ijk_SigSEGV,
   21103                            IRConst_U64(guest_RIP_curr_instr),
   21104                            OFFB_RIP
   21105          ));
   21106          putIRegRAX(4, mkU32(7));
   21107          putIRegRDX(4, mkU32(0));
   21108          return delta;
   21109       }
   21110       /* BEGIN HACKY SUPPORT FOR xtest */
   21111       /* 0F 01 D6 = XTEST */
   21112       if (modrm == 0xD6 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
   21113          /* Sets ZF because there never is a transaction, and all
   21114             CF, OF, SF, PF and AF are always cleared by xtest. */
   21115          delta += 1;
   21116          DIP("xtest\n");
   21117          stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   21118          stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   21119          stmt( IRStmt_Put( OFFB_CC_DEP1, mkU64(AMD64G_CC_MASK_Z) ));
   21120          /* Set NDEP even though it isn't used.  This makes redundant-PUT
   21121             elimination of previous stores to this field work better. */
   21122          stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   21123          return delta;
   21124       }
   21125       /* END HACKY SUPPORT FOR xtest */
   21126       /* 0F 01 F9 = RDTSCP */
   21127       if (modrm == 0xF9 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_RDTSCP)) {
   21128          delta += 1;
   21129          /* Uses dirty helper:
   21130             void amd64g_dirtyhelper_RDTSCP ( VexGuestAMD64State* )
   21131             declared to wr rax, rcx, rdx
   21132          */
   21133          const HChar* fName = "amd64g_dirtyhelper_RDTSCP";
   21134          void*        fAddr = &amd64g_dirtyhelper_RDTSCP;
   21135          IRDirty* d
   21136             = unsafeIRDirty_0_N ( 0/*regparms*/,
   21137                                   fName, fAddr, mkIRExprVec_1(IRExpr_BBPTR()) );
   21138          /* declare guest state effects */
   21139          d->nFxState = 3;
   21140          vex_bzero(&d->fxState, sizeof(d->fxState));
   21141          d->fxState[0].fx     = Ifx_Write;
   21142          d->fxState[0].offset = OFFB_RAX;
   21143          d->fxState[0].size   = 8;
   21144          d->fxState[1].fx     = Ifx_Write;
   21145          d->fxState[1].offset = OFFB_RCX;
   21146          d->fxState[1].size   = 8;
   21147          d->fxState[2].fx     = Ifx_Write;
   21148          d->fxState[2].offset = OFFB_RDX;
   21149          d->fxState[2].size   = 8;
   21150          /* execute the dirty call, side-effecting guest state */
   21151          stmt( IRStmt_Dirty(d) );
   21152          /* RDTSCP is a serialising insn.  So, just in case someone is
   21153             using it as a memory fence ... */
   21154          stmt( IRStmt_MBE(Imbe_Fence) );
   21155          DIP("rdtscp\n");
   21156          return delta;
   21157       }
   21158       /* else decode failed */
   21159       break;
   21160    }
   21161 
   21162    case 0x05: /* SYSCALL */
   21163       guest_RIP_next_mustcheck = True;
   21164       guest_RIP_next_assumed = guest_RIP_bbstart + delta;
   21165       putIReg64( R_RCX, mkU64(guest_RIP_next_assumed) );
   21166       /* It's important that all guest state is up-to-date
   21167          at this point.  So we declare an end-of-block here, which
   21168          forces any cached guest state to be flushed. */
   21169       jmp_lit(dres, Ijk_Sys_syscall, guest_RIP_next_assumed);
   21170       vassert(dres->whatNext == Dis_StopHere);
   21171       DIP("syscall\n");
   21172       return delta;
   21173 
   21174    case 0x0B: /* UD2 */
   21175       stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_curr_instr) ) );
   21176       jmp_lit(dres, Ijk_NoDecode, guest_RIP_curr_instr);
   21177       vassert(dres->whatNext == Dis_StopHere);
   21178       DIP("ud2\n");
   21179       return delta;
   21180 
   21181    case 0x0D: /* 0F 0D /0 -- prefetch mem8 */
   21182               /* 0F 0D /1 -- prefetchw mem8 */
   21183       if (have66orF2orF3(pfx)) goto decode_failure;
   21184       modrm = getUChar(delta);
   21185       if (epartIsReg(modrm)) goto decode_failure;
   21186       if (gregLO3ofRM(modrm) != 0 && gregLO3ofRM(modrm) != 1)
   21187          goto decode_failure;
   21188       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   21189       delta += alen;
   21190       switch (gregLO3ofRM(modrm)) {
   21191          case 0: DIP("prefetch %s\n", dis_buf); break;
   21192          case 1: DIP("prefetchw %s\n", dis_buf); break;
   21193          default: vassert(0); /*NOTREACHED*/
   21194       }
   21195       return delta;
   21196 
   21197    case 0x1F:
   21198       if (haveF2orF3(pfx)) goto decode_failure;
   21199       modrm = getUChar(delta);
   21200       if (epartIsReg(modrm)) goto decode_failure;
   21201       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   21202       delta += alen;
   21203       DIP("nop%c %s\n", nameISize(sz), dis_buf);
   21204       return delta;
   21205 
   21206    case 0x31: { /* RDTSC */
   21207       IRTemp   val  = newTemp(Ity_I64);
   21208       IRExpr** args = mkIRExprVec_0();
   21209       IRDirty* d    = unsafeIRDirty_1_N (
   21210                          val,
   21211                          0/*regparms*/,
   21212                          "amd64g_dirtyhelper_RDTSC",
   21213                          &amd64g_dirtyhelper_RDTSC,
   21214                          args
   21215                       );
   21216       if (have66orF2orF3(pfx)) goto decode_failure;
   21217       /* execute the dirty call, dumping the result in val. */
   21218       stmt( IRStmt_Dirty(d) );
   21219       putIRegRDX(4, unop(Iop_64HIto32, mkexpr(val)));
   21220       putIRegRAX(4, unop(Iop_64to32, mkexpr(val)));
   21221       DIP("rdtsc\n");
   21222       return delta;
   21223    }
   21224 
   21225    case 0x40:
   21226    case 0x41:
   21227    case 0x42: /* CMOVBb/CMOVNAEb (cmov below) */
   21228    case 0x43: /* CMOVNBb/CMOVAEb (cmov not below) */
   21229    case 0x44: /* CMOVZb/CMOVEb (cmov zero) */
   21230    case 0x45: /* CMOVNZb/CMOVNEb (cmov not zero) */
   21231    case 0x46: /* CMOVBEb/CMOVNAb (cmov below or equal) */
   21232    case 0x47: /* CMOVNBEb/CMOVAb (cmov not below or equal) */
   21233    case 0x48: /* CMOVSb (cmov negative) */
   21234    case 0x49: /* CMOVSb (cmov not negative) */
   21235    case 0x4A: /* CMOVP (cmov parity even) */
   21236    case 0x4B: /* CMOVNP (cmov parity odd) */
   21237    case 0x4C: /* CMOVLb/CMOVNGEb (cmov less) */
   21238    case 0x4D: /* CMOVGEb/CMOVNLb (cmov greater or equal) */
   21239    case 0x4E: /* CMOVLEb/CMOVNGb (cmov less or equal) */
   21240    case 0x4F: /* CMOVGb/CMOVNLEb (cmov greater) */
   21241       if (haveF2orF3(pfx)) goto decode_failure;
   21242       delta = dis_cmov_E_G(vbi, pfx, sz, (AMD64Condcode)(opc - 0x40), delta);
   21243       return delta;
   21244 
   21245    case 0x80:
   21246    case 0x81:
   21247    case 0x82:   /* JBb/JNAEb (jump below) */
   21248    case 0x83:   /* JNBb/JAEb (jump not below) */
   21249    case 0x84:   /* JZb/JEb (jump zero) */
   21250    case 0x85:   /* JNZb/JNEb (jump not zero) */
   21251    case 0x86:   /* JBEb/JNAb (jump below or equal) */
   21252    case 0x87:   /* JNBEb/JAb (jump not below or equal) */
   21253    case 0x88:   /* JSb (jump negative) */
   21254    case 0x89:   /* JSb (jump not negative) */
   21255    case 0x8A:   /* JP (jump parity even) */
   21256    case 0x8B:   /* JNP/JPO (jump parity odd) */
   21257    case 0x8C:   /* JLb/JNGEb (jump less) */
   21258    case 0x8D:   /* JGEb/JNLb (jump greater or equal) */
   21259    case 0x8E:   /* JLEb/JNGb (jump less or equal) */
   21260    case 0x8F: { /* JGb/JNLEb (jump greater) */
   21261       Long   jmpDelta;
   21262       const HChar* comment  = "";
   21263       if (haveF3(pfx)) goto decode_failure;
   21264       if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
   21265       jmpDelta = getSDisp32(delta);
   21266       d64 = (guest_RIP_bbstart+delta+4) + jmpDelta;
   21267       delta += 4;
   21268       if (resteerCisOk
   21269           && vex_control.guest_chase_cond
   21270           && (Addr64)d64 != (Addr64)guest_RIP_bbstart
   21271           && jmpDelta < 0
   21272           && resteerOkFn( callback_opaque, d64) ) {
   21273          /* Speculation: assume this backward branch is taken.  So
   21274             we need to emit a side-exit to the insn following this
   21275             one, on the negation of the condition, and continue at
   21276             the branch target address (d64).  If we wind up back at
   21277             the first instruction of the trace, just stop; it's
   21278             better to let the IR loop unroller handle that case. */
   21279          stmt( IRStmt_Exit(
   21280                   mk_amd64g_calculate_condition(
   21281                      (AMD64Condcode)(1 ^ (opc - 0x80))),
   21282                   Ijk_Boring,
   21283                   IRConst_U64(guest_RIP_bbstart+delta),
   21284                   OFFB_RIP
   21285              ));
   21286          dres->whatNext   = Dis_ResteerC;
   21287          dres->continueAt = d64;
   21288          comment = "(assumed taken)";
   21289       }
   21290       else
   21291       if (resteerCisOk
   21292           && vex_control.guest_chase_cond
   21293           && (Addr64)d64 != (Addr64)guest_RIP_bbstart
   21294           && jmpDelta >= 0
   21295           && resteerOkFn( callback_opaque, guest_RIP_bbstart+delta ) ) {
   21296          /* Speculation: assume this forward branch is not taken.
   21297             So we need to emit a side-exit to d64 (the dest) and
   21298             continue disassembling at the insn immediately
   21299             following this one. */
   21300          stmt( IRStmt_Exit(
   21301                   mk_amd64g_calculate_condition((AMD64Condcode)
   21302                                                 (opc - 0x80)),
   21303                   Ijk_Boring,
   21304                   IRConst_U64(d64),
   21305                   OFFB_RIP
   21306              ));
   21307          dres->whatNext   = Dis_ResteerC;
   21308          dres->continueAt = guest_RIP_bbstart+delta;
   21309          comment = "(assumed not taken)";
   21310       }
   21311       else {
   21312          /* Conservative default translation - end the block at
   21313             this point. */
   21314          jcc_01( dres, (AMD64Condcode)(opc - 0x80),
   21315                  guest_RIP_bbstart+delta, d64 );
   21316          vassert(dres->whatNext == Dis_StopHere);
   21317       }
   21318       DIP("j%s-32 0x%llx %s\n", name_AMD64Condcode(opc - 0x80), d64, comment);
   21319       return delta;
   21320    }
   21321 
   21322    case 0x90:
   21323    case 0x91:
   21324    case 0x92: /* set-Bb/set-NAEb (set if below) */
   21325    case 0x93: /* set-NBb/set-AEb (set if not below) */
   21326    case 0x94: /* set-Zb/set-Eb (set if zero) */
   21327    case 0x95: /* set-NZb/set-NEb (set if not zero) */
   21328    case 0x96: /* set-BEb/set-NAb (set if below or equal) */
   21329    case 0x97: /* set-NBEb/set-Ab (set if not below or equal) */
   21330    case 0x98: /* set-Sb (set if negative) */
   21331    case 0x99: /* set-Sb (set if not negative) */
   21332    case 0x9A: /* set-P (set if parity even) */
   21333    case 0x9B: /* set-NP (set if parity odd) */
   21334    case 0x9C: /* set-Lb/set-NGEb (set if less) */
   21335    case 0x9D: /* set-GEb/set-NLb (set if greater or equal) */
   21336    case 0x9E: /* set-LEb/set-NGb (set if less or equal) */
   21337    case 0x9F: /* set-Gb/set-NLEb (set if greater) */
   21338       if (haveF2orF3(pfx)) goto decode_failure;
   21339       t1 = newTemp(Ity_I8);
   21340       assign( t1, unop(Iop_1Uto8,mk_amd64g_calculate_condition(opc-0x90)) );
   21341       modrm = getUChar(delta);
   21342       if (epartIsReg(modrm)) {
   21343          delta++;
   21344          putIRegE(1, pfx, modrm, mkexpr(t1));
   21345          DIP("set%s %s\n", name_AMD64Condcode(opc-0x90),
   21346                            nameIRegE(1,pfx,modrm));
   21347       } else {
   21348          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   21349          delta += alen;
   21350          storeLE( mkexpr(addr), mkexpr(t1) );
   21351          DIP("set%s %s\n", name_AMD64Condcode(opc-0x90), dis_buf);
   21352       }
   21353       return delta;
   21354 
   21355    case 0x1A:
   21356    case 0x1B: { /* Future MPX instructions, currently NOPs.
   21357                    BNDMK b, m     F3 0F 1B
   21358                    BNDCL b, r/m   F3 0F 1A
   21359                    BNDCU b, r/m   F2 0F 1A
   21360                    BNDCN b, r/m   F2 0F 1B
   21361                    BNDMOV b, b/m  66 0F 1A
   21362                    BNDMOV b/m, b  66 0F 1B
   21363                    BNDLDX b, mib     0F 1A
   21364                    BNDSTX mib, b     0F 1B */
   21365 
   21366       /* All instructions have two operands. One operand is always the
   21367          bnd register number (bnd0-bnd3, other register numbers are
   21368          ignored when MPX isn't enabled, but should generate an
   21369          exception if MPX is enabled) given by gregOfRexRM. The other
   21370          operand is either a ModRM:reg, ModRM:r/m or a SIB encoded
   21371          address, all of which can be decoded by using either
   21372          eregOfRexRM or disAMode. */
   21373 
   21374       modrm = getUChar(delta);
   21375       int bnd = gregOfRexRM(pfx,modrm);
   21376       const HChar *oper;
   21377       if (epartIsReg(modrm)) {
   21378          oper = nameIReg64 (eregOfRexRM(pfx,modrm));
   21379          delta += 1;
   21380       } else {
   21381          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   21382          delta += alen;
   21383          oper = dis_buf;
   21384       }
   21385 
   21386       if (haveF3no66noF2 (pfx)) {
   21387          if (opc == 0x1B) {
   21388             DIP ("bndmk %s, %%bnd%d\n", oper, bnd);
   21389          } else /* opc == 0x1A */ {
   21390             DIP ("bndcl %s, %%bnd%d\n", oper, bnd);
   21391          }
   21392       } else if (haveF2no66noF3 (pfx)) {
   21393          if (opc == 0x1A) {
   21394             DIP ("bndcu %s, %%bnd%d\n", oper, bnd);
   21395          } else /* opc == 0x1B */ {
   21396             DIP ("bndcn %s, %%bnd%d\n", oper, bnd);
   21397          }
   21398       } else if (have66noF2noF3 (pfx)) {
   21399          if (opc == 0x1A) {
   21400             DIP ("bndmov %s, %%bnd%d\n", oper, bnd);
   21401          } else /* opc == 0x1B */ {
   21402             DIP ("bndmov %%bnd%d, %s\n", bnd, oper);
   21403          }
   21404       } else if (haveNo66noF2noF3 (pfx)) {
   21405          if (opc == 0x1A) {
   21406             DIP ("bndldx %s, %%bnd%d\n", oper, bnd);
   21407          } else /* opc == 0x1B */ {
   21408             DIP ("bndstx %%bnd%d, %s\n", bnd, oper);
   21409          }
   21410       } else goto decode_failure;
   21411 
   21412       return delta;
   21413    }
   21414 
   21415    case 0xA2: { /* CPUID */
   21416       /* Uses dirty helper:
   21417             void amd64g_dirtyhelper_CPUID ( VexGuestAMD64State* )
   21418          declared to mod rax, wr rbx, rcx, rdx
   21419       */
   21420       IRDirty* d     = NULL;
   21421       const HChar*   fName = NULL;
   21422       void*    fAddr = NULL;
   21423       if (haveF2orF3(pfx)) goto decode_failure;
   21424       /* This isn't entirely correct, CPUID should depend on the VEX
   21425          capabilities, not on the underlying CPU. See bug #324882. */
   21426       if ((archinfo->hwcaps & VEX_HWCAPS_AMD64_SSE3) &&
   21427           (archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16) &&
   21428           (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
   21429          fName = "amd64g_dirtyhelper_CPUID_avx_and_cx16";
   21430          fAddr = &amd64g_dirtyhelper_CPUID_avx_and_cx16;
   21431          /* This is a Core-i5-2300-like machine */
   21432       }
   21433       else if ((archinfo->hwcaps & VEX_HWCAPS_AMD64_SSE3) &&
   21434                (archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16)) {
   21435          fName = "amd64g_dirtyhelper_CPUID_sse42_and_cx16";
   21436          fAddr = &amd64g_dirtyhelper_CPUID_sse42_and_cx16;
   21437          /* This is a Core-i5-670-like machine */
   21438       }
   21439       else {
   21440          /* Give a CPUID for at least a baseline machine, SSE2
   21441             only, and no CX16 */
   21442          fName = "amd64g_dirtyhelper_CPUID_baseline";
   21443          fAddr = &amd64g_dirtyhelper_CPUID_baseline;
   21444       }
   21445 
   21446       vassert(fName); vassert(fAddr);
   21447       d = unsafeIRDirty_0_N ( 0/*regparms*/,
   21448                               fName, fAddr, mkIRExprVec_1(IRExpr_BBPTR()) );
   21449       /* declare guest state effects */
   21450       d->nFxState = 4;
   21451       vex_bzero(&d->fxState, sizeof(d->fxState));
   21452       d->fxState[0].fx     = Ifx_Modify;
   21453       d->fxState[0].offset = OFFB_RAX;
   21454       d->fxState[0].size   = 8;
   21455       d->fxState[1].fx     = Ifx_Write;
   21456       d->fxState[1].offset = OFFB_RBX;
   21457       d->fxState[1].size   = 8;
   21458       d->fxState[2].fx     = Ifx_Modify;
   21459       d->fxState[2].offset = OFFB_RCX;
   21460       d->fxState[2].size   = 8;
   21461       d->fxState[3].fx     = Ifx_Write;
   21462       d->fxState[3].offset = OFFB_RDX;
   21463       d->fxState[3].size   = 8;
   21464       /* execute the dirty call, side-effecting guest state */
   21465       stmt( IRStmt_Dirty(d) );
   21466       /* CPUID is a serialising insn.  So, just in case someone is
   21467          using it as a memory fence ... */
   21468       stmt( IRStmt_MBE(Imbe_Fence) );
   21469       DIP("cpuid\n");
   21470       return delta;
   21471    }
   21472 
   21473    case 0xA3: { /* BT Gv,Ev */
   21474       /* We let dis_bt_G_E decide whether F2 or F3 are allowable. */
   21475       Bool ok = True;
   21476       if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
   21477       delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpNone, &ok );
   21478       if (!ok) goto decode_failure;
   21479       return delta;
   21480    }
   21481 
   21482    case 0xA4: /* SHLDv imm8,Gv,Ev */
   21483       modrm = getUChar(delta);
   21484       d64   = delta + lengthAMode(pfx, delta);
   21485       vex_sprintf(dis_buf, "$%d", (Int)getUChar(d64));
   21486       delta = dis_SHLRD_Gv_Ev (
   21487                  vbi, pfx, delta, modrm, sz,
   21488                  mkU8(getUChar(d64)), True, /* literal */
   21489                  dis_buf, True /* left */ );
   21490       return delta;
   21491 
   21492    case 0xA5: /* SHLDv %cl,Gv,Ev */
   21493       modrm = getUChar(delta);
   21494       delta = dis_SHLRD_Gv_Ev (
   21495                  vbi, pfx, delta, modrm, sz,
   21496                  getIRegCL(), False, /* not literal */
   21497                  "%cl", True /* left */ );
   21498       return delta;
   21499 
   21500    case 0xAB: { /* BTS Gv,Ev */
   21501       /* We let dis_bt_G_E decide whether F2 or F3 are allowable. */
   21502       Bool ok = True;
   21503       if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
   21504       delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpSet, &ok );
   21505       if (!ok) goto decode_failure;
   21506       return delta;
   21507    }
   21508 
   21509    case 0xAC: /* SHRDv imm8,Gv,Ev */
   21510       modrm = getUChar(delta);
   21511       d64   = delta + lengthAMode(pfx, delta);
   21512       vex_sprintf(dis_buf, "$%d", (Int)getUChar(d64));
   21513       delta = dis_SHLRD_Gv_Ev (
   21514                  vbi, pfx, delta, modrm, sz,
   21515                  mkU8(getUChar(d64)), True, /* literal */
   21516                  dis_buf, False /* right */ );
   21517       return delta;
   21518 
   21519    case 0xAD: /* SHRDv %cl,Gv,Ev */
   21520       modrm = getUChar(delta);
   21521       delta = dis_SHLRD_Gv_Ev (
   21522                  vbi, pfx, delta, modrm, sz,
   21523                  getIRegCL(), False, /* not literal */
   21524                  "%cl", False /* right */);
   21525       return delta;
   21526 
   21527    case 0xAF: /* IMUL Ev, Gv */
   21528       if (haveF2orF3(pfx)) goto decode_failure;
   21529       delta = dis_mul_E_G ( vbi, pfx, sz, delta );
   21530       return delta;
   21531 
   21532    case 0xB0: { /* CMPXCHG Gb,Eb */
   21533       Bool ok = True;
   21534       /* We let dis_cmpxchg_G_E decide whether F2 or F3 are allowable. */
   21535       delta = dis_cmpxchg_G_E ( &ok, vbi, pfx, 1, delta );
   21536       if (!ok) goto decode_failure;
   21537       return delta;
   21538    }
   21539 
   21540    case 0xB1: { /* CMPXCHG Gv,Ev (allowed in 16,32,64 bit) */
   21541       Bool ok = True;
   21542       /* We let dis_cmpxchg_G_E decide whether F2 or F3 are allowable. */
   21543       if (sz != 2 && sz != 4 && sz != 8) goto decode_failure;
   21544       delta = dis_cmpxchg_G_E ( &ok, vbi, pfx, sz, delta );
   21545       if (!ok) goto decode_failure;
   21546       return delta;
   21547    }
   21548 
   21549    case 0xB3: { /* BTR Gv,Ev */
   21550       /* We let dis_bt_G_E decide whether F2 or F3 are allowable. */
   21551       Bool ok = True;
   21552       if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
   21553       delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpReset, &ok );
   21554       if (!ok) goto decode_failure;
   21555       return delta;
   21556    }
   21557 
   21558    case 0xB6: /* MOVZXb Eb,Gv */
   21559       if (haveF2orF3(pfx)) goto decode_failure;
   21560       if (sz != 2 && sz != 4 && sz != 8)
   21561          goto decode_failure;
   21562       delta = dis_movx_E_G ( vbi, pfx, delta, 1, sz, False );
   21563       return delta;
   21564 
   21565    case 0xB7: /* MOVZXw Ew,Gv */
   21566       if (haveF2orF3(pfx)) goto decode_failure;
   21567       if (sz != 4 && sz != 8)
   21568          goto decode_failure;
   21569       delta = dis_movx_E_G ( vbi, pfx, delta, 2, sz, False );
   21570       return delta;
   21571 
   21572    case 0xBA: { /* Grp8 Ib,Ev */
   21573       /* We let dis_Grp8_Imm decide whether F2 or F3 are allowable. */
   21574       Bool decode_OK = False;
   21575       modrm = getUChar(delta);
   21576       am_sz = lengthAMode(pfx,delta);
   21577       d64   = getSDisp8(delta + am_sz);
   21578       delta = dis_Grp8_Imm ( vbi, pfx, delta, modrm, am_sz, sz, d64,
   21579                              &decode_OK );
   21580       if (!decode_OK)
   21581          goto decode_failure;
   21582       return delta;
   21583    }
   21584 
   21585    case 0xBB: { /* BTC Gv,Ev */
   21586       /* We let dis_bt_G_E decide whether F2 or F3 are allowable. */
   21587       Bool ok = False;
   21588       if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
   21589       delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpComp, &ok );
   21590       if (!ok) goto decode_failure;
   21591       return delta;
   21592    }
   21593 
   21594    case 0xBC: /* BSF Gv,Ev */
   21595       if (!haveF2orF3(pfx)
   21596           || (haveF3noF2(pfx)
   21597               && 0 == (archinfo->hwcaps & VEX_HWCAPS_AMD64_BMI))) {
   21598          /* no-F2 no-F3 0F BC = BSF
   21599                   or F3 0F BC = REP; BSF on older CPUs.  */
   21600          delta = dis_bs_E_G ( vbi, pfx, sz, delta, True );
   21601          return delta;
   21602       }
   21603       /* Fall through, since F3 0F BC is TZCNT, and needs to
   21604          be handled by dis_ESC_0F__SSE4. */
   21605       break;
   21606 
   21607    case 0xBD: /* BSR Gv,Ev */
   21608       if (!haveF2orF3(pfx)
   21609           || (haveF3noF2(pfx)
   21610               && 0 == (archinfo->hwcaps & VEX_HWCAPS_AMD64_LZCNT))) {
   21611          /* no-F2 no-F3 0F BD = BSR
   21612                   or F3 0F BD = REP; BSR on older CPUs.  */
   21613          delta = dis_bs_E_G ( vbi, pfx, sz, delta, False );
   21614          return delta;
   21615       }
   21616       /* Fall through, since F3 0F BD is LZCNT, and needs to
   21617          be handled by dis_ESC_0F__SSE4. */
   21618       break;
   21619 
   21620    case 0xBE: /* MOVSXb Eb,Gv */
   21621       if (haveF2orF3(pfx)) goto decode_failure;
   21622       if (sz != 2 && sz != 4 && sz != 8)
   21623          goto decode_failure;
   21624       delta = dis_movx_E_G ( vbi, pfx, delta, 1, sz, True );
   21625       return delta;
   21626 
   21627    case 0xBF: /* MOVSXw Ew,Gv */
   21628       if (haveF2orF3(pfx)) goto decode_failure;
   21629       if (sz != 4 && sz != 8)
   21630          goto decode_failure;
   21631       delta = dis_movx_E_G ( vbi, pfx, delta, 2, sz, True );
   21632       return delta;
   21633 
   21634    case 0xC0: { /* XADD Gb,Eb */
   21635       Bool decode_OK = False;
   21636       delta = dis_xadd_G_E ( &decode_OK, vbi, pfx, 1, delta );
   21637       if (!decode_OK)
   21638          goto decode_failure;
   21639       return delta;
   21640    }
   21641 
   21642    case 0xC1: { /* XADD Gv,Ev */
   21643       Bool decode_OK = False;
   21644       delta = dis_xadd_G_E ( &decode_OK, vbi, pfx, sz, delta );
   21645       if (!decode_OK)
   21646          goto decode_failure;
   21647       return delta;
   21648    }
   21649 
   21650    case 0xC7: { /* CMPXCHG8B Ev, CMPXCHG16B Ev */
   21651       IRType  elemTy     = sz==4 ? Ity_I32 : Ity_I64;
   21652       IRTemp  expdHi     = newTemp(elemTy);
   21653       IRTemp  expdLo     = newTemp(elemTy);
   21654       IRTemp  dataHi     = newTemp(elemTy);
   21655       IRTemp  dataLo     = newTemp(elemTy);
   21656       IRTemp  oldHi      = newTemp(elemTy);
   21657       IRTemp  oldLo      = newTemp(elemTy);
   21658       IRTemp  flags_old  = newTemp(Ity_I64);
   21659       IRTemp  flags_new  = newTemp(Ity_I64);
   21660       IRTemp  success    = newTemp(Ity_I1);
   21661       IROp    opOR       = sz==4 ? Iop_Or32    : Iop_Or64;
   21662       IROp    opXOR      = sz==4 ? Iop_Xor32   : Iop_Xor64;
   21663       IROp    opCasCmpEQ = sz==4 ? Iop_CasCmpEQ32 : Iop_CasCmpEQ64;
   21664       IRExpr* zero       = sz==4 ? mkU32(0)    : mkU64(0);
   21665       IRTemp expdHi64    = newTemp(Ity_I64);
   21666       IRTemp expdLo64    = newTemp(Ity_I64);
   21667 
   21668       /* Translate this using a DCAS, even if there is no LOCK
   21669          prefix.  Life is too short to bother with generating two
   21670          different translations for the with/without-LOCK-prefix
   21671          cases. */
   21672       *expect_CAS = True;
   21673 
   21674       /* Decode, and generate address. */
   21675       if (have66(pfx)) goto decode_failure;
   21676       if (sz != 4 && sz != 8) goto decode_failure;
   21677       if (sz == 8 && !(archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16))
   21678          goto decode_failure;
   21679       modrm = getUChar(delta);
   21680       if (epartIsReg(modrm)) goto decode_failure;
   21681       if (gregLO3ofRM(modrm) != 1) goto decode_failure;
   21682       if (haveF2orF3(pfx)) {
   21683          /* Since the e-part is memory only, F2 or F3 (one or the
   21684             other) is acceptable if LOCK is also present.  But only
   21685             for cmpxchg8b. */
   21686          if (sz == 8) goto decode_failure;
   21687          if (haveF2andF3(pfx) || !haveLOCK(pfx)) goto decode_failure;
   21688       }
   21689 
   21690       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   21691       delta += alen;
   21692 
   21693       /* cmpxchg16b requires an alignment check. */
   21694       if (sz == 8)
   21695          gen_SEGV_if_not_16_aligned( addr );
   21696 
   21697       /* Get the expected and new values. */
   21698       assign( expdHi64, getIReg64(R_RDX) );
   21699       assign( expdLo64, getIReg64(R_RAX) );
   21700 
   21701       /* These are the correctly-sized expected and new values.
   21702          However, we also get expdHi64/expdLo64 above as 64-bits
   21703          regardless, because we will need them later in the 32-bit
   21704          case (paradoxically). */
   21705       assign( expdHi, sz==4 ? unop(Iop_64to32, mkexpr(expdHi64))
   21706                             : mkexpr(expdHi64) );
   21707       assign( expdLo, sz==4 ? unop(Iop_64to32, mkexpr(expdLo64))
   21708                             : mkexpr(expdLo64) );
   21709       assign( dataHi, sz==4 ? getIReg32(R_RCX) : getIReg64(R_RCX) );
   21710       assign( dataLo, sz==4 ? getIReg32(R_RBX) : getIReg64(R_RBX) );
   21711 
   21712       /* Do the DCAS */
   21713       stmt( IRStmt_CAS(
   21714                mkIRCAS( oldHi, oldLo,
   21715                         Iend_LE, mkexpr(addr),
   21716                         mkexpr(expdHi), mkexpr(expdLo),
   21717                         mkexpr(dataHi), mkexpr(dataLo)
   21718             )));
   21719 
   21720       /* success when oldHi:oldLo == expdHi:expdLo */
   21721       assign( success,
   21722               binop(opCasCmpEQ,
   21723                     binop(opOR,
   21724                           binop(opXOR, mkexpr(oldHi), mkexpr(expdHi)),
   21725                           binop(opXOR, mkexpr(oldLo), mkexpr(expdLo))
   21726                     ),
   21727                     zero
   21728               ));
   21729 
   21730       /* If the DCAS is successful, that is to say oldHi:oldLo ==
   21731          expdHi:expdLo, then put expdHi:expdLo back in RDX:RAX,
   21732          which is where they came from originally.  Both the actual
   21733          contents of these two regs, and any shadow values, are
   21734          unchanged.  If the DCAS fails then we're putting into
   21735          RDX:RAX the value seen in memory. */
   21736       /* Now of course there's a complication in the 32-bit case
   21737          (bah!): if the DCAS succeeds, we need to leave RDX:RAX
   21738          unchanged; but if we use the same scheme as in the 64-bit
   21739          case, we get hit by the standard rule that a write to the
   21740          bottom 32 bits of an integer register zeros the upper 32
   21741          bits.  And so the upper halves of RDX and RAX mysteriously
   21742          become zero.  So we have to stuff back in the original
   21743          64-bit values which we previously stashed in
   21744          expdHi64:expdLo64, even if we're doing a cmpxchg8b. */
   21745       /* It's just _so_ much fun ... */
   21746       putIRegRDX( 8,
   21747                   IRExpr_ITE( mkexpr(success),
   21748                               mkexpr(expdHi64),
   21749                               sz == 4 ? unop(Iop_32Uto64, mkexpr(oldHi))
   21750                                       : mkexpr(oldHi)
   21751                 ));
   21752       putIRegRAX( 8,
   21753                   IRExpr_ITE( mkexpr(success),
   21754                               mkexpr(expdLo64),
   21755                               sz == 4 ? unop(Iop_32Uto64, mkexpr(oldLo))
   21756                                       : mkexpr(oldLo)
   21757                 ));
   21758 
   21759       /* Copy the success bit into the Z flag and leave the others
   21760          unchanged */
   21761       assign( flags_old, widenUto64(mk_amd64g_calculate_rflags_all()));
   21762       assign(
   21763          flags_new,
   21764          binop(Iop_Or64,
   21765                binop(Iop_And64, mkexpr(flags_old),
   21766                                 mkU64(~AMD64G_CC_MASK_Z)),
   21767                binop(Iop_Shl64,
   21768                      binop(Iop_And64,
   21769                            unop(Iop_1Uto64, mkexpr(success)), mkU64(1)),
   21770                      mkU8(AMD64G_CC_SHIFT_Z)) ));
   21771 
   21772       stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   21773       stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(flags_new) ));
   21774       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   21775       /* Set NDEP even though it isn't used.  This makes
   21776          redundant-PUT elimination of previous stores to this field
   21777          work better. */
   21778       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   21779 
   21780       /* Sheesh.  Aren't you glad it was me and not you that had to
   21781          write and validate all this grunge? */
   21782 
   21783       DIP("cmpxchg8b %s\n", dis_buf);
   21784       return delta;
   21785    }
   21786 
   21787    case 0xC8: /* BSWAP %eax */
   21788    case 0xC9:
   21789    case 0xCA:
   21790    case 0xCB:
   21791    case 0xCC:
   21792    case 0xCD:
   21793    case 0xCE:
   21794    case 0xCF: /* BSWAP %edi */
   21795       if (haveF2orF3(pfx)) goto decode_failure;
   21796       /* According to the AMD64 docs, this insn can have size 4 or
   21797          8. */
   21798       if (sz == 4) {
   21799          t1 = newTemp(Ity_I32);
   21800          assign( t1, getIRegRexB(4, pfx, opc-0xC8) );
   21801          t2 = math_BSWAP( t1, Ity_I32 );
   21802          putIRegRexB(4, pfx, opc-0xC8, mkexpr(t2));
   21803          DIP("bswapl %s\n", nameIRegRexB(4, pfx, opc-0xC8));
   21804          return delta;
   21805       }
   21806       if (sz == 8) {
   21807          t1 = newTemp(Ity_I64);
   21808          t2 = newTemp(Ity_I64);
   21809          assign( t1, getIRegRexB(8, pfx, opc-0xC8) );
   21810          t2 = math_BSWAP( t1, Ity_I64 );
   21811          putIRegRexB(8, pfx, opc-0xC8, mkexpr(t2));
   21812          DIP("bswapq %s\n", nameIRegRexB(8, pfx, opc-0xC8));
   21813          return delta;
   21814       }
   21815       goto decode_failure;
   21816 
   21817    default:
   21818       break;
   21819 
   21820    } /* first switch */
   21821 
   21822 
   21823    /* =-=-=-=-=-=-=-=-= MMXery =-=-=-=-=-=-=-=-= */
   21824    /* In the second switch, pick off MMX insns. */
   21825 
   21826    if (!have66orF2orF3(pfx)) {
   21827       /* So there's no SIMD prefix. */
   21828 
   21829       vassert(sz == 4 || sz == 8);
   21830 
   21831       switch (opc) { /* second switch */
   21832 
   21833       case 0x71:
   21834       case 0x72:
   21835       case 0x73: /* PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
   21836 
   21837       case 0x6E: /* MOVD (src)ireg-or-mem, (dst)mmxreg */
   21838       case 0x7E: /* MOVD (src)mmxreg, (dst)ireg-or-mem */
   21839       case 0x7F: /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
   21840       case 0x6F: /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
   21841 
   21842       case 0xFC:
   21843       case 0xFD:
   21844       case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
   21845 
   21846       case 0xEC:
   21847       case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
   21848 
   21849       case 0xDC:
   21850       case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   21851 
   21852       case 0xF8:
   21853       case 0xF9:
   21854       case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
   21855 
   21856       case 0xE8:
   21857       case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
   21858 
   21859       case 0xD8:
   21860       case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   21861 
   21862       case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
   21863       case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
   21864 
   21865       case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
   21866 
   21867       case 0x74:
   21868       case 0x75:
   21869       case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
   21870 
   21871       case 0x64:
   21872       case 0x65:
   21873       case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
   21874 
   21875       case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
   21876       case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
   21877       case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
   21878 
   21879       case 0x68:
   21880       case 0x69:
   21881       case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
   21882 
   21883       case 0x60:
   21884       case 0x61:
   21885       case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
   21886 
   21887       case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
   21888       case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
   21889       case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
   21890       case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
   21891 
   21892       case 0xF1: /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
   21893       case 0xF2:
   21894       case 0xF3:
   21895 
   21896       case 0xD1: /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
   21897       case 0xD2:
   21898       case 0xD3:
   21899 
   21900       case 0xE1: /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
   21901       case 0xE2: {
   21902          Bool decode_OK = False;
   21903          delta = dis_MMX ( &decode_OK, vbi, pfx, sz, deltaIN );
   21904          if (decode_OK)
   21905             return delta;
   21906          goto decode_failure;
   21907       }
   21908 
   21909       default:
   21910          break;
   21911       } /* second switch */
   21912 
   21913    }
   21914 
   21915    /* A couple of MMX corner cases */
   21916    if (opc == 0x0E/* FEMMS */ || opc == 0x77/* EMMS */) {
   21917       if (sz != 4)
   21918          goto decode_failure;
   21919       do_EMMS_preamble();
   21920       DIP("{f}emms\n");
   21921       return delta;
   21922    }
   21923 
   21924    /* =-=-=-=-=-=-=-=-= SSE2ery =-=-=-=-=-=-=-=-= */
   21925    /* Perhaps it's an SSE or SSE2 instruction.  We can try this
   21926       without checking the guest hwcaps because SSE2 is a baseline
   21927       facility in 64 bit mode. */
   21928    {
   21929       Bool decode_OK = False;
   21930       delta = dis_ESC_0F__SSE2 ( &decode_OK, vbi, pfx, sz, deltaIN, dres );
   21931       if (decode_OK)
   21932          return delta;
   21933    }
   21934 
   21935    /* =-=-=-=-=-=-=-=-= SSE3ery =-=-=-=-=-=-=-=-= */
   21936    /* Perhaps it's a SSE3 instruction.  FIXME: check guest hwcaps
   21937       first. */
   21938    {
   21939       Bool decode_OK = False;
   21940       delta = dis_ESC_0F__SSE3 ( &decode_OK, vbi, pfx, sz, deltaIN );
   21941       if (decode_OK)
   21942          return delta;
   21943    }
   21944 
   21945    /* =-=-=-=-=-=-=-=-= SSE4ery =-=-=-=-=-=-=-=-= */
   21946    /* Perhaps it's a SSE4 instruction.  FIXME: check guest hwcaps
   21947       first. */
   21948    {
   21949       Bool decode_OK = False;
   21950       delta = dis_ESC_0F__SSE4 ( &decode_OK,
   21951                                  archinfo, vbi, pfx, sz, deltaIN );
   21952       if (decode_OK)
   21953          return delta;
   21954    }
   21955 
   21956   decode_failure:
   21957    return deltaIN; /* fail */
   21958 }
   21959 
   21960 
   21961 /*------------------------------------------------------------*/
   21962 /*---                                                      ---*/
   21963 /*--- Top-level post-escape decoders: dis_ESC_0F38         ---*/
   21964 /*---                                                      ---*/
   21965 /*------------------------------------------------------------*/
   21966 
   21967 __attribute__((noinline))
   21968 static
   21969 Long dis_ESC_0F38 (
   21970         /*MB_OUT*/DisResult* dres,
   21971         Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
   21972         Bool         resteerCisOk,
   21973         void*        callback_opaque,
   21974         VexArchInfo* archinfo,
   21975         VexAbiInfo*  vbi,
   21976         Prefix pfx, Int sz, Long deltaIN
   21977      )
   21978 {
   21979    Long   delta = deltaIN;
   21980    UChar  opc   = getUChar(delta);
   21981    delta++;
   21982    switch (opc) {
   21983 
   21984    case 0xF0:   /* 0F 38 F0 = MOVBE m16/32/64(E), r16/32/64(G) */
   21985    case 0xF1: { /* 0F 38 F1 = MOVBE r16/32/64(G), m16/32/64(E) */
   21986       if (!haveF2orF3(pfx) && !haveVEX(pfx)
   21987           && (sz == 2 || sz == 4 || sz == 8)) {
   21988          IRTemp addr  = IRTemp_INVALID;
   21989          UChar  modrm = 0;
   21990          Int    alen  = 0;
   21991          HChar  dis_buf[50];
   21992          modrm = getUChar(delta);
   21993          if (epartIsReg(modrm)) break;
   21994          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   21995          delta += alen;
   21996          IRType ty = szToITy(sz);
   21997          IRTemp src = newTemp(ty);
   21998          if (opc == 0xF0) { /* LOAD */
   21999             assign(src, loadLE(ty, mkexpr(addr)));
   22000             IRTemp dst = math_BSWAP(src, ty);
   22001             putIRegG(sz, pfx, modrm, mkexpr(dst));
   22002             DIP("movbe %s,%s\n", dis_buf, nameIRegG(sz, pfx, modrm));
   22003          } else { /* STORE */
   22004             assign(src, getIRegG(sz, pfx, modrm));
   22005             IRTemp dst = math_BSWAP(src, ty);
   22006             storeLE(mkexpr(addr), mkexpr(dst));
   22007             DIP("movbe %s,%s\n", nameIRegG(sz, pfx, modrm), dis_buf);
   22008          }
   22009          return delta;
   22010       }
   22011       /* else fall through; maybe one of the decoders below knows what
   22012          it is. */
   22013       break;
   22014    }
   22015 
   22016    default:
   22017       break;
   22018 
   22019    }
   22020 
   22021    /* =-=-=-=-=-=-=-=-= SSSE3ery =-=-=-=-=-=-=-=-= */
   22022    /* Perhaps it's an SSSE3 instruction.  FIXME: consult guest hwcaps
   22023       rather than proceeding indiscriminately. */
   22024    {
   22025       Bool decode_OK = False;
   22026       delta = dis_ESC_0F38__SupSSE3 ( &decode_OK, vbi, pfx, sz, deltaIN );
   22027       if (decode_OK)
   22028          return delta;
   22029    }
   22030 
   22031    /* =-=-=-=-=-=-=-=-= SSE4ery =-=-=-=-=-=-=-=-= */
   22032    /* Perhaps it's an SSE4 instruction.  FIXME: consult guest hwcaps
   22033       rather than proceeding indiscriminately. */
   22034    {
   22035       Bool decode_OK = False;
   22036       delta = dis_ESC_0F38__SSE4 ( &decode_OK, vbi, pfx, sz, deltaIN );
   22037       if (decode_OK)
   22038          return delta;
   22039    }
   22040 
   22041   /*decode_failure:*/
   22042    return deltaIN; /* fail */
   22043 }
   22044 
   22045 
   22046 /*------------------------------------------------------------*/
   22047 /*---                                                      ---*/
   22048 /*--- Top-level post-escape decoders: dis_ESC_0F3A         ---*/
   22049 /*---                                                      ---*/
   22050 /*------------------------------------------------------------*/
   22051 
   22052 __attribute__((noinline))
   22053 static
   22054 Long dis_ESC_0F3A (
   22055         /*MB_OUT*/DisResult* dres,
   22056         Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
   22057         Bool         resteerCisOk,
   22058         void*        callback_opaque,
   22059         VexArchInfo* archinfo,
   22060         VexAbiInfo*  vbi,
   22061         Prefix pfx, Int sz, Long deltaIN
   22062      )
   22063 {
   22064    Long   delta = deltaIN;
   22065    UChar  opc   = getUChar(delta);
   22066    delta++;
   22067    switch (opc) {
   22068 
   22069    default:
   22070       break;
   22071 
   22072    }
   22073 
   22074    /* =-=-=-=-=-=-=-=-= SSSE3ery =-=-=-=-=-=-=-=-= */
   22075    /* Perhaps it's an SSSE3 instruction.  FIXME: consult guest hwcaps
   22076       rather than proceeding indiscriminately. */
   22077    {
   22078       Bool decode_OK = False;
   22079       delta = dis_ESC_0F3A__SupSSE3 ( &decode_OK, vbi, pfx, sz, deltaIN );
   22080       if (decode_OK)
   22081          return delta;
   22082    }
   22083 
   22084    /* =-=-=-=-=-=-=-=-= SSE4ery =-=-=-=-=-=-=-=-= */
   22085    /* Perhaps it's an SSE4 instruction.  FIXME: consult guest hwcaps
   22086       rather than proceeding indiscriminately. */
   22087    {
   22088       Bool decode_OK = False;
   22089       delta = dis_ESC_0F3A__SSE4 ( &decode_OK, vbi, pfx, sz, deltaIN );
   22090       if (decode_OK)
   22091          return delta;
   22092    }
   22093 
   22094    return deltaIN; /* fail */
   22095 }
   22096 
   22097 
   22098 /*------------------------------------------------------------*/
   22099 /*---                                                      ---*/
   22100 /*--- Top-level post-escape decoders: dis_ESC_0F__VEX      ---*/
   22101 /*---                                                      ---*/
   22102 /*------------------------------------------------------------*/
   22103 
   22104 /* FIXME: common up with the _256_ version below? */
   22105 static
   22106 Long dis_VEX_NDS_128_AnySimdPfx_0F_WIG (
   22107         /*OUT*/Bool* uses_vvvv, VexAbiInfo* vbi,
   22108         Prefix pfx, Long delta, const HChar* name,
   22109         /* The actual operation.  Use either 'op' or 'opfn',
   22110            but not both. */
   22111         IROp op, IRTemp(*opFn)(IRTemp,IRTemp),
   22112         Bool invertLeftArg,
   22113         Bool swapArgs
   22114      )
   22115 {
   22116    UChar  modrm = getUChar(delta);
   22117    UInt   rD    = gregOfRexRM(pfx, modrm);
   22118    UInt   rSL   = getVexNvvvv(pfx);
   22119    IRTemp tSL   = newTemp(Ity_V128);
   22120    IRTemp tSR   = newTemp(Ity_V128);
   22121    IRTemp addr  = IRTemp_INVALID;
   22122    HChar  dis_buf[50];
   22123    Int    alen  = 0;
   22124    vassert(0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*WIG?*/);
   22125 
   22126    assign(tSL, invertLeftArg ? unop(Iop_NotV128, getXMMReg(rSL))
   22127                              : getXMMReg(rSL));
   22128 
   22129    if (epartIsReg(modrm)) {
   22130       UInt rSR = eregOfRexRM(pfx, modrm);
   22131       delta += 1;
   22132       assign(tSR, getXMMReg(rSR));
   22133       DIP("%s %s,%s,%s\n",
   22134           name, nameXMMReg(rSR), nameXMMReg(rSL), nameXMMReg(rD));
   22135    } else {
   22136       addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   22137       delta += alen;
   22138       assign(tSR, loadLE(Ity_V128, mkexpr(addr)));
   22139       DIP("%s %s,%s,%s\n",
   22140           name, dis_buf, nameXMMReg(rSL), nameXMMReg(rD));
   22141    }
   22142 
   22143    IRTemp res = IRTemp_INVALID;
   22144    if (op != Iop_INVALID) {
   22145       vassert(opFn == NULL);
   22146       res = newTemp(Ity_V128);
   22147       if (requiresRMode(op)) {
   22148          IRTemp rm = newTemp(Ity_I32);
   22149          assign(rm, get_FAKE_roundingmode()); /* XXXROUNDINGFIXME */
   22150          assign(res, swapArgs
   22151                         ? triop(op, mkexpr(rm), mkexpr(tSR), mkexpr(tSL))
   22152                         : triop(op, mkexpr(rm), mkexpr(tSL), mkexpr(tSR)));
   22153       } else {
   22154          assign(res, swapArgs
   22155                         ? binop(op, mkexpr(tSR), mkexpr(tSL))
   22156                         : binop(op, mkexpr(tSL), mkexpr(tSR)));
   22157       }
   22158    } else {
   22159       vassert(opFn != NULL);
   22160       res = swapArgs ? opFn(tSR, tSL) : opFn(tSL, tSR);
   22161    }
   22162 
   22163    putYMMRegLoAndZU(rD, mkexpr(res));
   22164 
   22165    *uses_vvvv = True;
   22166    return delta;
   22167 }
   22168 
   22169 
   22170 /* Handle a VEX_NDS_128_66_0F_WIG (3-addr) insn, with a simple IROp
   22171    for the operation, no inversion of the left arg, and no swapping of
   22172    args. */
   22173 static
   22174 Long dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple (
   22175         /*OUT*/Bool* uses_vvvv, VexAbiInfo* vbi,
   22176         Prefix pfx, Long delta, const HChar* name,
   22177         IROp op
   22178      )
   22179 {
   22180    return dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   22181              uses_vvvv, vbi, pfx, delta, name, op, NULL, False, False);
   22182 }
   22183 
   22184 
   22185 /* Handle a VEX_NDS_128_66_0F_WIG (3-addr) insn, using the given IR
   22186    generator to compute the result, no inversion of the left
   22187    arg, and no swapping of args. */
   22188 static
   22189 Long dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex (
   22190         /*OUT*/Bool* uses_vvvv, VexAbiInfo* vbi,
   22191         Prefix pfx, Long delta, const HChar* name,
   22192         IRTemp(*opFn)(IRTemp,IRTemp)
   22193      )
   22194 {
   22195    return dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   22196              uses_vvvv, vbi, pfx, delta, name,
   22197              Iop_INVALID, opFn, False, False );
   22198 }
   22199 
   22200 
   22201 /* Vector by scalar shift of V by the amount specified at the bottom
   22202    of E. */
   22203 static ULong dis_AVX128_shiftV_byE ( VexAbiInfo* vbi,
   22204                                      Prefix pfx, Long delta,
   22205                                      const HChar* opname, IROp op )
   22206 {
   22207    HChar   dis_buf[50];
   22208    Int     alen, size;
   22209    IRTemp  addr;
   22210    Bool    shl, shr, sar;
   22211    UChar   modrm = getUChar(delta);
   22212    UInt    rG    = gregOfRexRM(pfx,modrm);
   22213    UInt    rV    = getVexNvvvv(pfx);;
   22214    IRTemp  g0    = newTemp(Ity_V128);
   22215    IRTemp  g1    = newTemp(Ity_V128);
   22216    IRTemp  amt   = newTemp(Ity_I64);
   22217    IRTemp  amt8  = newTemp(Ity_I8);
   22218    if (epartIsReg(modrm)) {
   22219       UInt rE = eregOfRexRM(pfx,modrm);
   22220       assign( amt, getXMMRegLane64(rE, 0) );
   22221       DIP("%s %s,%s,%s\n", opname, nameXMMReg(rE),
   22222           nameXMMReg(rV), nameXMMReg(rG) );
   22223       delta++;
   22224    } else {
   22225       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   22226       assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
   22227       DIP("%s %s,%s,%s\n", opname, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
   22228       delta += alen;
   22229    }
   22230    assign( g0, getXMMReg(rV) );
   22231    assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
   22232 
   22233    shl = shr = sar = False;
   22234    size = 0;
   22235    switch (op) {
   22236       case Iop_ShlN16x8: shl = True; size = 32; break;
   22237       case Iop_ShlN32x4: shl = True; size = 32; break;
   22238       case Iop_ShlN64x2: shl = True; size = 64; break;
   22239       case Iop_SarN16x8: sar = True; size = 16; break;
   22240       case Iop_SarN32x4: sar = True; size = 32; break;
   22241       case Iop_ShrN16x8: shr = True; size = 16; break;
   22242       case Iop_ShrN32x4: shr = True; size = 32; break;
   22243       case Iop_ShrN64x2: shr = True; size = 64; break;
   22244       default: vassert(0);
   22245    }
   22246 
   22247    if (shl || shr) {
   22248      assign(
   22249         g1,
   22250         IRExpr_ITE(
   22251            binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
   22252            binop(op, mkexpr(g0), mkexpr(amt8)),
   22253            mkV128(0x0000)
   22254         )
   22255      );
   22256    } else
   22257    if (sar) {
   22258      assign(
   22259         g1,
   22260         IRExpr_ITE(
   22261            binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
   22262            binop(op, mkexpr(g0), mkexpr(amt8)),
   22263            binop(op, mkexpr(g0), mkU8(size-1))
   22264         )
   22265      );
   22266    } else {
   22267       vassert(0);
   22268    }
   22269 
   22270    putYMMRegLoAndZU( rG, mkexpr(g1) );
   22271    return delta;
   22272 }
   22273 
   22274 
   22275 /* Vector by scalar shift of V by the amount specified at the bottom
   22276    of E. */
   22277 static ULong dis_AVX256_shiftV_byE ( VexAbiInfo* vbi,
   22278                                      Prefix pfx, Long delta,
   22279                                      const HChar* opname, IROp op )
   22280 {
   22281    HChar   dis_buf[50];
   22282    Int     alen, size;
   22283    IRTemp  addr;
   22284    Bool    shl, shr, sar;
   22285    UChar   modrm = getUChar(delta);
   22286    UInt    rG    = gregOfRexRM(pfx,modrm);
   22287    UInt    rV    = getVexNvvvv(pfx);;
   22288    IRTemp  g0    = newTemp(Ity_V256);
   22289    IRTemp  g1    = newTemp(Ity_V256);
   22290    IRTemp  amt   = newTemp(Ity_I64);
   22291    IRTemp  amt8  = newTemp(Ity_I8);
   22292    if (epartIsReg(modrm)) {
   22293       UInt rE = eregOfRexRM(pfx,modrm);
   22294       assign( amt, getXMMRegLane64(rE, 0) );
   22295       DIP("%s %s,%s,%s\n", opname, nameXMMReg(rE),
   22296           nameYMMReg(rV), nameYMMReg(rG) );
   22297       delta++;
   22298    } else {
   22299       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   22300       assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
   22301       DIP("%s %s,%s,%s\n", opname, dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
   22302       delta += alen;
   22303    }
   22304    assign( g0, getYMMReg(rV) );
   22305    assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
   22306 
   22307    shl = shr = sar = False;
   22308    size = 0;
   22309    switch (op) {
   22310       case Iop_ShlN16x16: shl = True; size = 32; break;
   22311       case Iop_ShlN32x8:  shl = True; size = 32; break;
   22312       case Iop_ShlN64x4:  shl = True; size = 64; break;
   22313       case Iop_SarN16x16: sar = True; size = 16; break;
   22314       case Iop_SarN32x8:  sar = True; size = 32; break;
   22315       case Iop_ShrN16x16: shr = True; size = 16; break;
   22316       case Iop_ShrN32x8:  shr = True; size = 32; break;
   22317       case Iop_ShrN64x4:  shr = True; size = 64; break;
   22318       default: vassert(0);
   22319    }
   22320 
   22321    if (shl || shr) {
   22322      assign(
   22323         g1,
   22324         IRExpr_ITE(
   22325            binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
   22326            binop(op, mkexpr(g0), mkexpr(amt8)),
   22327            binop(Iop_V128HLtoV256, mkV128(0), mkV128(0))
   22328         )
   22329      );
   22330    } else
   22331    if (sar) {
   22332      assign(
   22333         g1,
   22334         IRExpr_ITE(
   22335            binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
   22336            binop(op, mkexpr(g0), mkexpr(amt8)),
   22337            binop(op, mkexpr(g0), mkU8(size-1))
   22338         )
   22339      );
   22340    } else {
   22341       vassert(0);
   22342    }
   22343 
   22344    putYMMReg( rG, mkexpr(g1) );
   22345    return delta;
   22346 }
   22347 
   22348 
/* Vector by vector shift of V by the amount specified at the bottom
   of E.  Vector by vector shifts are defined for all shift amounts,
   so not using Iop_S*x* here (and SSE2 doesn't support variable shifts
   anyway).  Each lane of V is shifted by the corresponding lane of E;
   an out-of-range amount yields 0 for logical shifts and full
   sign-fill (shift by size-1) for Iop_Sar32.  Handles both the
   128-bit and 256-bit forms, selected by isYMM. */
static ULong dis_AVX_var_shiftV_byE ( VexAbiInfo* vbi,
                                      Prefix pfx, Long delta,
                                      const HChar* opname, IROp op, Bool isYMM )
{
   HChar   dis_buf[50];
   Int     alen, size, i;
   IRTemp  addr;
   UChar   modrm = getUChar(delta);
   UInt    rG    = gregOfRexRM(pfx,modrm);
   UInt    rV    = getVexNvvvv(pfx);;
   IRTemp  sV    = isYMM ? newTemp(Ity_V256) : newTemp(Ity_V128);
   IRTemp  amt   = isYMM ? newTemp(Ity_V256) : newTemp(Ity_V128);
   IRTemp  amts[8], sVs[8], res[8];
   /* Fetch the per-lane shift amounts: the whole E operand. */
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( amt, isYMM ? getYMMReg(rE) : getXMMReg(rE) );
      if (isYMM) {
         DIP("%s %s,%s,%s\n", opname, nameYMMReg(rE),
             nameYMMReg(rV), nameYMMReg(rG) );
      } else {
         DIP("%s %s,%s,%s\n", opname, nameXMMReg(rE),
             nameXMMReg(rV), nameXMMReg(rG) );
      }
      delta++;
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( amt, loadLE(isYMM ? Ity_V256 : Ity_V128, mkexpr(addr)) );
      if (isYMM) {
         DIP("%s %s,%s,%s\n", opname, dis_buf, nameYMMReg(rV),
             nameYMMReg(rG) );
      } else {
         DIP("%s %s,%s,%s\n", opname, dis_buf, nameXMMReg(rV),
             nameXMMReg(rG) );
      }
      delta += alen;
   }
   assign( sV, isYMM ? getYMMReg(rV) : getXMMReg(rV) );

   /* Lane width in bits, derived from the scalar shift op. */
   size = 0;
   switch (op) {
      case Iop_Shl32: size = 32; break;
      case Iop_Shl64: size = 64; break;
      case Iop_Sar32: size = 32; break;
      case Iop_Shr32: size = 32; break;
      case Iop_Shr64: size = 64; break;
      default: vassert(0);
   }

   /* Break both vectors into scalar lanes.  Unused slots stay
      IRTemp_INVALID so the per-lane loop below can skip them. */
   for (i = 0; i < 8; i++) {
      sVs[i] = IRTemp_INVALID;
      amts[i] = IRTemp_INVALID;
   }
   switch (size) {
      case 32:
         if (isYMM) {
            breakupV256to32s( sV, &sVs[7], &sVs[6], &sVs[5], &sVs[4],
                                  &sVs[3], &sVs[2], &sVs[1], &sVs[0] );
            breakupV256to32s( amt, &amts[7], &amts[6], &amts[5], &amts[4],
                                   &amts[3], &amts[2], &amts[1], &amts[0] );
         } else {
            breakupV128to32s( sV, &sVs[3], &sVs[2], &sVs[1], &sVs[0] );
            breakupV128to32s( amt, &amts[3], &amts[2], &amts[1], &amts[0] );
        }
         break;
      case 64:
         if (isYMM) {
            breakupV256to64s( sV, &sVs[3], &sVs[2], &sVs[1], &sVs[0] );
            breakupV256to64s( amt, &amts[3], &amts[2], &amts[1], &amts[0] );
         } else {
            breakupV128to64s( sV, &sVs[1], &sVs[0] );
            breakupV128to64s( amt, &amts[1], &amts[0] );
         }
         break;
      default: vassert(0);
   }
   /* Shift each valid lane by its own amount.  In-range amounts use
      the scalar op; out-of-range amounts give sign-fill for Sar32
      and zero otherwise. */
   for (i = 0; i < 8; i++)
      if (sVs[i] != IRTemp_INVALID) {
         res[i] = size == 32 ? newTemp(Ity_I32) : newTemp(Ity_I64);
         assign( res[i],
                 IRExpr_ITE(
                    binop(size == 32 ? Iop_CmpLT32U : Iop_CmpLT64U,
                          mkexpr(amts[i]),
                          size == 32 ? mkU32(size) : mkU64(size)),
                    binop(op, mkexpr(sVs[i]),
                               unop(size == 32 ? Iop_32to8 : Iop_64to8,
                                    mkexpr(amts[i]))),
                    op == Iop_Sar32 ? binop(op, mkexpr(sVs[i]), mkU8(size-1))
                                    : size == 32 ? mkU32(0) : mkU64(0)
         ));
      }
   /* Write the lanes back.  For the 128-bit form the upper YMM
      lanes are zeroed (VEX semantics). */
   switch (size) {
      case 32:
         for (i = 0; i < 8; i++)
            putYMMRegLane32( rG, i, (i < 4 || isYMM)
                                    ? mkexpr(res[i]) : mkU32(0) );
         break;
      case 64:
         for (i = 0; i < 4; i++)
            putYMMRegLane64( rG, i, (i < 2 || isYMM)
                                    ? mkexpr(res[i]) : mkU64(0) );
         break;
      default: vassert(0);
   }

   return delta;
}
   22459 
   22460 
   22461 /* Vector by scalar shift of E into V, by an immediate byte.  Modified
   22462    version of dis_SSE_shiftE_imm. */
   22463 static
   22464 Long dis_AVX128_shiftE_to_V_imm( Prefix pfx,
   22465                                  Long delta, const HChar* opname, IROp op )
   22466 {
   22467    Bool    shl, shr, sar;
   22468    UChar   rm   = getUChar(delta);
   22469    IRTemp  e0   = newTemp(Ity_V128);
   22470    IRTemp  e1   = newTemp(Ity_V128);
   22471    UInt    rD   = getVexNvvvv(pfx);
   22472    UChar   amt, size;
   22473    vassert(epartIsReg(rm));
   22474    vassert(gregLO3ofRM(rm) == 2
   22475            || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
   22476    amt = getUChar(delta+1);
   22477    delta += 2;
   22478    DIP("%s $%d,%s,%s\n", opname,
   22479                          (Int)amt,
   22480                          nameXMMReg(eregOfRexRM(pfx,rm)),
   22481                          nameXMMReg(rD));
   22482    assign( e0, getXMMReg(eregOfRexRM(pfx,rm)) );
   22483 
   22484    shl = shr = sar = False;
   22485    size = 0;
   22486    switch (op) {
   22487       case Iop_ShlN16x8: shl = True; size = 16; break;
   22488       case Iop_ShlN32x4: shl = True; size = 32; break;
   22489       case Iop_ShlN64x2: shl = True; size = 64; break;
   22490       case Iop_SarN16x8: sar = True; size = 16; break;
   22491       case Iop_SarN32x4: sar = True; size = 32; break;
   22492       case Iop_ShrN16x8: shr = True; size = 16; break;
   22493       case Iop_ShrN32x4: shr = True; size = 32; break;
   22494       case Iop_ShrN64x2: shr = True; size = 64; break;
   22495       default: vassert(0);
   22496    }
   22497 
   22498    if (shl || shr) {
   22499      assign( e1, amt >= size
   22500                     ? mkV128(0x0000)
   22501                     : binop(op, mkexpr(e0), mkU8(amt))
   22502      );
   22503    } else
   22504    if (sar) {
   22505      assign( e1, amt >= size
   22506                     ? binop(op, mkexpr(e0), mkU8(size-1))
   22507                     : binop(op, mkexpr(e0), mkU8(amt))
   22508      );
   22509    } else {
   22510       vassert(0);
   22511    }
   22512 
   22513    putYMMRegLoAndZU( rD, mkexpr(e1) );
   22514    return delta;
   22515 }
   22516 
   22517 
   22518 /* Vector by scalar shift of E into V, by an immediate byte.  Modified
   22519    version of dis_AVX128_shiftE_to_V_imm. */
   22520 static
   22521 Long dis_AVX256_shiftE_to_V_imm( Prefix pfx,
   22522                                  Long delta, const HChar* opname, IROp op )
   22523 {
   22524    Bool    shl, shr, sar;
   22525    UChar   rm   = getUChar(delta);
   22526    IRTemp  e0   = newTemp(Ity_V256);
   22527    IRTemp  e1   = newTemp(Ity_V256);
   22528    UInt    rD   = getVexNvvvv(pfx);
   22529    UChar   amt, size;
   22530    vassert(epartIsReg(rm));
   22531    vassert(gregLO3ofRM(rm) == 2
   22532            || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
   22533    amt = getUChar(delta+1);
   22534    delta += 2;
   22535    DIP("%s $%d,%s,%s\n", opname,
   22536                          (Int)amt,
   22537                          nameYMMReg(eregOfRexRM(pfx,rm)),
   22538                          nameYMMReg(rD));
   22539    assign( e0, getYMMReg(eregOfRexRM(pfx,rm)) );
   22540 
   22541    shl = shr = sar = False;
   22542    size = 0;
   22543    switch (op) {
   22544       case Iop_ShlN16x16: shl = True; size = 16; break;
   22545       case Iop_ShlN32x8:  shl = True; size = 32; break;
   22546       case Iop_ShlN64x4:  shl = True; size = 64; break;
   22547       case Iop_SarN16x16: sar = True; size = 16; break;
   22548       case Iop_SarN32x8:  sar = True; size = 32; break;
   22549       case Iop_ShrN16x16: shr = True; size = 16; break;
   22550       case Iop_ShrN32x8:  shr = True; size = 32; break;
   22551       case Iop_ShrN64x4:  shr = True; size = 64; break;
   22552       default: vassert(0);
   22553    }
   22554 
   22555 
   22556    if (shl || shr) {
   22557      assign( e1, amt >= size
   22558                     ? binop(Iop_V128HLtoV256, mkV128(0), mkV128(0))
   22559                     : binop(op, mkexpr(e0), mkU8(amt))
   22560      );
   22561    } else
   22562    if (sar) {
   22563      assign( e1, amt >= size
   22564                     ? binop(op, mkexpr(e0), mkU8(size-1))
   22565                     : binop(op, mkexpr(e0), mkU8(amt))
   22566      );
   22567    } else {
   22568       vassert(0);
   22569    }
   22570 
   22571    putYMMReg( rD, mkexpr(e1) );
   22572    return delta;
   22573 }
   22574 
   22575 
   22576 /* Lower 64-bit lane only AVX128 binary operation:
   22577    G[63:0]    = V[63:0] `op` E[63:0]
   22578    G[127:64]  = V[127:64]
   22579    G[255:128] = 0.
   22580    The specified op must be of the 64F0x2 kind, so that it
   22581    copies the upper half of the left operand to the result.
   22582 */
   22583 static Long dis_AVX128_E_V_to_G_lo64 ( /*OUT*/Bool* uses_vvvv,
   22584                                        VexAbiInfo* vbi,
   22585                                        Prefix pfx, Long delta,
   22586                                        const HChar* opname, IROp op )
   22587 {
   22588    HChar   dis_buf[50];
   22589    Int     alen;
   22590    IRTemp  addr;
   22591    UChar   rm    = getUChar(delta);
   22592    UInt    rG    = gregOfRexRM(pfx,rm);
   22593    UInt    rV    = getVexNvvvv(pfx);
   22594    IRExpr* vpart = getXMMReg(rV);
   22595    if (epartIsReg(rm)) {
   22596       UInt rE = eregOfRexRM(pfx,rm);
   22597       putXMMReg( rG, binop(op, vpart, getXMMReg(rE)) );
   22598       DIP("%s %s,%s,%s\n", opname,
   22599           nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   22600       delta = delta+1;
   22601    } else {
   22602       /* We can only do a 64-bit memory read, so the upper half of the
   22603          E operand needs to be made simply of zeroes. */
   22604       IRTemp epart = newTemp(Ity_V128);
   22605       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   22606       assign( epart, unop( Iop_64UtoV128,
   22607                            loadLE(Ity_I64, mkexpr(addr))) );
   22608       putXMMReg( rG, binop(op, vpart, mkexpr(epart)) );
   22609       DIP("%s %s,%s,%s\n", opname,
   22610           dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   22611       delta = delta+alen;
   22612    }
   22613    putYMMRegLane128( rG, 1, mkV128(0) );
   22614    *uses_vvvv = True;
   22615    return delta;
   22616 }
   22617 
   22618 
   22619 /* Lower 64-bit lane only AVX128 unary operation:
   22620    G[63:0]    = op(E[63:0])
   22621    G[127:64]  = V[127:64]
   22622    G[255:128] = 0
   22623    The specified op must be of the 64F0x2 kind, so that it
   22624    copies the upper half of the operand to the result.
   22625 */
   22626 static Long dis_AVX128_E_V_to_G_lo64_unary ( /*OUT*/Bool* uses_vvvv,
   22627                                              VexAbiInfo* vbi,
   22628                                              Prefix pfx, Long delta,
   22629                                              const HChar* opname, IROp op )
   22630 {
   22631    HChar   dis_buf[50];
   22632    Int     alen;
   22633    IRTemp  addr;
   22634    UChar   rm  = getUChar(delta);
   22635    UInt    rG  = gregOfRexRM(pfx,rm);
   22636    UInt    rV  = getVexNvvvv(pfx);
   22637    IRTemp  e64 = newTemp(Ity_I64);
   22638 
   22639    /* Fetch E[63:0] */
   22640    if (epartIsReg(rm)) {
   22641       UInt rE = eregOfRexRM(pfx,rm);
   22642       assign(e64, getXMMRegLane64(rE, 0));
   22643       DIP("%s %s,%s,%s\n", opname,
   22644           nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   22645       delta += 1;
   22646    } else {
   22647       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   22648       assign(e64, loadLE(Ity_I64, mkexpr(addr)));
   22649       DIP("%s %s,%s,%s\n", opname,
   22650           dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   22651       delta += alen;
   22652    }
   22653 
   22654    /* Create a value 'arg' as V[127:64]++E[63:0] */
   22655    IRTemp arg = newTemp(Ity_V128);
   22656    assign(arg,
   22657           binop(Iop_SetV128lo64,
   22658                 getXMMReg(rV), mkexpr(e64)));
   22659    /* and apply op to it */
   22660    putYMMRegLoAndZU( rG, unop(op, mkexpr(arg)) );
   22661    *uses_vvvv = True;
   22662    return delta;
   22663 }
   22664 
   22665 
   22666 /* Lower 32-bit lane only AVX128 unary operation:
   22667    G[31:0]    = op(E[31:0])
   22668    G[127:32]  = V[127:32]
   22669    G[255:128] = 0
   22670    The specified op must be of the 32F0x4 kind, so that it
   22671    copies the upper 3/4 of the operand to the result.
   22672 */
   22673 static Long dis_AVX128_E_V_to_G_lo32_unary ( /*OUT*/Bool* uses_vvvv,
   22674                                              VexAbiInfo* vbi,
   22675                                              Prefix pfx, Long delta,
   22676                                              const HChar* opname, IROp op )
   22677 {
   22678    HChar   dis_buf[50];
   22679    Int     alen;
   22680    IRTemp  addr;
   22681    UChar   rm  = getUChar(delta);
   22682    UInt    rG  = gregOfRexRM(pfx,rm);
   22683    UInt    rV  = getVexNvvvv(pfx);
   22684    IRTemp  e32 = newTemp(Ity_I32);
   22685 
   22686    /* Fetch E[31:0] */
   22687    if (epartIsReg(rm)) {
   22688       UInt rE = eregOfRexRM(pfx,rm);
   22689       assign(e32, getXMMRegLane32(rE, 0));
   22690       DIP("%s %s,%s,%s\n", opname,
   22691           nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   22692       delta += 1;
   22693    } else {
   22694       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   22695       assign(e32, loadLE(Ity_I32, mkexpr(addr)));
   22696       DIP("%s %s,%s,%s\n", opname,
   22697           dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   22698       delta += alen;
   22699    }
   22700 
   22701    /* Create a value 'arg' as V[127:32]++E[31:0] */
   22702    IRTemp arg = newTemp(Ity_V128);
   22703    assign(arg,
   22704           binop(Iop_SetV128lo32,
   22705                 getXMMReg(rV), mkexpr(e32)));
   22706    /* and apply op to it */
   22707    putYMMRegLoAndZU( rG, unop(op, mkexpr(arg)) );
   22708    *uses_vvvv = True;
   22709    return delta;
   22710 }
   22711 
   22712 
   22713 /* Lower 32-bit lane only AVX128 binary operation:
   22714    G[31:0]    = V[31:0] `op` E[31:0]
   22715    G[127:32]  = V[127:32]
   22716    G[255:128] = 0.
   22717    The specified op must be of the 32F0x4 kind, so that it
   22718    copies the upper 3/4 of the left operand to the result.
   22719 */
   22720 static Long dis_AVX128_E_V_to_G_lo32 ( /*OUT*/Bool* uses_vvvv,
   22721                                        VexAbiInfo* vbi,
   22722                                        Prefix pfx, Long delta,
   22723                                        const HChar* opname, IROp op )
   22724 {
   22725    HChar   dis_buf[50];
   22726    Int     alen;
   22727    IRTemp  addr;
   22728    UChar   rm    = getUChar(delta);
   22729    UInt    rG    = gregOfRexRM(pfx,rm);
   22730    UInt    rV    = getVexNvvvv(pfx);
   22731    IRExpr* vpart = getXMMReg(rV);
   22732    if (epartIsReg(rm)) {
   22733       UInt rE = eregOfRexRM(pfx,rm);
   22734       putXMMReg( rG, binop(op, vpart, getXMMReg(rE)) );
   22735       DIP("%s %s,%s,%s\n", opname,
   22736           nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   22737       delta = delta+1;
   22738    } else {
   22739       /* We can only do a 32-bit memory read, so the upper 3/4 of the
   22740          E operand needs to be made simply of zeroes. */
   22741       IRTemp epart = newTemp(Ity_V128);
   22742       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   22743       assign( epart, unop( Iop_32UtoV128,
   22744                            loadLE(Ity_I32, mkexpr(addr))) );
   22745       putXMMReg( rG, binop(op, vpart, mkexpr(epart)) );
   22746       DIP("%s %s,%s,%s\n", opname,
   22747           dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   22748       delta = delta+alen;
   22749    }
   22750    putYMMRegLane128( rG, 1, mkV128(0) );
   22751    *uses_vvvv = True;
   22752    return delta;
   22753 }
   22754 
   22755 
   22756 /* All-lanes AVX128 binary operation:
   22757    G[127:0]   = V[127:0] `op` E[127:0]
   22758    G[255:128] = 0.
   22759 */
   22760 static Long dis_AVX128_E_V_to_G ( /*OUT*/Bool* uses_vvvv,
   22761                                   VexAbiInfo* vbi,
   22762                                   Prefix pfx, Long delta,
   22763                                   const HChar* opname, IROp op )
   22764 {
   22765    return dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   22766              uses_vvvv, vbi, pfx, delta, opname, op,
   22767              NULL, False/*!invertLeftArg*/, False/*!swapArgs*/
   22768    );
   22769 }
   22770 
   22771 
/* Handles AVX128 32F/64F comparisons.  A derivative of
   dis_SSEcmp_E_to_G.  It can fail, in which case it returns the
   original delta to indicate failure.  all_lanes selects the packed
   (VCMPPS/VCMPPD-style) form; otherwise only the bottom lane is
   compared, with the remaining lanes taken from V.  sz is 4 for
   32F and 8 for 64F lanes. */
static
Long dis_AVX128_cmp_V_E_to_G ( /*OUT*/Bool* uses_vvvv,
                               VexAbiInfo* vbi,
                               Prefix pfx, Long delta,
                               const HChar* opname, Bool all_lanes, Int sz )
{
   vassert(sz == 4 || sz == 8);
   Long    deltaIN = delta;
   HChar   dis_buf[50];
   Int     alen;
   UInt    imm8;
   IRTemp  addr;
   Bool    preSwap = False;        /* swap argL/argR before comparing */
   IROp    op      = Iop_INVALID;  /* IR comparison selected by imm8 */
   Bool    postNot = False;        /* negate the comparison result */
   IRTemp  plain   = newTemp(Ity_V128);
   UChar   rm      = getUChar(delta);
   UInt    rG      = gregOfRexRM(pfx, rm);
   UInt    rV      = getVexNvvvv(pfx);
   IRTemp argL     = newTemp(Ity_V128);
   IRTemp argR     = newTemp(Ity_V128);

   assign(argL, getXMMReg(rV));
   if (epartIsReg(rm)) {
      imm8 = getUChar(delta+1);
      /* Map imm8 to an IR comparison, possibly with an operand swap
         and/or result negation; unsupported encodings fail. */
      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
      if (!ok) return deltaIN; /* FAIL */
      UInt rE = eregOfRexRM(pfx,rm);
      assign(argR, getXMMReg(rE));
      delta += 1+1;
      DIP("%s $%d,%s,%s,%s\n",
          opname, (Int)imm8,
          nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
      imm8 = getUChar(delta+alen);
      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
      if (!ok) return deltaIN; /* FAIL */
      /* One-lane forms only read 4 or 8 bytes from memory; the rest
         of the 128-bit value is zero-filled. */
      assign(argR,
             all_lanes   ? loadLE(Ity_V128, mkexpr(addr))
             : sz == 8   ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
             : /*sz==4*/   unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr))));
      delta += alen+1;
      DIP("%s $%d,%s,%s,%s\n",
          opname, (Int)imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   }

   /* The raw comparison result, before any postNot fixup. */
   assign(plain, preSwap ? binop(op, mkexpr(argR), mkexpr(argL))
                         : binop(op, mkexpr(argL), mkexpr(argR)));

   if (all_lanes) {
      /* This is simple: just invert the result, if necessary, and
         have done. */
      if (postNot) {
         putYMMRegLoAndZU( rG, unop(Iop_NotV128, mkexpr(plain)) );
      } else {
         putYMMRegLoAndZU( rG, mkexpr(plain) );
      }
   }
   else
   if (!preSwap) {
      /* More complex.  It's a one-lane-only, hence need to possibly
         invert only that one lane.  But at least the other lanes are
         correctly "in" the result, having been copied from the left
         operand (argL). */
      if (postNot) {
         /* XORing with a mask that covers only the bottom lane
            inverts just that lane. */
         IRExpr* mask = mkV128(sz==4 ? 0x000F : 0x00FF);
         putYMMRegLoAndZU( rG, binop(Iop_XorV128, mkexpr(plain),
                                                  mask) );
      } else {
         putYMMRegLoAndZU( rG, mkexpr(plain) );
      }
   }
   else {
      /* This is the most complex case.  One-lane-only, but the args
         were swapped.  So we have to possibly invert the bottom lane,
         and (definitely) we have to copy the upper lane(s) from argL
         since, due to the swapping, what's currently there is from
         argR, which is not correct. */
      IRTemp res     = newTemp(Ity_V128);
      IRTemp mask    = newTemp(Ity_V128);
      IRTemp notMask = newTemp(Ity_V128);
      /* mask selects the bottom lane; notMask selects the rest. */
      assign(mask,    mkV128(sz==4 ? 0x000F : 0x00FF));
      assign(notMask, mkV128(sz==4 ? 0xFFF0 : 0xFF00));
      if (postNot) {
         assign(res,
                binop(Iop_OrV128,
                      binop(Iop_AndV128,
                            unop(Iop_NotV128, mkexpr(plain)),
                            mkexpr(mask)),
                      binop(Iop_AndV128, mkexpr(argL), mkexpr(notMask))));
      } else {
         assign(res,
                binop(Iop_OrV128,
                      binop(Iop_AndV128,
                            mkexpr(plain),
                            mkexpr(mask)),
                      binop(Iop_AndV128, mkexpr(argL), mkexpr(notMask))));
      }
      putYMMRegLoAndZU( rG, mkexpr(res) );
   }

   *uses_vvvv = True;
   return delta;
}
   22880 
   22881 
   22882 /* Handles AVX256 32F/64F comparisons.  A derivative of
   22883    dis_SSEcmp_E_to_G.  It can fail, in which case it returns the
   22884    original delta to indicate failure. */
static
Long dis_AVX256_cmp_V_E_to_G ( /*OUT*/Bool* uses_vvvv,
                               VexAbiInfo* vbi,
                               Prefix pfx, Long delta,
                               const HChar* opname, Int sz )
{
   /* sz is the lane width in bytes: 4 for 32F comparisons, 8 for
      64F comparisons. */
   vassert(sz == 4 || sz == 8);
   Long    deltaIN = delta;  /* saved so we can signal failure */
   HChar   dis_buf[50];
   Int     alen;
   UInt    imm8;
   IRTemp  addr;
   Bool    preSwap = False;
   IROp    op      = Iop_INVALID;
   Bool    postNot = False;
   IRTemp  plain   = newTemp(Ity_V256);
   UChar   rm      = getUChar(delta);
   UInt    rG      = gregOfRexRM(pfx, rm);
   UInt    rV      = getVexNvvvv(pfx);
   IRTemp argL     = newTemp(Ity_V256);
   IRTemp argR     = newTemp(Ity_V256);
   IRTemp argLhi   = IRTemp_INVALID;
   IRTemp argLlo   = IRTemp_INVALID;
   IRTemp argRhi   = IRTemp_INVALID;
   IRTemp argRlo   = IRTemp_INVALID;

   assign(argL, getYMMReg(rV));
   if (epartIsReg(rm)) {
      /* Register form: the imm8 condition byte follows the modrm
         byte.  findSSECmpOp maps imm8 to an IROp plus optional
         pre-swap / post-negate adjustments. */
      imm8 = getUChar(delta+1);
      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8,
                             True/*all_lanes*/, sz);
      if (!ok) return deltaIN; /* FAIL */
      UInt rE = eregOfRexRM(pfx,rm);
      assign(argR, getYMMReg(rE));
      delta += 1+1;
      DIP("%s $%d,%s,%s,%s\n",
          opname, (Int)imm8,
          nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
   } else {
      /* Memory form: the imm8 condition byte follows the
         addressing-mode bytes (hence the '1' extra-byte hint to
         disAMode). */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
      imm8 = getUChar(delta+alen);
      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8,
                             True/*all_lanes*/, sz);
      if (!ok) return deltaIN; /* FAIL */
      assign(argR, loadLE(Ity_V256, mkexpr(addr)) );
      delta += alen+1;
      DIP("%s $%d,%s,%s,%s\n",
          opname, (Int)imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
   }

   /* Do the 256-bit comparison as two 128-bit halves, honouring
      any operand swap requested by findSSECmpOp. */
   breakupV256toV128s( preSwap ? argR : argL, &argLhi, &argLlo );
   breakupV256toV128s( preSwap ? argL : argR, &argRhi, &argRlo );
   assign(plain, binop( Iop_V128HLtoV256,
                        binop(op, mkexpr(argLhi), mkexpr(argRhi)),
                        binop(op, mkexpr(argLlo), mkexpr(argRlo)) ) );

   /* This is simple: just invert the result, if necessary, and
      have done. */
   if (postNot) {
      putYMMReg( rG, unop(Iop_NotV256, mkexpr(plain)) );
   } else {
      putYMMReg( rG, mkexpr(plain) );
   }

   *uses_vvvv = True;
   return delta;
}
   22952 
   22953 
/* Handles AVX128 unary E-to-G all-lanes operations, where the result
   is computed by a caller-supplied IR generator function (opFn). */
   22955 static
   22956 Long dis_AVX128_E_to_G_unary ( /*OUT*/Bool* uses_vvvv,
   22957                                VexAbiInfo* vbi,
   22958                                Prefix pfx, Long delta,
   22959                                const HChar* opname,
   22960                                IRTemp (*opFn)(IRTemp) )
   22961 {
   22962    HChar  dis_buf[50];
   22963    Int    alen;
   22964    IRTemp addr;
   22965    IRTemp res  = newTemp(Ity_V128);
   22966    IRTemp arg  = newTemp(Ity_V128);
   22967    UChar  rm   = getUChar(delta);
   22968    UInt   rG   = gregOfRexRM(pfx, rm);
   22969    if (epartIsReg(rm)) {
   22970       UInt rE = eregOfRexRM(pfx,rm);
   22971       assign(arg, getXMMReg(rE));
   22972       delta += 1;
   22973       DIP("%s %s,%s\n", opname, nameXMMReg(rE), nameXMMReg(rG));
   22974    } else {
   22975       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   22976       assign(arg, loadLE(Ity_V128, mkexpr(addr)));
   22977       delta += alen;
   22978       DIP("%s %s,%s\n", opname, dis_buf, nameXMMReg(rG));
   22979    }
   22980    res = opFn(arg);
   22981    putYMMRegLoAndZU( rG, mkexpr(res) );
   22982    *uses_vvvv = False;
   22983    return delta;
   22984 }
   22985 
   22986 
/* Handles AVX128 unary E-to-G all-lanes operations, where the
   operation is expressed directly as a single IROp. */
   22988 static
   22989 Long dis_AVX128_E_to_G_unary_all ( /*OUT*/Bool* uses_vvvv,
   22990                                    VexAbiInfo* vbi,
   22991                                    Prefix pfx, Long delta,
   22992                                    const HChar* opname, IROp op )
   22993 {
   22994    HChar  dis_buf[50];
   22995    Int    alen;
   22996    IRTemp addr;
   22997    IRTemp arg  = newTemp(Ity_V128);
   22998    UChar  rm   = getUChar(delta);
   22999    UInt   rG   = gregOfRexRM(pfx, rm);
   23000    if (epartIsReg(rm)) {
   23001       UInt rE = eregOfRexRM(pfx,rm);
   23002       assign(arg, getXMMReg(rE));
   23003       delta += 1;
   23004       DIP("%s %s,%s\n", opname, nameXMMReg(rE), nameXMMReg(rG));
   23005    } else {
   23006       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23007       assign(arg, loadLE(Ity_V128, mkexpr(addr)));
   23008       delta += alen;
   23009       DIP("%s %s,%s\n", opname, dis_buf, nameXMMReg(rG));
   23010    }
   23011    putYMMRegLoAndZU( rG, unop(op, mkexpr(arg)) );
   23012    *uses_vvvv = False;
   23013    return delta;
   23014 }
   23015 
   23016 
   23017 /* FIXME: common up with the _128_ version above? */
static
Long dis_VEX_NDS_256_AnySimdPfx_0F_WIG (
        /*OUT*/Bool* uses_vvvv, VexAbiInfo* vbi,
        Prefix pfx, Long delta, const HChar* name,
        /* The actual operation.  Use either 'op' or 'opfn',
           but not both. */
        IROp op, IRTemp(*opFn)(IRTemp,IRTemp),
        Bool invertLeftArg,
        Bool swapArgs
     )
{
   UChar  modrm = getUChar(delta);
   UInt   rD    = gregOfRexRM(pfx, modrm);
   UInt   rSL   = getVexNvvvv(pfx);   /* left source: the vvvv register */
   IRTemp tSL   = newTemp(Ity_V256);
   IRTemp tSR   = newTemp(Ity_V256);  /* right source: E (reg or mem) */
   IRTemp addr  = IRTemp_INVALID;
   HChar  dis_buf[50];
   Int    alen  = 0;
   vassert(1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*WIG?*/);

   /* Optionally complement the left argument on the way in. */
   assign(tSL, invertLeftArg ? unop(Iop_NotV256, getYMMReg(rSL))
                             : getYMMReg(rSL));

   if (epartIsReg(modrm)) {
      UInt rSR = eregOfRexRM(pfx, modrm);
      delta += 1;
      assign(tSR, getYMMReg(rSR));
      DIP("%s %s,%s,%s\n",
          name, nameYMMReg(rSR), nameYMMReg(rSL), nameYMMReg(rD));
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      delta += alen;
      assign(tSR, loadLE(Ity_V256, mkexpr(addr)));
      DIP("%s %s,%s,%s\n",
          name, dis_buf, nameYMMReg(rSL), nameYMMReg(rD));
   }

   IRTemp res = IRTemp_INVALID;
   if (op != Iop_INVALID) {
      /* Simple-IROp case.  Some ops additionally take a rounding
         mode as their first argument. */
      vassert(opFn == NULL);
      res = newTemp(Ity_V256);
      if (requiresRMode(op)) {
         IRTemp rm = newTemp(Ity_I32);
         assign(rm, get_FAKE_roundingmode()); /* XXXROUNDINGFIXME */
         assign(res, swapArgs
                        ? triop(op, mkexpr(rm), mkexpr(tSR), mkexpr(tSL))
                        : triop(op, mkexpr(rm), mkexpr(tSL), mkexpr(tSR)));
      } else {
         assign(res, swapArgs
                        ? binop(op, mkexpr(tSR), mkexpr(tSL))
                        : binop(op, mkexpr(tSL), mkexpr(tSR)));
      }
   } else {
      /* Generator-function case: opFn builds the result itself. */
      vassert(opFn != NULL);
      res = swapArgs ? opFn(tSR, tSL) : opFn(tSL, tSR);
   }

   putYMMReg(rD, mkexpr(res));

   *uses_vvvv = True;
   return delta;
}
   23081 
   23082 
   23083 /* All-lanes AVX256 binary operation:
   23084    G[255:0] = V[255:0] `op` E[255:0]
   23085 */
   23086 static Long dis_AVX256_E_V_to_G ( /*OUT*/Bool* uses_vvvv,
   23087                                   VexAbiInfo* vbi,
   23088                                   Prefix pfx, Long delta,
   23089                                   const HChar* opname, IROp op )
   23090 {
   23091    return dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
   23092              uses_vvvv, vbi, pfx, delta, opname, op,
   23093              NULL, False/*!invertLeftArg*/, False/*!swapArgs*/
   23094    );
   23095 }
   23096 
   23097 
   23098 /* Handle a VEX_NDS_256_66_0F_WIG (3-addr) insn, with a simple IROp
   23099    for the operation, no inversion of the left arg, and no swapping of
   23100    args. */
   23101 static
   23102 Long dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple (
   23103         /*OUT*/Bool* uses_vvvv, VexAbiInfo* vbi,
   23104         Prefix pfx, Long delta, const HChar* name,
   23105         IROp op
   23106      )
   23107 {
   23108    return dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
   23109              uses_vvvv, vbi, pfx, delta, name, op, NULL, False, False);
   23110 }
   23111 
   23112 
   23113 /* Handle a VEX_NDS_256_66_0F_WIG (3-addr) insn, using the given IR
   23114    generator to compute the result, no inversion of the left
   23115    arg, and no swapping of args. */
   23116 static
   23117 Long dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex (
   23118         /*OUT*/Bool* uses_vvvv, VexAbiInfo* vbi,
   23119         Prefix pfx, Long delta, const HChar* name,
   23120         IRTemp(*opFn)(IRTemp,IRTemp)
   23121      )
   23122 {
   23123    return dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
   23124              uses_vvvv, vbi, pfx, delta, name,
   23125              Iop_INVALID, opFn, False, False );
   23126 }
   23127 
   23128 
/* Handles AVX256 unary E-to-G all-lanes operations, where the result
   is computed by a caller-supplied IR generator function (opFn). */
   23130 static
   23131 Long dis_AVX256_E_to_G_unary ( /*OUT*/Bool* uses_vvvv,
   23132                                VexAbiInfo* vbi,
   23133                                Prefix pfx, Long delta,
   23134                                const HChar* opname,
   23135                                IRTemp (*opFn)(IRTemp) )
   23136 {
   23137    HChar  dis_buf[50];
   23138    Int    alen;
   23139    IRTemp addr;
   23140    IRTemp res  = newTemp(Ity_V256);
   23141    IRTemp arg  = newTemp(Ity_V256);
   23142    UChar  rm   = getUChar(delta);
   23143    UInt   rG   = gregOfRexRM(pfx, rm);
   23144    if (epartIsReg(rm)) {
   23145       UInt rE = eregOfRexRM(pfx,rm);
   23146       assign(arg, getYMMReg(rE));
   23147       delta += 1;
   23148       DIP("%s %s,%s\n", opname, nameYMMReg(rE), nameYMMReg(rG));
   23149    } else {
   23150       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23151       assign(arg, loadLE(Ity_V256, mkexpr(addr)));
   23152       delta += alen;
   23153       DIP("%s %s,%s\n", opname, dis_buf, nameYMMReg(rG));
   23154    }
   23155    res = opFn(arg);
   23156    putYMMReg( rG, mkexpr(res) );
   23157    *uses_vvvv = False;
   23158    return delta;
   23159 }
   23160 
   23161 
/* Handles AVX256 unary E-to-G all-lanes operations, where the
   operation is expressed directly as a single IROp. */
   23163 static
   23164 Long dis_AVX256_E_to_G_unary_all ( /*OUT*/Bool* uses_vvvv,
   23165                                    VexAbiInfo* vbi,
   23166                                    Prefix pfx, Long delta,
   23167                                    const HChar* opname, IROp op )
   23168 {
   23169    HChar  dis_buf[50];
   23170    Int    alen;
   23171    IRTemp addr;
   23172    IRTemp arg  = newTemp(Ity_V256);
   23173    UChar  rm   = getUChar(delta);
   23174    UInt   rG   = gregOfRexRM(pfx, rm);
   23175    if (epartIsReg(rm)) {
   23176       UInt rE = eregOfRexRM(pfx,rm);
   23177       assign(arg, getYMMReg(rE));
   23178       delta += 1;
   23179       DIP("%s %s,%s\n", opname, nameYMMReg(rE), nameYMMReg(rG));
   23180    } else {
   23181       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23182       assign(arg, loadLE(Ity_V256, mkexpr(addr)));
   23183       delta += alen;
   23184       DIP("%s %s,%s\n", opname, dis_buf, nameYMMReg(rG));
   23185    }
   23186    putYMMReg( rG, unop(op, mkexpr(arg)) );
   23187    *uses_vvvv = False;
   23188    return delta;
   23189 }
   23190 
   23191 
   23192 /* The use of ReinterpF64asI64 is ugly.  Surely could do better if we
   23193    had a variant of Iop_64x4toV256 that took F64s as args instead. */
static Long dis_CVTDQ2PD_256 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta )
{
   /* VCVTDQ2PD (256-bit form): widen 4 x I32 from xmm/m128 into
      4 x F64 in ymm. */
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp sV    = newTemp(Ity_V128);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( sV, getXMMReg(rE) );
      delta += 1;
      DIP("vcvtdq2pd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      DIP("vcvtdq2pd %s,%s\n", dis_buf, nameYMMReg(rG) );
   }
   /* Split the source into its four 32-bit lanes, widen each one
      I32 -> F64 (exact, so no rounding mode is needed), and
      reassemble.  The ReinterpF64asI64 step exists only to satisfy
      Iop_64x4toV256's I64 argument types -- see the comment above
      this function. */
   IRTemp s3, s2, s1, s0;
   s3 = s2 = s1 = s0 = IRTemp_INVALID;
   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   IRExpr* res
      = IRExpr_Qop(
           Iop_64x4toV256,
           unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s3))),
           unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s2))),
           unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s1))),
           unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s0)))
        );
   putYMMReg(rG, res);
   return delta;
}
   23228 
   23229 
static Long dis_CVTPD2PS_256 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta )
{
   /* VCVTPD2PS (256-bit source): narrow 4 x F64 from ymm/m256 to
      4 x F32 in the low half of xmm, zeroing the upper 128 bits of
      the destination ymm. */
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp argV  = newTemp(Ity_V256);
   IRTemp rmode = newTemp(Ity_I32);
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getYMMReg(rE) );
      delta += 1;
      DIP("vcvtpd2psy %s,%s\n", nameYMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
      delta += alen;
      DIP("vcvtpd2psy %s,%s\n", dis_buf, nameXMMReg(rG) );
   }

   /* F64 -> F32 can lose precision, so the current SSE rounding
      mode is honoured. */
   assign( rmode, get_sse_roundingmode() );
   IRTemp t3, t2, t1, t0;
   t3 = t2 = t1 = t0 = IRTemp_INVALID;
   breakupV256to64s( argV, &t3, &t2, &t1, &t0 );
   /* Each 64-bit chunk is reinterpreted as F64 and narrowed. */
#  define CVT(_t)  binop( Iop_F64toF32, mkexpr(rmode), \
                          unop(Iop_ReinterpI64asF64, mkexpr(_t)) )
   putXMMRegLane32F( rG, 3, CVT(t3) );
   putXMMRegLane32F( rG, 2, CVT(t2) );
   putXMMRegLane32F( rG, 1, CVT(t1) );
   putXMMRegLane32F( rG, 0, CVT(t0) );
#  undef CVT
   putYMMRegLane128( rG, 1, mkV128(0) );
   return delta;
}
   23266 
   23267 
   23268 static IRTemp math_VPUNPCK_YMM ( IRTemp tL, IRType tR, IROp op )
   23269 {
   23270    IRTemp tLhi, tLlo, tRhi, tRlo;
   23271    tLhi = tLlo = tRhi = tRlo = IRTemp_INVALID;
   23272    IRTemp res = newTemp(Ity_V256);
   23273    breakupV256toV128s( tL, &tLhi, &tLlo );
   23274    breakupV256toV128s( tR, &tRhi, &tRlo );
   23275    assign( res, binop( Iop_V128HLtoV256,
   23276                        binop( op, mkexpr(tRhi), mkexpr(tLhi) ),
   23277                        binop( op, mkexpr(tRlo), mkexpr(tLlo) ) ) );
   23278    return res;
   23279 }
   23280 
   23281 
   23282 static IRTemp math_VPUNPCKLBW_YMM ( IRTemp tL, IRTemp tR )
   23283 {
   23284    return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO8x16 );
   23285 }
   23286 
   23287 
   23288 static IRTemp math_VPUNPCKLWD_YMM ( IRTemp tL, IRTemp tR )
   23289 {
   23290    return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO16x8 );
   23291 }
   23292 
   23293 
   23294 static IRTemp math_VPUNPCKLDQ_YMM ( IRTemp tL, IRTemp tR )
   23295 {
   23296    return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO32x4 );
   23297 }
   23298 
   23299 
   23300 static IRTemp math_VPUNPCKLQDQ_YMM ( IRTemp tL, IRTemp tR )
   23301 {
   23302    return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO64x2 );
   23303 }
   23304 
   23305 
   23306 static IRTemp math_VPUNPCKHBW_YMM ( IRTemp tL, IRTemp tR )
   23307 {
   23308    return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI8x16 );
   23309 }
   23310 
   23311 
   23312 static IRTemp math_VPUNPCKHWD_YMM ( IRTemp tL, IRTemp tR )
   23313 {
   23314    return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI16x8 );
   23315 }
   23316 
   23317 
   23318 static IRTemp math_VPUNPCKHDQ_YMM ( IRTemp tL, IRTemp tR )
   23319 {
   23320    return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI32x4 );
   23321 }
   23322 
   23323 
   23324 static IRTemp math_VPUNPCKHQDQ_YMM ( IRTemp tL, IRTemp tR )
   23325 {
   23326    return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI64x2 );
   23327 }
   23328 
   23329 
   23330 static IRTemp math_VPACKSSWB_YMM ( IRTemp tL, IRTemp tR )
   23331 {
   23332    return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin16Sto8Sx16 );
   23333 }
   23334 
   23335 
   23336 static IRTemp math_VPACKUSWB_YMM ( IRTemp tL, IRTemp tR )
   23337 {
   23338    return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin16Sto8Ux16 );
   23339 }
   23340 
   23341 
   23342 static IRTemp math_VPACKSSDW_YMM ( IRTemp tL, IRTemp tR )
   23343 {
   23344    return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin32Sto16Sx8 );
   23345 }
   23346 
   23347 
   23348 static IRTemp math_VPACKUSDW_YMM ( IRTemp tL, IRTemp tR )
   23349 {
   23350    return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin32Sto16Ux8 );
   23351 }
   23352 
   23353 
   23354 __attribute__((noinline))
   23355 static
   23356 Long dis_ESC_0F__VEX (
   23357         /*MB_OUT*/DisResult* dres,
   23358         /*OUT*/   Bool*      uses_vvvv,
   23359         Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
   23360         Bool         resteerCisOk,
   23361         void*        callback_opaque,
   23362         VexArchInfo* archinfo,
   23363         VexAbiInfo*  vbi,
   23364         Prefix pfx, Int sz, Long deltaIN
   23365      )
   23366 {
   23367    IRTemp addr  = IRTemp_INVALID;
   23368    Int    alen  = 0;
   23369    HChar  dis_buf[50];
   23370    Long   delta = deltaIN;
   23371    UChar  opc   = getUChar(delta);
   23372    delta++;
   23373    *uses_vvvv = False;
   23374 
   23375    switch (opc) {
   23376 
   23377    case 0x10:
   23378       /* VMOVSD m64, xmm1 = VEX.LIG.F2.0F.WIG 10 /r */
   23379       /* Move 64 bits from E (mem only) to G (lo half xmm).
   23380          Bits 255-64 of the dest are zeroed out. */
   23381       if (haveF2no66noF3(pfx) && !epartIsReg(getUChar(delta))) {
   23382          UChar modrm = getUChar(delta);
   23383          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23384          UInt   rG   = gregOfRexRM(pfx,modrm);
   23385          IRTemp z128 = newTemp(Ity_V128);
   23386          assign(z128, mkV128(0));
   23387          putXMMReg( rG, mkexpr(z128) );
   23388          /* FIXME: ALIGNMENT CHECK? */
   23389          putXMMRegLane64( rG, 0, loadLE(Ity_I64, mkexpr(addr)) );
   23390          putYMMRegLane128( rG, 1, mkexpr(z128) );
   23391          DIP("vmovsd %s,%s\n", dis_buf, nameXMMReg(rG));
   23392          delta += alen;
   23393          goto decode_success;
   23394       }
   23395       /* VMOVSD xmm3, xmm2, xmm1 = VEX.LIG.F2.0F.WIG 10 /r */
   23396       /* Reg form. */
   23397       if (haveF2no66noF3(pfx) && epartIsReg(getUChar(delta))) {
   23398          UChar modrm = getUChar(delta);
   23399          UInt  rG    = gregOfRexRM(pfx, modrm);
   23400          UInt  rE    = eregOfRexRM(pfx, modrm);
   23401          UInt  rV    = getVexNvvvv(pfx);
   23402          delta++;
   23403          DIP("vmovsd %s,%s,%s\n",
   23404              nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   23405          IRTemp res = newTemp(Ity_V128);
   23406          assign(res, binop(Iop_64HLtoV128,
   23407                            getXMMRegLane64(rV, 1),
   23408                            getXMMRegLane64(rE, 0)));
   23409          putYMMRegLoAndZU(rG, mkexpr(res));
   23410          *uses_vvvv = True;
   23411          goto decode_success;
   23412       }
   23413       /* VMOVSS m32, xmm1 = VEX.LIG.F3.0F.WIG 10 /r */
   23414       /* Move 32 bits from E (mem only) to G (lo half xmm).
   23415          Bits 255-32 of the dest are zeroed out. */
   23416       if (haveF3no66noF2(pfx) && !epartIsReg(getUChar(delta))) {
   23417          UChar modrm = getUChar(delta);
   23418          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23419          UInt   rG   = gregOfRexRM(pfx,modrm);
   23420          IRTemp z128 = newTemp(Ity_V128);
   23421          assign(z128, mkV128(0));
   23422          putXMMReg( rG, mkexpr(z128) );
   23423          /* FIXME: ALIGNMENT CHECK? */
   23424          putXMMRegLane32( rG, 0, loadLE(Ity_I32, mkexpr(addr)) );
   23425          putYMMRegLane128( rG, 1, mkexpr(z128) );
   23426          DIP("vmovss %s,%s\n", dis_buf, nameXMMReg(rG));
   23427          delta += alen;
   23428          goto decode_success;
   23429       }
   23430       /* VMOVSS xmm3, xmm2, xmm1 = VEX.LIG.F3.0F.WIG 10 /r */
   23431       /* Reg form. */
   23432       if (haveF3no66noF2(pfx) && epartIsReg(getUChar(delta))) {
   23433          UChar modrm = getUChar(delta);
   23434          UInt  rG    = gregOfRexRM(pfx, modrm);
   23435          UInt  rE    = eregOfRexRM(pfx, modrm);
   23436          UInt  rV    = getVexNvvvv(pfx);
   23437          delta++;
   23438          DIP("vmovss %s,%s,%s\n",
   23439              nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   23440          IRTemp res = newTemp(Ity_V128);
   23441          assign( res, binop( Iop_64HLtoV128,
   23442                              getXMMRegLane64(rV, 1),
   23443                              binop(Iop_32HLto64,
   23444                                    getXMMRegLane32(rV, 1),
   23445                                    getXMMRegLane32(rE, 0)) ) );
   23446          putYMMRegLoAndZU(rG, mkexpr(res));
   23447          *uses_vvvv = True;
   23448          goto decode_success;
   23449       }
   23450       /* VMOVUPD xmm2/m128, xmm1 = VEX.128.66.0F.WIG 10 /r */
   23451       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   23452          UChar modrm = getUChar(delta);
   23453          UInt  rG    = gregOfRexRM(pfx, modrm);
   23454          if (epartIsReg(modrm)) {
   23455             UInt rE = eregOfRexRM(pfx,modrm);
   23456             putYMMRegLoAndZU( rG, getXMMReg( rE ));
   23457             DIP("vmovupd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   23458             delta += 1;
   23459          } else {
   23460             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23461             putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
   23462             DIP("vmovupd %s,%s\n", dis_buf, nameXMMReg(rG));
   23463             delta += alen;
   23464          }
   23465          goto decode_success;
   23466       }
   23467       /* VMOVUPD ymm2/m256, ymm1 = VEX.256.66.0F.WIG 10 /r */
   23468       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   23469          UChar modrm = getUChar(delta);
   23470          UInt  rG    = gregOfRexRM(pfx, modrm);
   23471          if (epartIsReg(modrm)) {
   23472             UInt rE = eregOfRexRM(pfx,modrm);
   23473             putYMMReg( rG, getYMMReg( rE ));
   23474             DIP("vmovupd %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
   23475             delta += 1;
   23476          } else {
   23477             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23478             putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
   23479             DIP("vmovupd %s,%s\n", dis_buf, nameYMMReg(rG));
   23480             delta += alen;
   23481          }
   23482          goto decode_success;
   23483       }
   23484       /* VMOVUPS xmm2/m128, xmm1 = VEX.128.0F.WIG 10 /r */
   23485       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   23486          UChar modrm = getUChar(delta);
   23487          UInt  rG    = gregOfRexRM(pfx, modrm);
   23488          if (epartIsReg(modrm)) {
   23489             UInt rE = eregOfRexRM(pfx,modrm);
   23490             putYMMRegLoAndZU( rG, getXMMReg( rE ));
   23491             DIP("vmovups %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   23492             delta += 1;
   23493          } else {
   23494             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23495             putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
   23496             DIP("vmovups %s,%s\n", dis_buf, nameXMMReg(rG));
   23497             delta += alen;
   23498          }
   23499          goto decode_success;
   23500       }
   23501       /* VMOVUPS ymm2/m256, ymm1 = VEX.256.0F.WIG 10 /r */
   23502       if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   23503          UChar modrm = getUChar(delta);
   23504          UInt  rG    = gregOfRexRM(pfx, modrm);
   23505          if (epartIsReg(modrm)) {
   23506             UInt rE = eregOfRexRM(pfx,modrm);
   23507             putYMMReg( rG, getYMMReg( rE ));
   23508             DIP("vmovups %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
   23509             delta += 1;
   23510          } else {
   23511             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23512             putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
   23513             DIP("vmovups %s,%s\n", dis_buf, nameYMMReg(rG));
   23514             delta += alen;
   23515          }
   23516          goto decode_success;
   23517       }
   23518       break;
   23519 
   23520    case 0x11:
   23521       /* VMOVSD xmm1, m64 = VEX.LIG.F2.0F.WIG 11 /r */
   23522       /* Move 64 bits from G (low half xmm) to mem only. */
   23523       if (haveF2no66noF3(pfx) && !epartIsReg(getUChar(delta))) {
   23524          UChar modrm = getUChar(delta);
   23525          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23526          UInt   rG   = gregOfRexRM(pfx,modrm);
   23527          /* FIXME: ALIGNMENT CHECK? */
   23528          storeLE( mkexpr(addr), getXMMRegLane64(rG, 0));
   23529          DIP("vmovsd %s,%s\n", nameXMMReg(rG), dis_buf);
   23530          delta += alen;
   23531          goto decode_success;
   23532       }
   23533       /* VMOVSD xmm3, xmm2, xmm1 = VEX.LIG.F2.0F.WIG 11 /r */
   23534       /* Reg form. */
   23535       if (haveF2no66noF3(pfx) && epartIsReg(getUChar(delta))) {
   23536          UChar modrm = getUChar(delta);
   23537          UInt  rG    = gregOfRexRM(pfx, modrm);
   23538          UInt  rE    = eregOfRexRM(pfx, modrm);
   23539          UInt  rV    = getVexNvvvv(pfx);
   23540          delta++;
   23541          DIP("vmovsd %s,%s,%s\n",
   23542              nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   23543          IRTemp res = newTemp(Ity_V128);
   23544          assign(res, binop(Iop_64HLtoV128,
   23545                            getXMMRegLane64(rV, 1),
   23546                            getXMMRegLane64(rE, 0)));
   23547          putYMMRegLoAndZU(rG, mkexpr(res));
   23548          *uses_vvvv = True;
   23549          goto decode_success;
   23550       }
   23551       /* VMOVSS xmm1, m64 = VEX.LIG.F3.0F.WIG 11 /r */
   23552       /* Move 32 bits from G (low 1/4 xmm) to mem only. */
   23553       if (haveF3no66noF2(pfx) && !epartIsReg(getUChar(delta))) {
   23554          UChar modrm = getUChar(delta);
   23555          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23556          UInt   rG   = gregOfRexRM(pfx,modrm);
   23557          /* FIXME: ALIGNMENT CHECK? */
   23558          storeLE( mkexpr(addr), getXMMRegLane32(rG, 0));
   23559          DIP("vmovss %s,%s\n", nameXMMReg(rG), dis_buf);
   23560          delta += alen;
   23561          goto decode_success;
   23562       }
   23563       /* VMOVSS xmm3, xmm2, xmm1 = VEX.LIG.F3.0F.WIG 11 /r */
   23564       /* Reg form. */
   23565       if (haveF3no66noF2(pfx) && epartIsReg(getUChar(delta))) {
   23566          UChar modrm = getUChar(delta);
   23567          UInt  rG    = gregOfRexRM(pfx, modrm);
   23568          UInt  rE    = eregOfRexRM(pfx, modrm);
   23569          UInt  rV    = getVexNvvvv(pfx);
   23570          delta++;
   23571          DIP("vmovss %s,%s,%s\n",
   23572              nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   23573          IRTemp res = newTemp(Ity_V128);
   23574          assign( res, binop( Iop_64HLtoV128,
   23575                              getXMMRegLane64(rV, 1),
   23576                              binop(Iop_32HLto64,
   23577                                    getXMMRegLane32(rV, 1),
   23578                                    getXMMRegLane32(rE, 0)) ) );
   23579          putYMMRegLoAndZU(rG, mkexpr(res));
   23580          *uses_vvvv = True;
   23581          goto decode_success;
   23582       }
   23583       /* VMOVUPD xmm1, xmm2/m128 = VEX.128.66.0F.WIG 11 /r */
   23584       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   23585          UChar modrm = getUChar(delta);
   23586          UInt  rG    = gregOfRexRM(pfx,modrm);
   23587          if (epartIsReg(modrm)) {
   23588             UInt rE = eregOfRexRM(pfx,modrm);
   23589             putYMMRegLoAndZU( rE, getXMMReg(rG) );
   23590             DIP("vmovupd %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
   23591             delta += 1;
   23592          } else {
   23593             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   23594             storeLE( mkexpr(addr), getXMMReg(rG) );
   23595             DIP("vmovupd %s,%s\n", nameXMMReg(rG), dis_buf);
   23596             delta += alen;
   23597          }
   23598          goto decode_success;
   23599       }
      /* VMOVUPD ymm1, ymm2/m256 = VEX.256.66.0F.WIG 11 /r */
      /* Opcode 0x11 is the store-direction (G -> E) form of the
         unaligned moves, so the memory forms emit no alignment check. */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            /* reg-reg form: copy the full 256-bit register. */
            putYMMReg( rE, getYMMReg(rG) );
            DIP("vmovupd %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            storeLE( mkexpr(addr), getYMMReg(rG) );
            DIP("vmovupd %s,%s\n", nameYMMReg(rG), dis_buf);
            delta += alen;
         }
         goto decode_success;
      }
      /* VMOVUPS xmm1, xmm2/m128 = VEX.128.0F.WIG 11 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            /* VEX.128 reg-reg dest: write low 128 bits and zero the
               upper half of the destination YMM register. */
            putYMMRegLoAndZU( rE, getXMMReg(rG) );
            DIP("vmovups %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            storeLE( mkexpr(addr), getXMMReg(rG) );
            DIP("vmovups %s,%s\n", nameXMMReg(rG), dis_buf);
            delta += alen;
         }
         goto decode_success;
      }
      /* VMOVUPS ymm1, ymm2/m256 = VEX.256.0F.WIG 11 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            putYMMReg( rE, getYMMReg(rG) );
            DIP("vmovups %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            storeLE( mkexpr(addr), getYMMReg(rG) );
            DIP("vmovups %s,%s\n", nameYMMReg(rG), dis_buf);
            delta += alen;
         }
         goto decode_success;
      }
      break;
   23652 
   case 0x12:
      /* Opcode 0x12 is heavily overloaded; the VEX prefix bytes select
         among VMOVDDUP, VMOVHLPS, VMOVLPS/VMOVLPD and VMOVSLDUP. */
      /* VMOVDDUP xmm2/m64, xmm1 = VEX.128.F2.0F.WIG /12 r */
      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_MOVDDUP_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VMOVDDUP ymm2/m256, ymm1 = VEX.256.F2.0F.WIG /12 r */
      if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_MOVDDUP_256( vbi, pfx, delta );
         goto decode_success;
      }
      /* VMOVHLPS xmm3, xmm2, xmm1 = VEX.NDS.128.0F.WIG 12 /r */
      /* Insn only exists in reg form */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         UInt  rE    = eregOfRexRM(pfx, modrm);
         UInt  rV    = getVexNvvvv(pfx);
         delta++;
         DIP("vmovhlps %s,%s,%s\n",
             nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
         IRTemp res = newTemp(Ity_V128);
         /* Result: hi 64 = rV's hi lane, lo 64 = rE's hi lane. */
         assign(res, binop(Iop_64HLtoV128,
                           getXMMRegLane64(rV, 1),
                           getXMMRegLane64(rE, 1)));
         /* Upper half of the destination YMM register is zeroed. */
         putYMMRegLoAndZU(rG, mkexpr(res));
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VMOVLPS m64, xmm1, xmm2 = VEX.NDS.128.0F.WIG 12 /r */
      /* Insn exists only in mem form, it appears. */
      /* VMOVLPD m64, xmm1, xmm2 = VEX.NDS.128.66.0F.WIG 12 /r */
      /* Insn exists only in mem form, it appears. */
      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
          && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         UInt  rV    = getVexNvvvv(pfx);
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         DIP("vmovlpd %s,%s,%s\n",
             dis_buf, nameXMMReg(rV), nameXMMReg(rG));
         IRTemp res = newTemp(Ity_V128);
         /* Result: hi 64 = rV's hi lane, lo 64 = the 64-bit load. */
         assign(res, binop(Iop_64HLtoV128,
                           getXMMRegLane64(rV, 1),
                           loadLE(Ity_I64, mkexpr(addr))));
         putYMMRegLoAndZU(rG, mkexpr(res));
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VMOVSLDUP xmm2/m128, xmm1 = VEX.NDS.128.F3.0F.WIG 12 /r */
      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_MOVSxDUP_128( vbi, pfx, delta, True/*isAvx*/,
                                   True/*isL*/ );
         goto decode_success;
      }
      /* VMOVSLDUP ymm2/m256, ymm1 = VEX.NDS.256.F3.0F.WIG 12 /r */
      if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_MOVSxDUP_256( vbi, pfx, delta, True/*isL*/ );
         goto decode_success;
      }
      break;
   23716 
   case 0x13:
      /* VMOVLPS xmm1, m64 = VEX.128.0F.WIG 13 /r */
      /* Insn exists only in mem form, it appears. */
      /* VMOVLPD xmm1, m64 = VEX.128.66.0F.WIG 13 /r */
      /* Insn exists only in mem form, it appears. */
      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
          && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         /* Store the low 64 bits (lane 0) of the source register. */
         storeLE( mkexpr(addr), getXMMRegLane64( rG, 0));
         DIP("vmovlpd %s,%s\n", nameXMMReg(rG), dis_buf);
         goto decode_success;
      }
      break;
   23733 
   case 0x14:
   case 0x15:
      /* Opcodes 0x14/0x15 share one decoder: opc 0x15 selects the
         "high" unpack, 0x14 the "low" one; the actual interleaving is
         done by the math_UNPCKxP[SD]_{128,256} helpers. */
      /* VUNPCKLPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 14 /r */
      /* VUNPCKHPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 15 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Bool   hi    = opc == 0x15;
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx,modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp eV    = newTemp(Ity_V128);
         IRTemp vV    = newTemp(Ity_V128);
         assign( vV, getXMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getXMMReg(rE) );
            delta += 1;
            DIP("vunpck%sps %s,%s\n", hi ? "h" : "l",
                nameXMMReg(rE), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
            delta += alen;
            DIP("vunpck%sps %s,%s\n", hi ? "h" : "l",
                dis_buf, nameXMMReg(rG));
         }
         IRTemp res = math_UNPCKxPS_128( eV, vV, hi );
         /* VEX.128 dest: zero the upper half of the YMM register. */
         putYMMRegLoAndZU( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VUNPCKLPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 14 /r */
      /* VUNPCKHPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 15 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         Bool   hi    = opc == 0x15;
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx,modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp eV    = newTemp(Ity_V256);
         IRTemp vV    = newTemp(Ity_V256);
         assign( vV, getYMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getYMMReg(rE) );
            delta += 1;
            DIP("vunpck%sps %s,%s\n", hi ? "h" : "l",
                nameYMMReg(rE), nameYMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
            delta += alen;
            DIP("vunpck%sps %s,%s\n", hi ? "h" : "l",
                dis_buf, nameYMMReg(rG));
         }
         IRTemp res = math_UNPCKxPS_256( eV, vV, hi );
         putYMMReg( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VUNPCKLPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 14 /r */
      /* VUNPCKHPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 15 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Bool   hi    = opc == 0x15;
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx,modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp eV    = newTemp(Ity_V128);
         IRTemp vV    = newTemp(Ity_V128);
         assign( vV, getXMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getXMMReg(rE) );
            delta += 1;
            DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
                nameXMMReg(rE), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
            delta += alen;
            DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
                dis_buf, nameXMMReg(rG));
         }
         IRTemp res = math_UNPCKxPD_128( eV, vV, hi );
         putYMMRegLoAndZU( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VUNPCKLPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 14 /r */
      /* VUNPCKHPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 15 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         Bool   hi    = opc == 0x15;
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx,modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp eV    = newTemp(Ity_V256);
         IRTemp vV    = newTemp(Ity_V256);
         assign( vV, getYMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getYMMReg(rE) );
            delta += 1;
            DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
                nameYMMReg(rE), nameYMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
            delta += alen;
            DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
                dis_buf, nameYMMReg(rG));
         }
         IRTemp res = math_UNPCKxPD_256( eV, vV, hi );
         putYMMReg( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   23849 
   case 0x16:
      /* VMOVLHPS xmm3, xmm2, xmm1 = VEX.NDS.128.0F.WIG 16 /r */
      /* Insn only exists in reg form */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         UInt  rE    = eregOfRexRM(pfx, modrm);
         UInt  rV    = getVexNvvvv(pfx);
         delta++;
         DIP("vmovlhps %s,%s,%s\n",
             nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
         IRTemp res = newTemp(Ity_V128);
         /* Result: hi 64 = rE's lo lane, lo 64 = rV's lo lane. */
         assign(res, binop(Iop_64HLtoV128,
                           getXMMRegLane64(rE, 0),
                           getXMMRegLane64(rV, 0)));
         /* Upper half of the destination YMM register is zeroed. */
         putYMMRegLoAndZU(rG, mkexpr(res));
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VMOVHPS m64, xmm1, xmm2 = VEX.NDS.128.0F.WIG 16 /r */
      /* Insn exists only in mem form, it appears. */
      /* VMOVHPD m64, xmm1, xmm2 = VEX.NDS.128.66.0F.WIG 16 /r */
      /* Insn exists only in mem form, it appears. */
      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
          && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         UInt  rV    = getVexNvvvv(pfx);
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         DIP("vmovhp%c %s,%s,%s\n", have66(pfx) ? 'd' : 's',
             dis_buf, nameXMMReg(rV), nameXMMReg(rG));
         IRTemp res = newTemp(Ity_V128);
         /* Result: hi 64 = the 64-bit load, lo 64 = rV's lo lane. */
         assign(res, binop(Iop_64HLtoV128,
                           loadLE(Ity_I64, mkexpr(addr)),
                           getXMMRegLane64(rV, 0)));
         putYMMRegLoAndZU(rG, mkexpr(res));
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VMOVSHDUP xmm2/m128, xmm1 = VEX.NDS.128.F3.0F.WIG 16 /r */
      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_MOVSxDUP_128( vbi, pfx, delta, True/*isAvx*/,
                                   False/*!isL*/ );
         goto decode_success;
      }
      /* VMOVSHDUP ymm2/m256, ymm1 = VEX.NDS.256.F3.0F.WIG 16 /r */
      if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_MOVSxDUP_256( vbi, pfx, delta, False/*!isL*/ );
         goto decode_success;
      }
      break;
   23903 
   case 0x17:
      /* VMOVHPS xmm1, m64 = VEX.128.0F.WIG 17 /r */
      /* Insn exists only in mem form, it appears. */
      /* VMOVHPD xmm1, m64 = VEX.128.66.0F.WIG 17 /r */
      /* Insn exists only in mem form, it appears. */
      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
          && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         /* Store the high 64 bits (lane 1) of the source register. */
         storeLE( mkexpr(addr), getXMMRegLane64( rG, 1));
         DIP("vmovhp%c %s,%s\n", have66(pfx) ? 'd' : 's',
             nameXMMReg(rG), dis_buf);
         goto decode_success;
      }
      break;
   23921 
   case 0x28:
      /* Opcode 0x28 is the load-direction (E -> G) form of the aligned
         moves; the memory forms emit an explicit alignment check which
         raises SIGSEGV on a misaligned effective address. */
      /* VMOVAPD xmm2/m128, xmm1 = VEX.128.66.0F.WIG 28 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            /* VEX.128 dest: zero the upper half of the YMM register. */
            putYMMRegLoAndZU( rG, getXMMReg( rE ));
            DIP("vmovapd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
            DIP("vmovapd %s,%s\n", dis_buf, nameXMMReg(rG));
            delta += alen;
         }
         goto decode_success;
      }
      /* VMOVAPD ymm2/m256, ymm1 = VEX.256.66.0F.WIG 28 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            putYMMReg( rG, getYMMReg( rE ));
            DIP("vmovapd %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            /* 256-bit aligned moves require 32-byte alignment. */
            gen_SEGV_if_not_32_aligned( addr );
            putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
            DIP("vmovapd %s,%s\n", dis_buf, nameYMMReg(rG));
            delta += alen;
         }
         goto decode_success;
      }
      /* VMOVAPS xmm2/m128, xmm1 = VEX.128.0F.WIG 28 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            putYMMRegLoAndZU( rG, getXMMReg( rE ));
            DIP("vmovaps %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
            DIP("vmovaps %s,%s\n", dis_buf, nameXMMReg(rG));
            delta += alen;
         }
         goto decode_success;
      }
      /* VMOVAPS ymm2/m256, ymm1 = VEX.256.0F.WIG 28 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            putYMMReg( rG, getYMMReg( rE ));
            DIP("vmovaps %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_32_aligned( addr );
            putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
            DIP("vmovaps %s,%s\n", dis_buf, nameYMMReg(rG));
            delta += alen;
         }
         goto decode_success;
      }
      break;
   23996 
   23997    case 0x29:
   23998       /* VMOVAPD xmm1, xmm2/m128 = VEX.128.66.0F.WIG 29 /r */
   23999       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24000          UChar modrm = getUChar(delta);
   24001          UInt  rG    = gregOfRexRM(pfx,modrm);
   24002          if (epartIsReg(modrm)) {
   24003             UInt rE = eregOfRexRM(pfx,modrm);
   24004             putYMMRegLoAndZU( rE, getXMMReg(rG) );
   24005             DIP("vmovapd %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
   24006             delta += 1;
   24007          } else {
   24008             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24009             gen_SEGV_if_not_16_aligned( addr );
   24010             storeLE( mkexpr(addr), getXMMReg(rG) );
   24011             DIP("vmovapd %s,%s\n", nameXMMReg(rG), dis_buf );
   24012             delta += alen;
   24013          }
   24014          goto decode_success;
   24015       }
   24016       /* VMOVAPD ymm1, ymm2/m256 = VEX.256.66.0F.WIG 29 /r */
   24017       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24018          UChar modrm = getUChar(delta);
   24019          UInt  rG    = gregOfRexRM(pfx,modrm);
   24020          if (epartIsReg(modrm)) {
   24021             UInt rE = eregOfRexRM(pfx,modrm);
   24022             putYMMReg( rE, getYMMReg(rG) );
   24023             DIP("vmovapd %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
   24024             delta += 1;
   24025          } else {
   24026             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24027             gen_SEGV_if_not_32_aligned( addr );
   24028             storeLE( mkexpr(addr), getYMMReg(rG) );
   24029             DIP("vmovapd %s,%s\n", nameYMMReg(rG), dis_buf );
   24030             delta += alen;
   24031          }
   24032          goto decode_success;
   24033       }
   24034       /* VMOVAPS xmm1, xmm2/m128 = VEX.128.0F.WIG 29 /r */
   24035       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24036          UChar modrm = getUChar(delta);
   24037          UInt  rG    = gregOfRexRM(pfx,modrm);
   24038          if (epartIsReg(modrm)) {
   24039             UInt rE = eregOfRexRM(pfx,modrm);
   24040             putYMMRegLoAndZU( rE, getXMMReg(rG) );
   24041             DIP("vmovaps %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
   24042             delta += 1;
   24043             goto decode_success;
   24044          } else {
   24045             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24046             gen_SEGV_if_not_16_aligned( addr );
   24047             storeLE( mkexpr(addr), getXMMReg(rG) );
   24048             DIP("vmovaps %s,%s\n", nameXMMReg(rG), dis_buf );
   24049             delta += alen;
   24050             goto decode_success;
   24051          }
   24052       }
   24053       /* VMOVAPS ymm1, ymm2/m256 = VEX.256.0F.WIG 29 /r */
   24054       if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24055          UChar modrm = getUChar(delta);
   24056          UInt  rG    = gregOfRexRM(pfx,modrm);
   24057          if (epartIsReg(modrm)) {
   24058             UInt rE = eregOfRexRM(pfx,modrm);
   24059             putYMMReg( rE, getYMMReg(rG) );
   24060             DIP("vmovaps %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
   24061             delta += 1;
   24062             goto decode_success;
   24063          } else {
   24064             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24065             gen_SEGV_if_not_32_aligned( addr );
   24066             storeLE( mkexpr(addr), getYMMReg(rG) );
   24067             DIP("vmovaps %s,%s\n", nameYMMReg(rG), dis_buf );
   24068             delta += alen;
   24069             goto decode_success;
   24070          }
   24071       }
   24072       break;
   24073 
   24074    case 0x2A: {
   24075       IRTemp rmode = newTemp(Ity_I32);
   24076       assign( rmode, get_sse_roundingmode() );
   24077       /* VCVTSI2SD r/m32, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.W0 2A /r */
   24078       if (haveF2no66noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
   24079          UChar  modrm = getUChar(delta);
   24080          UInt   rV    = getVexNvvvv(pfx);
   24081          UInt   rD    = gregOfRexRM(pfx, modrm);
   24082          IRTemp arg32 = newTemp(Ity_I32);
   24083          if (epartIsReg(modrm)) {
   24084             UInt rS = eregOfRexRM(pfx,modrm);
   24085             assign( arg32, getIReg32(rS) );
   24086             delta += 1;
   24087             DIP("vcvtsi2sdl %s,%s,%s\n",
   24088                 nameIReg32(rS), nameXMMReg(rV), nameXMMReg(rD));
   24089          } else {
   24090             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24091             assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
   24092             delta += alen;
   24093             DIP("vcvtsi2sdl %s,%s,%s\n",
   24094                 dis_buf, nameXMMReg(rV), nameXMMReg(rD));
   24095          }
   24096          putXMMRegLane64F( rD, 0,
   24097                            unop(Iop_I32StoF64, mkexpr(arg32)));
   24098          putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
   24099          putYMMRegLane128( rD, 1, mkV128(0) );
   24100          *uses_vvvv = True;
   24101          goto decode_success;
   24102       }
   24103       /* VCVTSI2SD r/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.W1 2A /r */
   24104       if (haveF2no66noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
   24105          UChar  modrm = getUChar(delta);
   24106          UInt   rV    = getVexNvvvv(pfx);
   24107          UInt   rD    = gregOfRexRM(pfx, modrm);
   24108          IRTemp arg64 = newTemp(Ity_I64);
   24109          if (epartIsReg(modrm)) {
   24110             UInt rS = eregOfRexRM(pfx,modrm);
   24111             assign( arg64, getIReg64(rS) );
   24112             delta += 1;
   24113             DIP("vcvtsi2sdq %s,%s,%s\n",
   24114                 nameIReg64(rS), nameXMMReg(rV), nameXMMReg(rD));
   24115          } else {
   24116             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24117             assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   24118             delta += alen;
   24119             DIP("vcvtsi2sdq %s,%s,%s\n",
   24120                 dis_buf, nameXMMReg(rV), nameXMMReg(rD));
   24121          }
   24122          putXMMRegLane64F( rD, 0,
   24123                            binop( Iop_I64StoF64,
   24124                                   get_sse_roundingmode(),
   24125                                   mkexpr(arg64)) );
   24126          putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
   24127          putYMMRegLane128( rD, 1, mkV128(0) );
   24128          *uses_vvvv = True;
   24129          goto decode_success;
   24130       }
   24131       /* VCVTSI2SS r/m64, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.W1 2A /r */
   24132       if (haveF3no66noF2(pfx) && 1==getRexW(pfx)/*W1*/) {
   24133          UChar  modrm = getUChar(delta);
   24134          UInt   rV    = getVexNvvvv(pfx);
   24135          UInt   rD    = gregOfRexRM(pfx, modrm);
   24136          IRTemp arg64 = newTemp(Ity_I64);
   24137          if (epartIsReg(modrm)) {
   24138             UInt rS = eregOfRexRM(pfx,modrm);
   24139             assign( arg64, getIReg64(rS) );
   24140             delta += 1;
   24141             DIP("vcvtsi2ssq %s,%s,%s\n",
   24142                 nameIReg64(rS), nameXMMReg(rV), nameXMMReg(rD));
   24143          } else {
   24144             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24145             assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   24146             delta += alen;
   24147             DIP("vcvtsi2ssq %s,%s,%s\n",
   24148                 dis_buf, nameXMMReg(rV), nameXMMReg(rD));
   24149          }
   24150          putXMMRegLane32F( rD, 0,
   24151                            binop(Iop_F64toF32,
   24152                                  mkexpr(rmode),
   24153                                  binop(Iop_I64StoF64, mkexpr(rmode),
   24154                                                       mkexpr(arg64)) ) );
   24155          putXMMRegLane32( rD, 1, getXMMRegLane32( rV, 1 ));
   24156          putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
   24157          putYMMRegLane128( rD, 1, mkV128(0) );
   24158          *uses_vvvv = True;
   24159          goto decode_success;
   24160       }
   24161       /* VCVTSI2SS r/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.W0 2A /r */
   24162       if (haveF3no66noF2(pfx) && 0==getRexW(pfx)/*W0*/) {
   24163          UChar  modrm = getUChar(delta);
   24164          UInt   rV    = getVexNvvvv(pfx);
   24165          UInt   rD    = gregOfRexRM(pfx, modrm);
   24166          IRTemp arg32 = newTemp(Ity_I32);
   24167          if (epartIsReg(modrm)) {
   24168             UInt rS = eregOfRexRM(pfx,modrm);
   24169             assign( arg32, getIReg32(rS) );
   24170             delta += 1;
   24171             DIP("vcvtsi2ssl %s,%s,%s\n",
   24172                 nameIReg32(rS), nameXMMReg(rV), nameXMMReg(rD));
   24173          } else {
   24174             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   24175             assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
   24176             delta += alen;
   24177             DIP("vcvtsi2ssl %s,%s,%s\n",
   24178                 dis_buf, nameXMMReg(rV), nameXMMReg(rD));
   24179          }
   24180          putXMMRegLane32F( rD, 0,
   24181                            binop(Iop_F64toF32,
   24182                                  mkexpr(rmode),
   24183                                  unop(Iop_I32StoF64, mkexpr(arg32)) ) );
   24184          putXMMRegLane32( rD, 1, getXMMRegLane32( rV, 1 ));
   24185          putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
   24186          putYMMRegLane128( rD, 1, mkV128(0) );
   24187          *uses_vvvv = True;
   24188          goto decode_success;
   24189       }
   24190       break;
   24191    }
   24192 
   case 0x2B:
      /* Non-temporal (streaming) stores.  The non-temporal hint is not
         modelled; the IR emits an ordinary aligned store, keeping only
         the architectural alignment requirement. */
      /* VMOVNTPD xmm1, m128 = VEX.128.66.0F.WIG 2B /r */
      /* VMOVNTPS xmm1, m128 = VEX.128.0F.WIG 2B /r */
      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
          && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
         UChar  modrm = getUChar(delta);
         UInt   rS    = gregOfRexRM(pfx, modrm);
         IRTemp tS    = newTemp(Ity_V128);
         assign(tS, getXMMReg(rS));
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         gen_SEGV_if_not_16_aligned(addr);
         storeLE(mkexpr(addr), mkexpr(tS));
         DIP("vmovntp%c %s,%s\n", have66(pfx) ? 'd' : 's',
             nameXMMReg(rS), dis_buf);
         goto decode_success;
      }
      /* VMOVNTPD ymm1, m256 = VEX.256.66.0F.WIG 2B /r */
      /* VMOVNTPS ymm1, m256 = VEX.256.0F.WIG 2B /r */
      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
          && 1==getVexL(pfx)/*256*/ && !epartIsReg(getUChar(delta))) {
         UChar  modrm = getUChar(delta);
         UInt   rS    = gregOfRexRM(pfx, modrm);
         IRTemp tS    = newTemp(Ity_V256);
         assign(tS, getYMMReg(rS));
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         gen_SEGV_if_not_32_aligned(addr);
         storeLE(mkexpr(addr), mkexpr(tS));
         DIP("vmovntp%c %s,%s\n", have66(pfx) ? 'd' : 's',
             nameYMMReg(rS), dis_buf);
         goto decode_success;
      }
      break;
   24227 
   case 0x2C:
      /* Truncating FP -> integer conversions; all work is delegated to
         dis_CVTxS[DS]2SI, with 'opc' distinguishing the truncating
         (0x2C) from the rounded (0x2D) variant, and the final argument
         giving the destination integer size in bytes. */
      /* VCVTTSD2SI xmm1/m32, r32 = VEX.LIG.F2.0F.W0 2C /r */
      if (haveF2no66noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
         delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
         goto decode_success;
      }
      /* VCVTTSD2SI xmm1/m64, r64 = VEX.LIG.F2.0F.W1 2C /r */
      if (haveF2no66noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
         delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
         goto decode_success;
      }
      /* VCVTTSS2SI xmm1/m32, r32 = VEX.LIG.F3.0F.W0 2C /r */
      if (haveF3no66noF2(pfx) && 0==getRexW(pfx)/*W0*/) {
         delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
         goto decode_success;
      }
      /* VCVTTSS2SI xmm1/m64, r64 = VEX.LIG.F3.0F.W1 2C /r */
      if (haveF3no66noF2(pfx) && 1==getRexW(pfx)/*W1*/) {
         delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
         goto decode_success;
      }
      break;
   24250 
   case 0x2D:
      /* Rounded FP -> integer conversions; same delegation scheme as
         case 0x2C, with 'opc' selecting the rounded behavior. */
      /* VCVTSD2SI xmm1/m32, r32 = VEX.LIG.F2.0F.W0 2D /r */
      if (haveF2no66noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
         delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
         goto decode_success;
      }
      /* VCVTSD2SI xmm1/m64, r64 = VEX.LIG.F2.0F.W1 2D /r */
      if (haveF2no66noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
         delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
         goto decode_success;
      }
      /* VCVTSS2SI xmm1/m32, r32 = VEX.LIG.F3.0F.W0 2D /r */
      if (haveF3no66noF2(pfx) && 0==getRexW(pfx)/*W0*/) {
         delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
         goto decode_success;
      }
      /* VCVTSS2SI xmm1/m64, r64 = VEX.LIG.F3.0F.W1 2D /r */
      if (haveF3no66noF2(pfx) && 1==getRexW(pfx)/*W1*/) {
         delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
         goto decode_success;
      }
      break;
   24273 
   case 0x2E:
   case 0x2F:
      /* Scalar ordered/unordered FP compares that set EFLAGS.  2E is
         the unordered (quiet) form, 2F the ordered form; the helpers
         inspect 'opc' to distinguish them.  66 prefix selects the
         double-precision variant, no prefix the single-precision one. */
      /* VUCOMISD xmm2/m64, xmm1 = VEX.LIG.66.0F.WIG 2E /r */
      /* VCOMISD  xmm2/m64, xmm1 = VEX.LIG.66.0F.WIG 2F /r */
      if (have66noF2noF3(pfx)) {
         delta = dis_COMISD( vbi, pfx, delta, True/*isAvx*/, opc );
         goto decode_success;
      }
      /* VUCOMISS xmm2/m32, xmm1 = VEX.LIG.0F.WIG 2E /r */
      /* VCOMISS xmm2/m32, xmm1  = VEX.LIG.0F.WIG 2F /r */
      if (haveNo66noF2noF3(pfx)) {
         delta = dis_COMISS( vbi, pfx, delta, True/*isAvx*/, opc );
         goto decode_success;
      }
      break;
   24289 
   case 0x50:
      /* Extract the per-lane FP sign bits into a GPR.  Register source
         only (no memory form is decoded here).  Prefix selects PD vs PS;
         VEX.L selects 128- vs 256-bit source. */
      /* VMOVMSKPD xmm2, r32 = VEX.128.66.0F.WIG 50 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_MOVMSKPD_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VMOVMSKPD ymm2, r32 = VEX.256.66.0F.WIG 50 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_MOVMSKPD_256( vbi, pfx, delta );
         goto decode_success;
      }
      /* VMOVMSKPS xmm2, r32 = VEX.128.0F.WIG 50 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_MOVMSKPS_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VMOVMSKPS ymm2, r32 = VEX.256.0F.WIG 50 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_MOVMSKPS_256( vbi, pfx, delta );
         goto decode_success;
      }
      break;
   24312 
   case 0x51:
      /* Square root: scalar (SS/SD, lowest lane only, upper lanes taken
         from the V source) and packed (PS/PD, 128- or 256-bit). */
      /* VSQRTSS xmm3/m32(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F3.0F.WIG 51 /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32_unary(
                    uses_vvvv, vbi, pfx, delta, "vsqrtss", Iop_Sqrt32F0x4 );
         goto decode_success;
      }
      /* VSQRTPS xmm2/m128(E), xmm1(G) = VEX.NDS.128.0F.WIG 51 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vsqrtps", Iop_Sqrt32Fx4 );
         goto decode_success;
      }
      /* VSQRTPS ymm2/m256(E), ymm1(G) = VEX.NDS.256.0F.WIG 51 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vsqrtps", Iop_Sqrt32Fx8 );
         goto decode_success;
      }
      /* VSQRTSD xmm3/m64(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F2.0F.WIG 51 /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64_unary(
                    uses_vvvv, vbi, pfx, delta, "vsqrtsd", Iop_Sqrt64F0x2 );
         goto decode_success;
      }
      /* VSQRTPD xmm2/m128(E), xmm1(G) = VEX.NDS.128.66.0F.WIG 51 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vsqrtpd", Iop_Sqrt64Fx2 );
         goto decode_success;
      }
      /* VSQRTPD ymm2/m256(E), ymm1(G) = VEX.NDS.256.66.0F.WIG 51 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vsqrtpd", Iop_Sqrt64Fx4 );
         goto decode_success;
      }
      break;
   24351 
   case 0x52:
      /* Approximate reciprocal square root (single precision only; no
         PD forms exist for this opcode). */
      /* VRSQRTSS xmm3/m32(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F3.0F.WIG 52 /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32_unary(
                    uses_vvvv, vbi, pfx, delta, "vrsqrtss", Iop_RSqrt32F0x4 );
         goto decode_success;
      }
      /* VRSQRTPS xmm2/m128(E), xmm1(G) = VEX.NDS.128.0F.WIG 52 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vrsqrtps", Iop_RSqrt32Fx4 );
         goto decode_success;
      }
      /* VRSQRTPS ymm2/m256(E), ymm1(G) = VEX.NDS.256.0F.WIG 52 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vrsqrtps", Iop_RSqrt32Fx8 );
         goto decode_success;
      }
      break;
   24372 
   case 0x53:
      /* Approximate reciprocal (single precision only; no PD forms
         exist for this opcode). */
      /* VRCPSS xmm3/m32(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F3.0F.WIG 53 /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32_unary(
                    uses_vvvv, vbi, pfx, delta, "vrcpss", Iop_Recip32F0x4 );
         goto decode_success;
      }
      /* VRCPPS xmm2/m128(E), xmm1(G) = VEX.NDS.128.0F.WIG 53 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vrcpps", Iop_Recip32Fx4 );
         goto decode_success;
      }
      /* VRCPPS ymm2/m256(E), ymm1(G) = VEX.NDS.256.0F.WIG 53 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary_all(
                    uses_vvvv, vbi, pfx, delta, "vrcpps", Iop_Recip32Fx8 );
         goto decode_success;
      }
      break;
   24393 
   case 0x54:
      /* Bitwise AND of packed FP values.  PD and PS forms are
         bit-identical in effect; they differ only in mnemonic. */
      /* VANDPD r/m, rV, r ::: r = rV & r/m */
      /* VANDPD = VEX.NDS.128.66.0F.WIG 54 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV128 );
         goto decode_success;
      }
      /* VANDPD r/m, rV, r ::: r = rV & r/m */
      /* VANDPD = VEX.NDS.256.66.0F.WIG 54 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV256 );
         goto decode_success;
      }
      /* VANDPS = VEX.NDS.128.0F.WIG 54 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV128 );
         goto decode_success;
      }
      /* VANDPS = VEX.NDS.256.0F.WIG 54 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV256 );
         goto decode_success;
      }
      break;
   24422 
   24423    case 0x55:
   24424       /* VANDNPD r/m, rV, r ::: r = (not rV) & r/m */
   24425       /* VANDNPD = VEX.NDS.128.66.0F.WIG 55 /r */
   24426       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24427          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   24428                     uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV128,
   24429                     NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
   24430          goto decode_success;
   24431       }
   24432       /* VANDNPD = VEX.NDS.256.66.0F.WIG 55 /r */
   24433       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24434          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
   24435                     uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV256,
   24436                     NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
   24437          goto decode_success;
   24438       }
   24439       /* VANDNPS = VEX.NDS.128.0F.WIG 55 /r */
   24440       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24441          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   24442                     uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV128,
   24443                     NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
   24444          goto decode_success;
   24445       }
   24446       /* VANDNPS = VEX.NDS.256.0F.WIG 55 /r */
   24447       if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24448          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
   24449                     uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV256,
   24450                     NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
   24451          goto decode_success;
   24452       }
   24453       break;
   24454 
   case 0x56:
      /* Bitwise OR of packed FP values.  PD and PS forms are
         bit-identical in effect; they differ only in mnemonic. */
      /* VORPD r/m, rV, r ::: r = rV | r/m */
      /* VORPD = VEX.NDS.128.66.0F.WIG 56 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vorpd", Iop_OrV128 );
         goto decode_success;
      }
      /* VORPD r/m, rV, r ::: r = rV | r/m */
      /* VORPD = VEX.NDS.256.66.0F.WIG 56 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vorpd", Iop_OrV256 );
         goto decode_success;
      }
      /* VORPS r/m, rV, r ::: r = rV | r/m */
      /* VORPS = VEX.NDS.128.0F.WIG 56 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vorps", Iop_OrV128 );
         goto decode_success;
      }
      /* VORPS r/m, rV, r ::: r = rV | r/m */
      /* VORPS = VEX.NDS.256.0F.WIG 56 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vorps", Iop_OrV256 );
         goto decode_success;
      }
      break;
   24485 
   case 0x57:
      /* Bitwise XOR of packed FP values.  PD and PS forms are
         bit-identical in effect; they differ only in mnemonic. */
      /* VXORPD r/m, rV, r ::: r = rV ^ r/m */
      /* VXORPD = VEX.NDS.128.66.0F.WIG 57 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vxorpd", Iop_XorV128 );
         goto decode_success;
      }
      /* VXORPD r/m, rV, r ::: r = rV ^ r/m */
      /* VXORPD = VEX.NDS.256.66.0F.WIG 57 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vxorpd", Iop_XorV256 );
         goto decode_success;
      }
      /* VXORPS r/m, rV, r ::: r = rV ^ r/m */
      /* VXORPS = VEX.NDS.128.0F.WIG 57 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vxorps", Iop_XorV128 );
         goto decode_success;
      }
      /* VXORPS r/m, rV, r ::: r = rV ^ r/m */
      /* VXORPS = VEX.NDS.256.0F.WIG 57 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vxorps", Iop_XorV256 );
         goto decode_success;
      }
      break;
   24516 
   case 0x58:
      /* FP addition: scalar (SD/SS, lowest lane; upper lanes from the
         V source) and packed (PS/PD, 128- or 256-bit). */
      /* VADDSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 58 /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vaddsd", Iop_Add64F0x2 );
         goto decode_success;
      }
      /* VADDSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 58 /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vaddss", Iop_Add32F0x4 );
         goto decode_success;
      }
      /* VADDPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 58 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vaddps", Iop_Add32Fx4 );
         goto decode_success;
      }
      /* VADDPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 58 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vaddps", Iop_Add32Fx8 );
         goto decode_success;
      }
      /* VADDPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 58 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vaddpd", Iop_Add64Fx2 );
         goto decode_success;
      }
      /* VADDPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 58 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vaddpd", Iop_Add64Fx4 );
         goto decode_success;
      }
      break;
   24555 
   case 0x59:
      /* FP multiplication: scalar (SD/SS) and packed (PS/PD, 128- or
         256-bit); same decode structure as 0x58 VADD. */
      /* VMULSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 59 /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vmulsd", Iop_Mul64F0x2 );
         goto decode_success;
      }
      /* VMULSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 59 /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vmulss", Iop_Mul32F0x4 );
         goto decode_success;
      }
      /* VMULPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 59 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmulps", Iop_Mul32Fx4 );
         goto decode_success;
      }
      /* VMULPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 59 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmulps", Iop_Mul32Fx8 );
         goto decode_success;
      }
      /* VMULPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 59 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmulpd", Iop_Mul64Fx2 );
         goto decode_success;
      }
      /* VMULPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 59 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmulpd", Iop_Mul64Fx4 );
         goto decode_success;
      }
      break;
   24594 
   case 0x5A:
      /* FP precision conversions.  Packed forms go through helpers;
         the scalar SD<->SS forms are decoded inline below. */
      /* VCVTPS2PD xmm2/m64, xmm1 = VEX.128.0F.WIG 5A /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTPS2PD_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VCVTPS2PD xmm2/m128, ymm1 = VEX.256.0F.WIG 5A /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTPS2PD_256( vbi, pfx, delta );
         goto decode_success;
      }
      /* VCVTPD2PS xmm2/m128, xmm1 = VEX.128.66.0F.WIG 5A /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTPD2PS_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VCVTPD2PS ymm2/m256, xmm1 = VEX.256.66.0F.WIG 5A /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTPD2PS_256( vbi, pfx, delta );
         goto decode_success;
      }
      /* VCVTSD2SS xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5A /r */
      if (haveF2no66noF3(pfx)) {
         UChar  modrm = getUChar(delta);
         UInt   rV    = getVexNvvvv(pfx);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp f64lo = newTemp(Ity_F64);
         /* Narrowing F64->F32 can lose precision, so it rounds using
            the current SSE rounding mode. */
         IRTemp rmode = newTemp(Ity_I32);
         assign( rmode, get_sse_roundingmode() );
         if (epartIsReg(modrm)) {
            UInt rS = eregOfRexRM(pfx,modrm);
            assign(f64lo, getXMMRegLane64F(rS, 0));
            delta += 1;
            DIP("vcvtsd2ss %s,%s,%s\n",
                nameXMMReg(rS), nameXMMReg(rV), nameXMMReg(rD));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign(f64lo, loadLE(Ity_F64, mkexpr(addr)) );
            delta += alen;
            DIP("vcvtsd2ss %s,%s,%s\n",
                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
         }
         /* Result goes in dst lane 0; the rest of the low 128 bits come
            from rV (32-bit lane 1, then 64-bit lane 1), and the upper
            YMM half is zeroed, per VEX.128 semantics. */
         putXMMRegLane32F( rD, 0,
                           binop( Iop_F64toF32, mkexpr(rmode),
                                                mkexpr(f64lo)) );
         putXMMRegLane32( rD, 1, getXMMRegLane32( rV, 1 ));
         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
         putYMMRegLane128( rD, 1, mkV128(0) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VCVTSS2SD xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5A /r */
      if (haveF3no66noF2(pfx)) {
         UChar  modrm = getUChar(delta);
         UInt   rV    = getVexNvvvv(pfx);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp f32lo = newTemp(Ity_F32);
         if (epartIsReg(modrm)) {
            UInt rS = eregOfRexRM(pfx,modrm);
            assign(f32lo, getXMMRegLane32F(rS, 0));
            delta += 1;
            DIP("vcvtss2sd %s,%s,%s\n",
                nameXMMReg(rS), nameXMMReg(rV), nameXMMReg(rD));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign(f32lo, loadLE(Ity_F32, mkexpr(addr)) );
            delta += alen;
            DIP("vcvtss2sd %s,%s,%s\n",
                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
         }
         /* F32->F64 is exact (every F32 is representable as F64), so no
            rounding mode is needed here. */
         putXMMRegLane64F( rD, 0,
                           unop( Iop_F32toF64, mkexpr(f32lo)) );
         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
         putYMMRegLane128( rD, 1, mkV128(0) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   24673 
   case 0x5B:
      /* Packed float <-> int32 conversions.  The 'r2zero' flag selects
         truncating (round-to-zero, the T variants) vs current-rounding
         behaviour. */
      /* VCVTPS2DQ xmm2/m128, xmm1 = VEX.128.66.0F.WIG 5B /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTxPS2DQ_128( vbi, pfx, delta,
                                    True/*isAvx*/, False/*!r2zero*/ );
         goto decode_success;
      }
      /* VCVTPS2DQ ymm2/m256, ymm1 = VEX.256.66.0F.WIG 5B /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTxPS2DQ_256( vbi, pfx, delta,
                                    False/*!r2zero*/ );
         goto decode_success;
      }
      /* VCVTTPS2DQ xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 5B /r */
      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTxPS2DQ_128( vbi, pfx, delta,
                                    True/*isAvx*/, True/*r2zero*/ );
         goto decode_success;
      }
      /* VCVTTPS2DQ ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 5B /r */
      if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTxPS2DQ_256( vbi, pfx, delta,
                                    True/*r2zero*/ );
         goto decode_success;
      }
      /* VCVTDQ2PS xmm2/m128, xmm1 = VEX.128.0F.WIG 5B /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTDQ2PS_128 ( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VCVTDQ2PS ymm2/m256, ymm1 = VEX.256.0F.WIG 5B /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTDQ2PS_256 ( vbi, pfx, delta );
         goto decode_success;
      }
      break;
   24710 
   case 0x5C:
      /* FP subtraction: scalar (SD/SS) and packed (PS/PD, 128- or
         256-bit); same decode structure as 0x58 VADD. */
      /* VSUBSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5C /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vsubsd", Iop_Sub64F0x2 );
         goto decode_success;
      }
      /* VSUBSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5C /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vsubss", Iop_Sub32F0x4 );
         goto decode_success;
      }
      /* VSUBPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5C /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vsubps", Iop_Sub32Fx4 );
         goto decode_success;
      }
      /* VSUBPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5C /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vsubps", Iop_Sub32Fx8 );
         goto decode_success;
      }
      /* VSUBPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5C /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vsubpd", Iop_Sub64Fx2 );
         goto decode_success;
      }
      /* VSUBPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5C /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vsubpd", Iop_Sub64Fx4 );
         goto decode_success;
      }
      break;
   24749 
   case 0x5D:
      /* FP minimum: scalar (SD/SS) and packed (PS/PD, 128- or 256-bit);
         same decode structure as 0x58 VADD. */
      /* VMINSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5D /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vminsd", Iop_Min64F0x2 );
         goto decode_success;
      }
      /* VMINSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5D /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vminss", Iop_Min32F0x4 );
         goto decode_success;
      }
      /* VMINPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5D /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vminps", Iop_Min32Fx4 );
         goto decode_success;
      }
      /* VMINPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5D /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vminps", Iop_Min32Fx8 );
         goto decode_success;
      }
      /* VMINPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5D /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vminpd", Iop_Min64Fx2 );
         goto decode_success;
      }
      /* VMINPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5D /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vminpd", Iop_Min64Fx4 );
         goto decode_success;
      }
      break;
   24788 
   case 0x5E:
      /* FP division: scalar (SD/SS) and packed (PS/PD, 128- or
         256-bit); same decode structure as 0x58 VADD. */
      /* VDIVSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5E /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vdivsd", Iop_Div64F0x2 );
         goto decode_success;
      }
      /* VDIVSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5E /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vdivss", Iop_Div32F0x4 );
         goto decode_success;
      }
      /* VDIVPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5E /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vdivps", Iop_Div32Fx4 );
         goto decode_success;
      }
      /* VDIVPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5E /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vdivps", Iop_Div32Fx8 );
         goto decode_success;
      }
      /* VDIVPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5E /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vdivpd", Iop_Div64Fx2 );
         goto decode_success;
      }
      /* VDIVPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5E /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vdivpd", Iop_Div64Fx4 );
         goto decode_success;
      }
      break;
   24827 
   case 0x5F:
      /* FP maximum: scalar (SD/SS) and packed (PS/PD, 128- or 256-bit);
         same decode structure as 0x58 VADD. */
      /* VMAXSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5F /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vmaxsd", Iop_Max64F0x2 );
         goto decode_success;
      }
      /* VMAXSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5F /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vmaxss", Iop_Max32F0x4 );
         goto decode_success;
      }
      /* VMAXPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5F /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmaxps", Iop_Max32Fx4 );
         goto decode_success;
      }
      /* VMAXPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5F /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmaxps", Iop_Max32Fx8 );
         goto decode_success;
      }
      /* VMAXPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5F /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmaxpd", Iop_Max64Fx2 );
         goto decode_success;
      }
      /* VMAXPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5F /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmaxpd", Iop_Max64Fx4 );
         goto decode_success;
      }
      break;
   24866 
   case 0x60:
      /* Integer byte interleave (low halves).  Note the operands are
         swapped (swapArgs=True) because Iop_InterleaveLO8x16 takes its
         arguments in the opposite order to the instruction. */
      /* VPUNPCKLBW r/m, rV, r ::: r = interleave-lo-bytes(rV, r/m) */
      /* VPUNPCKLBW = VEX.NDS.128.66.0F.WIG 60 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpcklbw",
                    Iop_InterleaveLO8x16, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPUNPCKLBW r/m, rV, r ::: r = interleave-lo-bytes(rV, r/m) */
      /* VPUNPCKLBW = VEX.NDS.256.66.0F.WIG 60 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpunpcklbw",
                    math_VPUNPCKLBW_YMM );
         goto decode_success;
      }
      break;
   24886 
   24887    case 0x61:
   24888       /* VPUNPCKLWD r/m, rV, r ::: r = interleave-lo-words(rV, r/m) */
   24889       /* VPUNPCKLWD = VEX.NDS.128.66.0F.WIG 61 /r */
   24890       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24891          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   24892                     uses_vvvv, vbi, pfx, delta, "vpunpcklwd",
   24893                     Iop_InterleaveLO16x8, NULL,
   24894                     False/*!invertLeftArg*/, True/*swapArgs*/ );
   24895          goto decode_success;
   24896       }
   24897       /* VPUNPCKLWD r/m, rV, r ::: r = interleave-lo-words(rV, r/m) */
   24898       /* VPUNPCKLWD = VEX.NDS.256.66.0F.WIG 61 /r */
   24899       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24900          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   24901                     uses_vvvv, vbi, pfx, delta, "vpunpcklwd",
   24902                     math_VPUNPCKLWD_YMM );
   24903          goto decode_success;
   24904       }
   24905       break;
   24906 
   24907    case 0x62:
   24908       /* VPUNPCKLDQ r/m, rV, r ::: r = interleave-lo-dwords(rV, r/m) */
   24909       /* VPUNPCKLDQ = VEX.NDS.128.66.0F.WIG 62 /r */
   24910       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24911          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   24912                     uses_vvvv, vbi, pfx, delta, "vpunpckldq",
   24913                     Iop_InterleaveLO32x4, NULL,
   24914                     False/*!invertLeftArg*/, True/*swapArgs*/ );
   24915          goto decode_success;
   24916       }
   24917       /* VPUNPCKLDQ r/m, rV, r ::: r = interleave-lo-dwords(rV, r/m) */
   24918       /* VPUNPCKLDQ = VEX.NDS.256.66.0F.WIG 62 /r */
   24919       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24920          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   24921                     uses_vvvv, vbi, pfx, delta, "vpunpckldq",
   24922                     math_VPUNPCKLDQ_YMM );
   24923          goto decode_success;
   24924       }
   24925       break;
   24926 
   24927    case 0x63:
   24928       /* VPACKSSWB r/m, rV, r ::: r = QNarrowBin16Sto8Sx16(rV, r/m) */
   24929       /* VPACKSSWB = VEX.NDS.128.66.0F.WIG 63 /r */
   24930       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24931          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   24932                     uses_vvvv, vbi, pfx, delta, "vpacksswb",
   24933                     Iop_QNarrowBin16Sto8Sx16, NULL,
   24934                     False/*!invertLeftArg*/, True/*swapArgs*/ );
   24935          goto decode_success;
   24936       }
   24937       /* VPACKSSWB r/m, rV, r ::: r = QNarrowBin16Sto8Sx16(rV, r/m) */
   24938       /* VPACKSSWB = VEX.NDS.256.66.0F.WIG 63 /r */
   24939       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24940          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   24941                     uses_vvvv, vbi, pfx, delta, "vpacksswb",
   24942                     math_VPACKSSWB_YMM );
   24943          goto decode_success;
   24944       }
   24945       break;
   24946 
   24947    case 0x64:
   24948       /* VPCMPGTB r/m, rV, r ::: r = rV `>s-by-8s` r/m */
   24949       /* VPCMPGTB = VEX.NDS.128.66.0F.WIG 64 /r */
   24950       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24951          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   24952                     uses_vvvv, vbi, pfx, delta, "vpcmpgtb", Iop_CmpGT8Sx16 );
   24953          goto decode_success;
   24954       }
   24955       /* VPCMPGTB r/m, rV, r ::: r = rV `>s-by-8s` r/m */
   24956       /* VPCMPGTB = VEX.NDS.256.66.0F.WIG 64 /r */
   24957       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24958          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   24959                     uses_vvvv, vbi, pfx, delta, "vpcmpgtb", Iop_CmpGT8Sx32 );
   24960          goto decode_success;
   24961       }
   24962       break;
   24963 
   24964    case 0x65:
   24965       /* VPCMPGTW r/m, rV, r ::: r = rV `>s-by-16s` r/m */
   24966       /* VPCMPGTW = VEX.NDS.128.66.0F.WIG 65 /r */
   24967       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24968          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   24969                     uses_vvvv, vbi, pfx, delta, "vpcmpgtw", Iop_CmpGT16Sx8 );
   24970          goto decode_success;
   24971       }
   24972       /* VPCMPGTW r/m, rV, r ::: r = rV `>s-by-16s` r/m */
   24973       /* VPCMPGTW = VEX.NDS.256.66.0F.WIG 65 /r */
   24974       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24975          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   24976                     uses_vvvv, vbi, pfx, delta, "vpcmpgtw", Iop_CmpGT16Sx16 );
   24977          goto decode_success;
   24978       }
   24979       break;
   24980 
   24981    case 0x66:
   24982       /* VPCMPGTD r/m, rV, r ::: r = rV `>s-by-32s` r/m */
   24983       /* VPCMPGTD = VEX.NDS.128.66.0F.WIG 66 /r */
   24984       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   24985          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   24986                     uses_vvvv, vbi, pfx, delta, "vpcmpgtd", Iop_CmpGT32Sx4 );
   24987          goto decode_success;
   24988       }
   24989       /* VPCMPGTD r/m, rV, r ::: r = rV `>s-by-32s` r/m */
   24990       /* VPCMPGTD = VEX.NDS.256.66.0F.WIG 66 /r */
   24991       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   24992          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   24993                     uses_vvvv, vbi, pfx, delta, "vpcmpgtd", Iop_CmpGT32Sx8 );
   24994          goto decode_success;
   24995       }
   24996       break;
   24997 
   24998    case 0x67:
   24999       /* VPACKUSWB r/m, rV, r ::: r = QNarrowBin16Sto8Ux16(rV, r/m) */
   25000       /* VPACKUSWB = VEX.NDS.128.66.0F.WIG 67 /r */
   25001       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25002          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   25003                     uses_vvvv, vbi, pfx, delta, "vpackuswb",
   25004                     Iop_QNarrowBin16Sto8Ux16, NULL,
   25005                     False/*!invertLeftArg*/, True/*swapArgs*/ );
   25006          goto decode_success;
   25007       }
   25008       /* VPACKUSWB r/m, rV, r ::: r = QNarrowBin16Sto8Ux16(rV, r/m) */
   25009       /* VPACKUSWB = VEX.NDS.256.66.0F.WIG 67 /r */
   25010       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25011          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   25012                     uses_vvvv, vbi, pfx, delta, "vpackuswb",
   25013                     math_VPACKUSWB_YMM );
   25014          goto decode_success;
   25015       }
   25016       break;
   25017 
   25018    case 0x68:
   25019       /* VPUNPCKHBW r/m, rV, r ::: r = interleave-hi-bytes(rV, r/m) */
    25020       /* VPUNPCKHBW = VEX.NDS.128.66.0F.WIG 68 /r */
   25021       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25022          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   25023                     uses_vvvv, vbi, pfx, delta, "vpunpckhbw",
   25024                     Iop_InterleaveHI8x16, NULL,
   25025                     False/*!invertLeftArg*/, True/*swapArgs*/ );
   25026          goto decode_success;
   25027       }
   25028       /* VPUNPCKHBW r/m, rV, r ::: r = interleave-hi-bytes(rV, r/m) */
    25029       /* VPUNPCKHBW = VEX.NDS.256.66.0F.WIG 68 /r */
   25030       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25031          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   25032                     uses_vvvv, vbi, pfx, delta, "vpunpckhbw",
   25033                     math_VPUNPCKHBW_YMM );
   25034          goto decode_success;
   25035       }
   25036       break;
   25037 
   25038    case 0x69:
   25039       /* VPUNPCKHWD r/m, rV, r ::: r = interleave-hi-words(rV, r/m) */
    25040       /* VPUNPCKHWD = VEX.NDS.128.66.0F.WIG 69 /r */
   25041       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25042          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   25043                     uses_vvvv, vbi, pfx, delta, "vpunpckhwd",
   25044                     Iop_InterleaveHI16x8, NULL,
   25045                     False/*!invertLeftArg*/, True/*swapArgs*/ );
   25046          goto decode_success;
   25047       }
   25048       /* VPUNPCKHWD r/m, rV, r ::: r = interleave-hi-words(rV, r/m) */
    25049       /* VPUNPCKHWD = VEX.NDS.256.66.0F.WIG 69 /r */
   25050       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25051          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   25052                     uses_vvvv, vbi, pfx, delta, "vpunpckhwd",
   25053                     math_VPUNPCKHWD_YMM );
   25054          goto decode_success;
   25055       }
   25056       break;
   25057 
   25058    case 0x6A:
   25059       /* VPUNPCKHDQ r/m, rV, r ::: r = interleave-hi-dwords(rV, r/m) */
   25060       /* VPUNPCKHDQ = VEX.NDS.128.66.0F.WIG 6A /r */
   25061       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25062          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   25063                     uses_vvvv, vbi, pfx, delta, "vpunpckhdq",
   25064                     Iop_InterleaveHI32x4, NULL,
   25065                     False/*!invertLeftArg*/, True/*swapArgs*/ );
   25066          goto decode_success;
   25067       }
   25068       /* VPUNPCKHDQ r/m, rV, r ::: r = interleave-hi-dwords(rV, r/m) */
   25069       /* VPUNPCKHDQ = VEX.NDS.256.66.0F.WIG 6A /r */
   25070       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25071          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   25072                     uses_vvvv, vbi, pfx, delta, "vpunpckhdq",
   25073                     math_VPUNPCKHDQ_YMM );
   25074          goto decode_success;
   25075       }
   25076       break;
   25077 
   25078    case 0x6B:
   25079       /* VPACKSSDW r/m, rV, r ::: r = QNarrowBin32Sto16Sx8(rV, r/m) */
   25080       /* VPACKSSDW = VEX.NDS.128.66.0F.WIG 6B /r */
   25081       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25082          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   25083                     uses_vvvv, vbi, pfx, delta, "vpackssdw",
   25084                     Iop_QNarrowBin32Sto16Sx8, NULL,
   25085                     False/*!invertLeftArg*/, True/*swapArgs*/ );
   25086          goto decode_success;
   25087       }
   25088       /* VPACKSSDW r/m, rV, r ::: r = QNarrowBin32Sto16Sx8(rV, r/m) */
   25089       /* VPACKSSDW = VEX.NDS.256.66.0F.WIG 6B /r */
   25090       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25091          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   25092                     uses_vvvv, vbi, pfx, delta, "vpackssdw",
   25093                     math_VPACKSSDW_YMM );
   25094          goto decode_success;
   25095       }
   25096       break;
   25097 
   25098    case 0x6C:
   25099       /* VPUNPCKLQDQ r/m, rV, r ::: r = interleave-lo-64bitses(rV, r/m) */
    25100       /* VPUNPCKLQDQ = VEX.NDS.128.66.0F.WIG 6C /r */
   25101       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25102          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   25103                     uses_vvvv, vbi, pfx, delta, "vpunpcklqdq",
   25104                     Iop_InterleaveLO64x2, NULL,
   25105                     False/*!invertLeftArg*/, True/*swapArgs*/ );
   25106          goto decode_success;
   25107       }
   25108       /* VPUNPCKLQDQ r/m, rV, r ::: r = interleave-lo-64bitses(rV, r/m) */
    25109       /* VPUNPCKLQDQ = VEX.NDS.256.66.0F.WIG 6C /r */
   25110       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25111          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   25112                     uses_vvvv, vbi, pfx, delta, "vpunpcklqdq",
   25113                     math_VPUNPCKLQDQ_YMM );
   25114          goto decode_success;
   25115       }
   25116       break;
   25117 
   25118    case 0x6D:
   25119       /* VPUNPCKHQDQ r/m, rV, r ::: r = interleave-hi-64bitses(rV, r/m) */
    25120       /* VPUNPCKHQDQ = VEX.NDS.128.66.0F.WIG 6D /r */
   25121       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25122          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
   25123                     uses_vvvv, vbi, pfx, delta, "vpunpckhqdq",
   25124                     Iop_InterleaveHI64x2, NULL,
   25125                     False/*!invertLeftArg*/, True/*swapArgs*/ );
   25126          goto decode_success;
   25127       }
   25128       /* VPUNPCKHQDQ r/m, rV, r ::: r = interleave-hi-64bitses(rV, r/m) */
    25129       /* VPUNPCKHQDQ = VEX.NDS.256.66.0F.WIG 6D /r */
   25130       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25131          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   25132                     uses_vvvv, vbi, pfx, delta, "vpunpckhqdq",
   25133                     math_VPUNPCKHQDQ_YMM );
   25134          goto decode_success;
   25135       }
   25136       break;
   25137 
   25138    case 0x6E:
   25139       /* VMOVD r32/m32, xmm1 = VEX.128.66.0F.W0 6E */
   25140       if (have66noF2noF3(pfx)
   25141           && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
   25142          vassert(sz == 2); /* even tho we are transferring 4, not 2. */
   25143          UChar modrm = getUChar(delta);
   25144          if (epartIsReg(modrm)) {
   25145             delta += 1;
   25146             putYMMRegLoAndZU(
   25147                gregOfRexRM(pfx,modrm),
   25148                unop( Iop_32UtoV128, getIReg32(eregOfRexRM(pfx,modrm)) )
   25149             );
   25150             DIP("vmovd %s, %s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
   25151                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   25152         } else {
   25153             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   25154             delta += alen;
   25155             putYMMRegLoAndZU(
   25156                gregOfRexRM(pfx,modrm),
   25157                unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)))
   25158                              );
   25159             DIP("vmovd %s, %s\n", dis_buf,
   25160                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   25161          }
   25162          goto decode_success;
   25163       }
   25164       /* VMOVQ r64/m64, xmm1 = VEX.128.66.0F.W1 6E */
   25165       if (have66noF2noF3(pfx)
   25166           && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
   25167          vassert(sz == 2); /* even tho we are transferring 8, not 2. */
   25168          UChar modrm = getUChar(delta);
   25169          if (epartIsReg(modrm)) {
   25170             delta += 1;
   25171             putYMMRegLoAndZU(
   25172                gregOfRexRM(pfx,modrm),
   25173                unop( Iop_64UtoV128, getIReg64(eregOfRexRM(pfx,modrm)) )
   25174             );
   25175             DIP("vmovq %s, %s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
   25176                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   25177         } else {
   25178             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   25179             delta += alen;
   25180             putYMMRegLoAndZU(
   25181                gregOfRexRM(pfx,modrm),
   25182                unop( Iop_64UtoV128,loadLE(Ity_I64, mkexpr(addr)))
   25183                              );
   25184             DIP("vmovq %s, %s\n", dis_buf,
   25185                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   25186          }
   25187          goto decode_success;
   25188       }
   25189       break;
   25190 
   25191    case 0x6F:
   25192       /* VMOVDQA ymm2/m256, ymm1 = VEX.256.66.0F.WIG 6F */
   25193       /* VMOVDQU ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 6F */
   25194       if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
   25195           && 1==getVexL(pfx)/*256*/) {
   25196          UChar  modrm = getUChar(delta);
   25197          UInt   rD    = gregOfRexRM(pfx, modrm);
   25198          IRTemp tD    = newTemp(Ity_V256);
   25199          Bool   isA   = have66noF2noF3(pfx);
   25200          HChar  ch    = isA ? 'a' : 'u';
   25201          if (epartIsReg(modrm)) {
   25202             UInt rS = eregOfRexRM(pfx, modrm);
   25203             delta += 1;
   25204             assign(tD, getYMMReg(rS));
   25205             DIP("vmovdq%c %s,%s\n", ch, nameYMMReg(rS), nameYMMReg(rD));
   25206          } else {
   25207             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   25208             delta += alen;
   25209             if (isA)
   25210                gen_SEGV_if_not_32_aligned(addr);
   25211             assign(tD, loadLE(Ity_V256, mkexpr(addr)));
   25212             DIP("vmovdq%c %s,%s\n", ch, dis_buf, nameYMMReg(rD));
   25213          }
   25214          putYMMReg(rD, mkexpr(tD));
   25215          goto decode_success;
   25216       }
   25217       /* VMOVDQA xmm2/m128, xmm1 = VEX.128.66.0F.WIG 6F */
   25218       /* VMOVDQU xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 6F */
   25219       if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
   25220           && 0==getVexL(pfx)/*128*/) {
   25221          UChar  modrm = getUChar(delta);
   25222          UInt   rD    = gregOfRexRM(pfx, modrm);
   25223          IRTemp tD    = newTemp(Ity_V128);
   25224          Bool   isA   = have66noF2noF3(pfx);
   25225          HChar  ch    = isA ? 'a' : 'u';
   25226          if (epartIsReg(modrm)) {
   25227             UInt rS = eregOfRexRM(pfx, modrm);
   25228             delta += 1;
   25229             assign(tD, getXMMReg(rS));
   25230             DIP("vmovdq%c %s,%s\n", ch, nameXMMReg(rS), nameXMMReg(rD));
   25231          } else {
   25232             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   25233             delta += alen;
   25234             if (isA)
   25235                gen_SEGV_if_not_16_aligned(addr);
   25236             assign(tD, loadLE(Ity_V128, mkexpr(addr)));
   25237             DIP("vmovdq%c %s,%s\n", ch, dis_buf, nameXMMReg(rD));
   25238          }
   25239          putYMMRegLoAndZU(rD, mkexpr(tD));
   25240          goto decode_success;
   25241       }
   25242       break;
   25243 
   25244    case 0x70:
   25245       /* VPSHUFD imm8, xmm2/m128, xmm1 = VEX.128.66.0F.WIG 70 /r ib */
   25246       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25247          delta = dis_PSHUFD_32x4( vbi, pfx, delta, True/*writesYmm*/);
   25248          goto decode_success;
   25249       }
   25250       /* VPSHUFD imm8, ymm2/m256, ymm1 = VEX.256.66.0F.WIG 70 /r ib */
   25251       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25252          delta = dis_PSHUFD_32x8( vbi, pfx, delta);
   25253          goto decode_success;
   25254       }
   25255       /* VPSHUFLW imm8, xmm2/m128, xmm1 = VEX.128.F2.0F.WIG 70 /r ib */
   25256       if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25257          delta = dis_PSHUFxW_128( vbi, pfx, delta,
   25258                                   True/*isAvx*/, False/*!xIsH*/ );
   25259          goto decode_success;
   25260       }
   25261       /* VPSHUFLW imm8, ymm2/m256, ymm1 = VEX.256.F2.0F.WIG 70 /r ib */
   25262       if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25263          delta = dis_PSHUFxW_256( vbi, pfx, delta, False/*!xIsH*/ );
   25264          goto decode_success;
   25265       }
   25266       /* VPSHUFHW imm8, xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 70 /r ib */
   25267       if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
   25268          delta = dis_PSHUFxW_128( vbi, pfx, delta,
   25269                                   True/*isAvx*/, True/*xIsH*/ );
   25270          goto decode_success;
   25271       }
   25272       /* VPSHUFHW imm8, ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 70 /r ib */
   25273       if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
   25274          delta = dis_PSHUFxW_256( vbi, pfx, delta, True/*xIsH*/ );
   25275          goto decode_success;
   25276       }
   25277       break;
   25278 
   25279    case 0x71:
   25280       /* VPSRLW imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 71 /2 ib */
   25281       /* VPSRAW imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 71 /4 ib */
   25282       /* VPSLLW imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 71 /6 ib */
   25283       if (have66noF2noF3(pfx)
   25284           && 0==getVexL(pfx)/*128*/
   25285           && epartIsReg(getUChar(delta))) {
   25286          if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
   25287             delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
   25288                                                 "vpsrlw", Iop_ShrN16x8 );
   25289             *uses_vvvv = True;
   25290             goto decode_success;
   25291          }
   25292          if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
   25293             delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
   25294                                                 "vpsraw", Iop_SarN16x8 );
   25295             *uses_vvvv = True;
   25296             goto decode_success;
   25297          }
   25298          if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
   25299             delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
   25300                                                 "vpsllw", Iop_ShlN16x8 );
   25301             *uses_vvvv = True;
   25302             goto decode_success;
   25303          }
   25304          /* else fall through */
   25305       }
   25306       /* VPSRLW imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 71 /2 ib */
   25307       /* VPSRAW imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 71 /4 ib */
   25308       /* VPSLLW imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 71 /6 ib */
   25309       if (have66noF2noF3(pfx)
   25310           && 1==getVexL(pfx)/*256*/
   25311           && epartIsReg(getUChar(delta))) {
   25312          if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
   25313             delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
   25314                                                 "vpsrlw", Iop_ShrN16x16 );
   25315             *uses_vvvv = True;
   25316             goto decode_success;
   25317          }
   25318          if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
   25319             delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
   25320                                                 "vpsraw", Iop_SarN16x16 );
   25321             *uses_vvvv = True;
   25322             goto decode_success;
   25323          }
   25324          if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
   25325             delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
   25326                                                 "vpsllw", Iop_ShlN16x16 );
   25327             *uses_vvvv = True;
   25328             goto decode_success;
   25329          }
   25330          /* else fall through */
   25331       }
   25332       break;
   25333 
   25334    case 0x72:
   25335       /* VPSRLD imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 72 /2 ib */
   25336       /* VPSRAD imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 72 /4 ib */
   25337       /* VPSLLD imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 72 /6 ib */
   25338       if (have66noF2noF3(pfx)
   25339           && 0==getVexL(pfx)/*128*/
   25340           && epartIsReg(getUChar(delta))) {
   25341          if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
   25342             delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
   25343                                                 "vpsrld", Iop_ShrN32x4 );
   25344             *uses_vvvv = True;
   25345             goto decode_success;
   25346          }
   25347          if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
   25348             delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
   25349                                                 "vpsrad", Iop_SarN32x4 );
   25350             *uses_vvvv = True;
   25351             goto decode_success;
   25352          }
   25353          if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
   25354             delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
   25355                                                 "vpslld", Iop_ShlN32x4 );
   25356             *uses_vvvv = True;
   25357             goto decode_success;
   25358          }
   25359          /* else fall through */
   25360       }
   25361       /* VPSRLD imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 72 /2 ib */
   25362       /* VPSRAD imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 72 /4 ib */
   25363       /* VPSLLD imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 72 /6 ib */
   25364       if (have66noF2noF3(pfx)
   25365           && 1==getVexL(pfx)/*256*/
   25366           && epartIsReg(getUChar(delta))) {
   25367          if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
   25368             delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
   25369                                                 "vpsrld", Iop_ShrN32x8 );
   25370             *uses_vvvv = True;
   25371             goto decode_success;
   25372          }
   25373          if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
   25374             delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
   25375                                                 "vpsrad", Iop_SarN32x8 );
   25376             *uses_vvvv = True;
   25377             goto decode_success;
   25378          }
   25379          if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
   25380             delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
   25381                                                 "vpslld", Iop_ShlN32x8 );
   25382             *uses_vvvv = True;
   25383             goto decode_success;
   25384          }
   25385          /* else fall through */
   25386       }
   25387       break;
   25388 
   25389    case 0x73:
   25390       /* VPSRLDQ imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /3 ib */
   25391       /* VPSLLDQ imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /7 ib */
   25392       /* VPSRLQ  imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /2 ib */
   25393       /* VPSLLQ  imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /6 ib */
   25394       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
   25395           && epartIsReg(getUChar(delta))) {
   25396          Int    rS   = eregOfRexRM(pfx,getUChar(delta));
   25397          Int    rD   = getVexNvvvv(pfx);
   25398          IRTemp vecS = newTemp(Ity_V128);
   25399          if (gregLO3ofRM(getUChar(delta)) == 3) {
   25400             Int imm = (Int)getUChar(delta+1);
   25401             DIP("vpsrldq $%d,%s,%s\n", imm, nameXMMReg(rS), nameXMMReg(rD));
   25402             delta += 2;
   25403             assign( vecS, getXMMReg(rS) );
   25404             putYMMRegLoAndZU(rD, mkexpr(math_PSRLDQ( vecS, imm )));
   25405             *uses_vvvv = True;
   25406             goto decode_success;
   25407          }
   25408          if (gregLO3ofRM(getUChar(delta)) == 7) {
   25409             Int imm = (Int)getUChar(delta+1);
   25410             DIP("vpslldq $%d,%s,%s\n", imm, nameXMMReg(rS), nameXMMReg(rD));
   25411             delta += 2;
   25412             assign( vecS, getXMMReg(rS) );
   25413             putYMMRegLoAndZU(rD, mkexpr(math_PSLLDQ( vecS, imm )));
   25414             *uses_vvvv = True;
   25415             goto decode_success;
   25416          }
   25417          if (gregLO3ofRM(getUChar(delta)) == 2) {
   25418             delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
   25419                                                 "vpsrlq", Iop_ShrN64x2 );
   25420             *uses_vvvv = True;
   25421             goto decode_success;
   25422          }
   25423          if (gregLO3ofRM(getUChar(delta)) == 6) {
   25424             delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
   25425                                                 "vpsllq", Iop_ShlN64x2 );
   25426             *uses_vvvv = True;
   25427             goto decode_success;
   25428          }
   25429          /* else fall through */
   25430       }
   25431       /* VPSRLDQ imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /3 ib */
   25432       /* VPSLLDQ imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /7 ib */
   25433       /* VPSRLQ  imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /2 ib */
   25434       /* VPSLLQ  imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /6 ib */
   25435       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
   25436           && epartIsReg(getUChar(delta))) {
   25437          Int    rS   = eregOfRexRM(pfx,getUChar(delta));
   25438          Int    rD   = getVexNvvvv(pfx);
   25439          if (gregLO3ofRM(getUChar(delta)) == 3) {
   25440             IRTemp vecS0 = newTemp(Ity_V128);
   25441             IRTemp vecS1 = newTemp(Ity_V128);
   25442             Int imm = (Int)getUChar(delta+1);
   25443             DIP("vpsrldq $%d,%s,%s\n", imm, nameYMMReg(rS), nameYMMReg(rD));
   25444             delta += 2;
   25445             assign( vecS0, getYMMRegLane128(rS, 0));
   25446             assign( vecS1, getYMMRegLane128(rS, 1));
   25447             putYMMRegLane128(rD, 0, mkexpr(math_PSRLDQ( vecS0, imm )));
   25448             putYMMRegLane128(rD, 1, mkexpr(math_PSRLDQ( vecS1, imm )));
   25449             *uses_vvvv = True;
   25450             goto decode_success;
   25451          }
   25452          if (gregLO3ofRM(getUChar(delta)) == 7) {
   25453             IRTemp vecS0 = newTemp(Ity_V128);
   25454             IRTemp vecS1 = newTemp(Ity_V128);
   25455             Int imm = (Int)getUChar(delta+1);
   25456             DIP("vpslldq $%d,%s,%s\n", imm, nameYMMReg(rS), nameYMMReg(rD));
   25457             delta += 2;
   25458             assign( vecS0, getYMMRegLane128(rS, 0));
   25459             assign( vecS1, getYMMRegLane128(rS, 1));
   25460             putYMMRegLane128(rD, 0, mkexpr(math_PSLLDQ( vecS0, imm )));
   25461             putYMMRegLane128(rD, 1, mkexpr(math_PSLLDQ( vecS1, imm )));
   25462             *uses_vvvv = True;
   25463             goto decode_success;
   25464          }
   25465          if (gregLO3ofRM(getUChar(delta)) == 2) {
   25466             delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
   25467                                                 "vpsrlq", Iop_ShrN64x4 );
   25468             *uses_vvvv = True;
   25469             goto decode_success;
   25470          }
   25471          if (gregLO3ofRM(getUChar(delta)) == 6) {
   25472             delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
   25473                                                 "vpsllq", Iop_ShlN64x4 );
   25474             *uses_vvvv = True;
   25475             goto decode_success;
   25476          }
   25477          /* else fall through */
   25478       }
   25479       break;
   25480 
   25481    case 0x74:
   25482       /* VPCMPEQB r/m, rV, r ::: r = rV `eq-by-8s` r/m */
   25483       /* VPCMPEQB = VEX.NDS.128.66.0F.WIG 74 /r */
   25484       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25485          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   25486                     uses_vvvv, vbi, pfx, delta, "vpcmpeqb", Iop_CmpEQ8x16 );
   25487          goto decode_success;
   25488       }
   25489       /* VPCMPEQB r/m, rV, r ::: r = rV `eq-by-8s` r/m */
   25490       /* VPCMPEQB = VEX.NDS.256.66.0F.WIG 74 /r */
   25491       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25492          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   25493                     uses_vvvv, vbi, pfx, delta, "vpcmpeqb", Iop_CmpEQ8x32 );
   25494          goto decode_success;
   25495       }
   25496       break;
   25497 
   25498    case 0x75:
   25499       /* VPCMPEQW r/m, rV, r ::: r = rV `eq-by-16s` r/m */
   25500       /* VPCMPEQW = VEX.NDS.128.66.0F.WIG 75 /r */
   25501       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25502          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   25503                     uses_vvvv, vbi, pfx, delta, "vpcmpeqw", Iop_CmpEQ16x8 );
   25504          goto decode_success;
   25505       }
   25506       /* VPCMPEQW r/m, rV, r ::: r = rV `eq-by-16s` r/m */
   25507       /* VPCMPEQW = VEX.NDS.256.66.0F.WIG 75 /r */
   25508       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25509          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   25510                     uses_vvvv, vbi, pfx, delta, "vpcmpeqw", Iop_CmpEQ16x16 );
   25511          goto decode_success;
   25512       }
   25513       break;
   25514 
   25515    case 0x76:
   25516       /* VPCMPEQD r/m, rV, r ::: r = rV `eq-by-32s` r/m */
   25517       /* VPCMPEQD = VEX.NDS.128.66.0F.WIG 76 /r */
   25518       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25519          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   25520                     uses_vvvv, vbi, pfx, delta, "vpcmpeqd", Iop_CmpEQ32x4 );
   25521          goto decode_success;
   25522       }
   25523       /* VPCMPEQD r/m, rV, r ::: r = rV `eq-by-32s` r/m */
   25524       /* VPCMPEQD = VEX.NDS.256.66.0F.WIG 76 /r */
   25525       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25526          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   25527                     uses_vvvv, vbi, pfx, delta, "vpcmpeqd", Iop_CmpEQ32x8 );
   25528          goto decode_success;
   25529       }
   25530       break;
   25531 
   case 0x77:
      /* VZEROUPPER = VEX.128.0F.WIG 77 */
      /* Zeroes bits 255:128 of all sixteen ymm registers; the low
         128 bits (the xmm parts) are left unchanged. */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Int i;
         IRTemp zero128 = newTemp(Ity_V128);
         assign(zero128, mkV128(0));
         for (i = 0; i < 16; i++) {
            putYMMRegLane128(i, 1, mkexpr(zero128));
         }
         DIP("vzeroupper\n");
         goto decode_success;
      }
      /* VZEROALL = VEX.256.0F.WIG 77 */
      /* Zeroes all 256 bits of all sixteen ymm registers (write zero
         to the low lane and zero the upper lane). */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         Int i;
         IRTemp zero128 = newTemp(Ity_V128);
         assign(zero128, mkV128(0));
         for (i = 0; i < 16; i++) {
            putYMMRegLoAndZU(i, mkexpr(zero128));
         }
         DIP("vzeroall\n");
         goto decode_success;
      }
      break;
   25556 
   25557    case 0x7C:
   25558    case 0x7D:
   25559       /* VHADDPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.F2.0F.WIG 7C /r */
   25560       /* VHSUBPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.F2.0F.WIG 7D /r */
   25561       if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25562          IRTemp sV     = newTemp(Ity_V128);
   25563          IRTemp dV     = newTemp(Ity_V128);
   25564          Bool   isAdd  = opc == 0x7C;
   25565          const HChar* str = isAdd ? "add" : "sub";
   25566          UChar modrm   = getUChar(delta);
   25567          UInt   rG     = gregOfRexRM(pfx,modrm);
   25568          UInt   rV     = getVexNvvvv(pfx);
   25569          if (epartIsReg(modrm)) {
   25570             UInt rE = eregOfRexRM(pfx,modrm);
   25571             assign( sV, getXMMReg(rE) );
   25572             DIP("vh%spd %s,%s,%s\n", str, nameXMMReg(rE),
   25573                 nameXMMReg(rV), nameXMMReg(rG));
   25574             delta += 1;
   25575          } else {
   25576             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   25577             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   25578             DIP("vh%spd %s,%s,%s\n", str, dis_buf,
   25579                 nameXMMReg(rV), nameXMMReg(rG));
   25580             delta += alen;
   25581          }
   25582          assign( dV, getXMMReg(rV) );
   25583          putYMMRegLoAndZU( rG, mkexpr( math_HADDPS_128 ( dV, sV, isAdd ) ) );
   25584          *uses_vvvv = True;
   25585          goto decode_success;
   25586       }
   25587       /* VHADDPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.F2.0F.WIG 7C /r */
   25588       /* VHSUBPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.F2.0F.WIG 7D /r */
   25589       if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25590          IRTemp sV     = newTemp(Ity_V256);
   25591          IRTemp dV     = newTemp(Ity_V256);
   25592          IRTemp s1, s0, d1, d0;
   25593          Bool   isAdd  = opc == 0x7C;
   25594          const HChar* str = isAdd ? "add" : "sub";
   25595          UChar modrm   = getUChar(delta);
   25596          UInt   rG     = gregOfRexRM(pfx,modrm);
   25597          UInt   rV     = getVexNvvvv(pfx);
   25598          s1 = s0 = d1 = d0 = IRTemp_INVALID;
   25599          if (epartIsReg(modrm)) {
   25600             UInt rE = eregOfRexRM(pfx,modrm);
   25601             assign( sV, getYMMReg(rE) );
   25602             DIP("vh%spd %s,%s,%s\n", str, nameYMMReg(rE),
   25603                 nameYMMReg(rV), nameYMMReg(rG));
   25604             delta += 1;
   25605          } else {
   25606             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   25607             assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
   25608             DIP("vh%spd %s,%s,%s\n", str, dis_buf,
   25609                 nameYMMReg(rV), nameYMMReg(rG));
   25610             delta += alen;
   25611          }
   25612          assign( dV, getYMMReg(rV) );
   25613          breakupV256toV128s( dV, &d1, &d0 );
   25614          breakupV256toV128s( sV, &s1, &s0 );
   25615          putYMMReg( rG, binop(Iop_V128HLtoV256,
   25616                               mkexpr( math_HADDPS_128 ( d1, s1, isAdd ) ),
   25617                               mkexpr( math_HADDPS_128 ( d0, s0, isAdd ) ) ) );
   25618          *uses_vvvv = True;
   25619          goto decode_success;
   25620       }
   25621       /* VHADDPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 7C /r */
   25622       /* VHSUBPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 7D /r */
   25623       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   25624          IRTemp sV     = newTemp(Ity_V128);
   25625          IRTemp dV     = newTemp(Ity_V128);
   25626          Bool   isAdd  = opc == 0x7C;
   25627          const HChar* str = isAdd ? "add" : "sub";
   25628          UChar modrm   = getUChar(delta);
   25629          UInt   rG     = gregOfRexRM(pfx,modrm);
   25630          UInt   rV     = getVexNvvvv(pfx);
   25631          if (epartIsReg(modrm)) {
   25632             UInt rE = eregOfRexRM(pfx,modrm);
   25633             assign( sV, getXMMReg(rE) );
   25634             DIP("vh%spd %s,%s,%s\n", str, nameXMMReg(rE),
   25635                 nameXMMReg(rV), nameXMMReg(rG));
   25636             delta += 1;
   25637          } else {
   25638             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   25639             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   25640             DIP("vh%spd %s,%s,%s\n", str, dis_buf,
   25641                 nameXMMReg(rV), nameXMMReg(rG));
   25642             delta += alen;
   25643          }
   25644          assign( dV, getXMMReg(rV) );
   25645          putYMMRegLoAndZU( rG, mkexpr( math_HADDPD_128 ( dV, sV, isAdd ) ) );
   25646          *uses_vvvv = True;
   25647          goto decode_success;
   25648       }
   25649       /* VHADDPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 7C /r */
   25650       /* VHSUBPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 7D /r */
   25651       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   25652          IRTemp sV     = newTemp(Ity_V256);
   25653          IRTemp dV     = newTemp(Ity_V256);
   25654          IRTemp s1, s0, d1, d0;
   25655          Bool   isAdd  = opc == 0x7C;
   25656          const HChar* str = isAdd ? "add" : "sub";
   25657          UChar modrm   = getUChar(delta);
   25658          UInt   rG     = gregOfRexRM(pfx,modrm);
   25659          UInt   rV     = getVexNvvvv(pfx);
   25660          s1 = s0 = d1 = d0 = IRTemp_INVALID;
   25661          if (epartIsReg(modrm)) {
   25662             UInt rE = eregOfRexRM(pfx,modrm);
   25663             assign( sV, getYMMReg(rE) );
   25664             DIP("vh%spd %s,%s,%s\n", str, nameYMMReg(rE),
   25665                 nameYMMReg(rV), nameYMMReg(rG));
   25666             delta += 1;
   25667          } else {
   25668             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   25669             assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
   25670             DIP("vh%spd %s,%s,%s\n", str, dis_buf,
   25671                 nameYMMReg(rV), nameYMMReg(rG));
   25672             delta += alen;
   25673          }
   25674          assign( dV, getYMMReg(rV) );
   25675          breakupV256toV128s( dV, &d1, &d0 );
   25676          breakupV256toV128s( sV, &s1, &s0 );
   25677          putYMMReg( rG, binop(Iop_V128HLtoV256,
   25678                               mkexpr( math_HADDPD_128 ( d1, s1, isAdd ) ),
   25679                               mkexpr( math_HADDPD_128 ( d0, s0, isAdd ) ) ) );
   25680          *uses_vvvv = True;
   25681          goto decode_success;
   25682       }
   25683       break;
   25684 
   case 0x7E:
      /* Note the Intel docs don't make sense for this.  I think they
         are wrong.  They seem to imply it is a store when in fact I
         think it is a load.  Also it's unclear whether this is W0, W1
         or WIG. */
      /* VMOVQ xmm2/m64, xmm1 = VEX.128.F3.0F.W0 7E /r */
      /* Load form: low 64 bits of E go to the low lane of G, and the
         remaining bits 255:64 of the G ymm register are zeroed. */
      if (haveF3no66noF2(pfx)
          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         vassert(sz == 4); /* even tho we are transferring 8, not 4. */
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            putXMMRegLane64( rG, 0, getXMMRegLane64( rE, 0 ));
            DIP("vmovq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            putXMMRegLane64( rG, 0, loadLE(Ity_I64, mkexpr(addr)) );
            DIP("vmovq %s,%s\n", dis_buf, nameXMMReg(rG));
            delta += alen;
         }
         /* zero bits 255:64 */
         putXMMRegLane64( rG, 1, mkU64(0) );
         putYMMRegLane128( rG, 1, mkV128(0) );
         goto decode_success;
      }
      /* VMOVQ xmm1, r64 = VEX.128.66.0F.W1 7E /r (reg case only) */
      /* Moves from G to E, so is a store-form insn */
      /* Intel docs list this in the VMOVD entry for some reason. */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            DIP("vmovq %s,%s\n", nameXMMReg(rG), nameIReg64(rE));
            putIReg64(rE, getXMMRegLane64(rG, 0));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            storeLE( mkexpr(addr), getXMMRegLane64(rG, 0) );
            DIP("vmovq %s,%s\n", dis_buf, nameXMMReg(rG));
            delta += alen;
         }
         goto decode_success;
      }
      /* VMOVD xmm1, m32/r32 = VEX.128.66.0F.W0 7E /r (reg case only) */
      /* Moves from G to E, so is a store-form insn */
      /* 32-bit sibling of the W1 case above: transfers the low 32
         bits of G instead of the low 64. */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            DIP("vmovd %s,%s\n", nameXMMReg(rG), nameIReg32(rE));
            putIReg32(rE, getXMMRegLane32(rG, 0));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            storeLE( mkexpr(addr), getXMMRegLane32(rG, 0) );
            DIP("vmovd %s,%s\n", dis_buf, nameXMMReg(rG));
            delta += alen;
         }
         goto decode_success;
      }
      break;
   25752 
   case 0x7F:
      /* Store forms of VMOVDQA/VMOVDQU (G -> E).  The 66 prefix
         selects the aligned variant ('a'), F3 the unaligned ('u'). */
      /* VMOVDQA ymm1, ymm2/m256 = VEX.256.66.0F.WIG 7F */
      /* VMOVDQU ymm1, ymm2/m256 = VEX.256.F3.0F.WIG 7F */
      if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
          && 1==getVexL(pfx)/*256*/) {
         UChar  modrm = getUChar(delta);
         UInt   rS    = gregOfRexRM(pfx, modrm);
         IRTemp tS    = newTemp(Ity_V256);
         Bool   isA   = have66noF2noF3(pfx);
         HChar  ch    = isA ? 'a' : 'u';
         assign(tS, getYMMReg(rS));
         if (epartIsReg(modrm)) {
            UInt rD = eregOfRexRM(pfx, modrm);
            delta += 1;
            putYMMReg(rD, mkexpr(tS));
            DIP("vmovdq%c %s,%s\n", ch, nameYMMReg(rS), nameYMMReg(rD));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            /* The aligned variant faults on a misaligned address. */
            if (isA)
               gen_SEGV_if_not_32_aligned(addr);
            storeLE(mkexpr(addr), mkexpr(tS));
            DIP("vmovdq%c %s,%s\n", ch, nameYMMReg(rS), dis_buf);
         }
         goto decode_success;
      }
      /* VMOVDQA xmm1, xmm2/m128 = VEX.128.66.0F.WIG 7F */
      /* VMOVDQU xmm1, xmm2/m128 = VEX.128.F3.0F.WIG 7F */
      if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
          && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   rS    = gregOfRexRM(pfx, modrm);
         IRTemp tS    = newTemp(Ity_V128);
         Bool   isA   = have66noF2noF3(pfx);
         HChar  ch    = isA ? 'a' : 'u';
         assign(tS, getXMMReg(rS));
         if (epartIsReg(modrm)) {
            UInt rD = eregOfRexRM(pfx, modrm);
            delta += 1;
            /* Register destination: upper ymm bits of rD are zeroed. */
            putYMMRegLoAndZU(rD, mkexpr(tS));
            DIP("vmovdq%c %s,%s\n", ch, nameXMMReg(rS), nameXMMReg(rD));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            /* The aligned variant faults on a misaligned address. */
            if (isA)
               gen_SEGV_if_not_16_aligned(addr);
            storeLE(mkexpr(addr), mkexpr(tS));
            DIP("vmovdq%c %s,%s\n", ch, nameXMMReg(rS), dis_buf);
         }
         goto decode_success;
      }
      break;
   25805 
   case 0xAE:
      /* Group AE: memory forms only; the reg field of the modrm byte
         selects the operation (/3 = store MXCSR, /2 = load MXCSR). */
      /* VSTMXCSR m32 = VEX.LZ.0F.WIG AE /3 */
      if (haveNo66noF2noF3(pfx)
          && 0==getVexL(pfx)/*LZ*/
          && 0==getRexW(pfx) /* be paranoid -- Intel docs don't require this */
          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 3
          && sz == 4) {
         delta = dis_STMXCSR(vbi, pfx, delta, True/*isAvx*/);
         goto decode_success;
      }
      /* VLDMXCSR m32 = VEX.LZ.0F.WIG AE /2 */
      if (haveNo66noF2noF3(pfx)
          && 0==getVexL(pfx)/*LZ*/
          && 0==getRexW(pfx) /* be paranoid -- Intel docs don't require this */
          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 2
          && sz == 4) {
         delta = dis_LDMXCSR(vbi, pfx, delta, True/*isAvx*/);
         goto decode_success;
      }
      break;
   25826 
   case 0xC2:
      /* VCMPxx family: a trailing imm8 selects the comparison
         predicate.  Each dis_AVX*_cmp_* helper returns the original
         delta when it cannot decode (presumably an unsupported
         predicate byte -- TODO confirm), hence the delta > delta0
         checks before declaring success. */
      /* VCMPSD xmm3/m64(E=argL), xmm2(V=argR), xmm1(G) */
      /* = VEX.NDS.LIG.F2.0F.WIG C2 /r ib */
      if (haveF2no66noF3(pfx)) {
         Long delta0 = delta;
         delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
                                          "vcmpsd", False/*!all_lanes*/,
                                          8/*sz*/);
         if (delta > delta0) goto decode_success;
         /* else fall through -- decoding has failed */
      }
      /* VCMPSS xmm3/m32(E=argL), xmm2(V=argR), xmm1(G) */
      /* = VEX.NDS.LIG.F3.0F.WIG C2 /r ib */
      if (haveF3no66noF2(pfx)) {
         Long delta0 = delta;
         delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
                                          "vcmpss", False/*!all_lanes*/,
                                          4/*sz*/);
         if (delta > delta0) goto decode_success;
         /* else fall through -- decoding has failed */
      }
      /* VCMPPD xmm3/m128(E=argL), xmm2(V=argR), xmm1(G) */
      /* = VEX.NDS.128.66.0F.WIG C2 /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Long delta0 = delta;
         delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
                                          "vcmppd", True/*all_lanes*/,
                                          8/*sz*/);
         if (delta > delta0) goto decode_success;
         /* else fall through -- decoding has failed */
      }
      /* VCMPPD ymm3/m256(E=argL), ymm2(V=argR), ymm1(G) */
      /* = VEX.NDS.256.66.0F.WIG C2 /r ib */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         Long delta0 = delta;
         delta = dis_AVX256_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
                                          "vcmppd", 8/*sz*/);
         if (delta > delta0) goto decode_success;
         /* else fall through -- decoding has failed */
      }
      /* VCMPPS xmm3/m128(E=argL), xmm2(V=argR), xmm1(G) */
      /* = VEX.NDS.128.0F.WIG C2 /r ib */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Long delta0 = delta;
         delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
                                          "vcmpps", True/*all_lanes*/,
                                          4/*sz*/);
         if (delta > delta0) goto decode_success;
         /* else fall through -- decoding has failed */
      }
      /* VCMPPS ymm3/m256(E=argL), ymm2(V=argR), ymm1(G) */
      /* = VEX.NDS.256.0F.WIG C2 /r ib */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         Long delta0 = delta;
         delta = dis_AVX256_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
                                          "vcmpps", 4/*sz*/);
         if (delta > delta0) goto decode_success;
         /* else fall through -- decoding has failed */
      }
      break;
   25887 
   case 0xC4:
      /* VPINSRW r32/m16, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG C4 /r ib */
      /* Insert a 16-bit value into lane imm8&7 of rV; result goes to
         rG with the upper ymm bits zeroed. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         Int    imm8;
         IRTemp new16 = newTemp(Ity_I16);

         if ( epartIsReg( modrm ) ) {
            /* Only the low 3 bits of the imm select a lane. */
            imm8 = (Int)(getUChar(delta+1) & 7);
            assign( new16, unop(Iop_32to16,
                                getIReg32(eregOfRexRM(pfx,modrm))) );
            delta += 1+1;
            DIP( "vpinsrw $%d,%s,%s\n", imm8,
                 nameIReg32( eregOfRexRM(pfx, modrm) ), nameXMMReg(rG) );
         } else {
            /* Memory form: the extra byte after the amode is the imm8. */
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            imm8 = (Int)(getUChar(delta+alen) & 7);
            assign( new16, loadLE( Ity_I16, mkexpr(addr) ));
            delta += alen+1;
            DIP( "vpinsrw $%d,%s,%s\n",
                 imm8, dis_buf, nameXMMReg(rG) );
         }

         IRTemp src_vec = newTemp(Ity_V128);
         assign(src_vec, getXMMReg( rV ));
         IRTemp res_vec = math_PINSRW_128( src_vec, new16, imm8 );
         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   25921 
   case 0xC5:
      /* VPEXTRW imm8, xmm1, reg32 = VEX.128.66.0F.W0 C5 /r ib */
      /* Register-source form only; the helper leaves delta unchanged
         on a failed decode, in which case we fall through. */
      if (have66noF2noF3(pfx)
         && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         Long delta0 = delta;
         delta = dis_PEXTRW_128_EregOnly_toG( vbi, pfx, delta,
                                              True/*isAvx*/ );
         if (delta > delta0) goto decode_success;
         /* else fall through -- decoding has failed */
      }
      break;
   25933 
   case 0xC6:
      /* VSHUFPS/VSHUFPD: shuffle lanes of V and E under control of a
         trailing imm8; result to G (upper ymm bits zeroed in the
         128-bit forms). */
      /* VSHUFPS imm8, xmm3/m128, xmm2, xmm1 */
      /* = VEX.NDS.128.0F.WIG C6 /r ib */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Int    imm8 = 0;
         IRTemp eV   = newTemp(Ity_V128);
         IRTemp vV   = newTemp(Ity_V128);
         UInt  modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         UInt  rV    = getVexNvvvv(pfx);
         assign( vV, getXMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getXMMReg(rE) );
            imm8 = (Int)getUChar(delta+1);
            delta += 1+1;
            DIP("vshufps $%d,%s,%s,%s\n",
                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
            imm8 = (Int)getUChar(delta+alen);
            delta += 1+alen;
            DIP("vshufps $%d,%s,%s,%s\n",
                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
         }
         IRTemp res = math_SHUFPS_128( eV, vV, imm8 );
         putYMMRegLoAndZU( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VSHUFPS imm8, ymm3/m256, ymm2, ymm1 */
      /* = VEX.NDS.256.0F.WIG C6 /r ib */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         Int    imm8 = 0;
         IRTemp eV   = newTemp(Ity_V256);
         IRTemp vV   = newTemp(Ity_V256);
         UInt  modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         UInt  rV    = getVexNvvvv(pfx);
         assign( vV, getYMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getYMMReg(rE) );
            imm8 = (Int)getUChar(delta+1);
            delta += 1+1;
            DIP("vshufps $%d,%s,%s,%s\n",
                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
            imm8 = (Int)getUChar(delta+alen);
            delta += 1+alen;
            DIP("vshufps $%d,%s,%s,%s\n",
                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
         }
         IRTemp res = math_SHUFPS_256( eV, vV, imm8 );
         putYMMReg( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VSHUFPD imm8, xmm3/m128, xmm2, xmm1 */
      /* = VEX.NDS.128.66.0F.WIG C6 /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Int    imm8 = 0;
         IRTemp eV   = newTemp(Ity_V128);
         IRTemp vV   = newTemp(Ity_V128);
         UInt  modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         UInt  rV    = getVexNvvvv(pfx);
         assign( vV, getXMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getXMMReg(rE) );
            imm8 = (Int)getUChar(delta+1);
            delta += 1+1;
            DIP("vshufpd $%d,%s,%s,%s\n",
                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
            imm8 = (Int)getUChar(delta+alen);
            delta += 1+alen;
            DIP("vshufpd $%d,%s,%s,%s\n",
                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
         }
         IRTemp res = math_SHUFPD_128( eV, vV, imm8 );
         putYMMRegLoAndZU( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VSHUFPD imm8, ymm3/m256, ymm2, ymm1 */
      /* = VEX.NDS.256.66.0F.WIG C6 /r ib */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         Int    imm8 = 0;
         IRTemp eV   = newTemp(Ity_V256);
         IRTemp vV   = newTemp(Ity_V256);
         UInt  modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         UInt  rV    = getVexNvvvv(pfx);
         assign( vV, getYMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getYMMReg(rE) );
            imm8 = (Int)getUChar(delta+1);
            delta += 1+1;
            DIP("vshufpd $%d,%s,%s,%s\n",
                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
            imm8 = (Int)getUChar(delta+alen);
            delta += 1+alen;
            DIP("vshufpd $%d,%s,%s,%s\n",
                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
         }
         IRTemp res = math_SHUFPD_256( eV, vV, imm8 );
         putYMMReg( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   26056 
   26057    case 0xD0:
   26058       /* VADDSUBPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D0 /r */
   26059       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26060          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
   26061                     uses_vvvv, vbi, pfx, delta,
   26062                     "vaddsubpd", math_ADDSUBPD_128 );
   26063          goto decode_success;
   26064       }
   26065       /* VADDSUBPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D0 /r */
   26066       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26067          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   26068                     uses_vvvv, vbi, pfx, delta,
   26069                     "vaddsubpd", math_ADDSUBPD_256 );
   26070          goto decode_success;
   26071       }
   26072       /* VADDSUBPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.F2.0F.WIG D0 /r */
   26073       if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26074          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
   26075                     uses_vvvv, vbi, pfx, delta,
   26076                     "vaddsubps", math_ADDSUBPS_128 );
   26077          goto decode_success;
   26078       }
   26079       /* VADDSUBPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.F2.0F.WIG D0 /r */
   26080       if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26081          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   26082                     uses_vvvv, vbi, pfx, delta,
   26083                     "vaddsubps", math_ADDSUBPS_256 );
   26084          goto decode_success;
   26085       }
   26086       break;
   26087 
   26088    case 0xD1:
   26089       /* VPSRLW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D1 /r */
   26090       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26091          delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
   26092                                         "vpsrlw", Iop_ShrN16x8 );
   26093          *uses_vvvv = True;
   26094          goto decode_success;
   26095 
   26096       }
   26097       /* VPSRLW xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D1 /r */
   26098       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26099          delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
   26100                                         "vpsrlw", Iop_ShrN16x16 );
   26101          *uses_vvvv = True;
   26102          goto decode_success;
   26103 
   26104       }
   26105       break;
   26106 
   case 0xD2:
      /* VPSRLD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D2 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
                                        "vpsrld", Iop_ShrN32x4 );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPSRLD xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D2 /r */
      /* NB: even in the 256-bit form the shift count (E) is an
         xmm/m128 operand. */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
                                        "vpsrld", Iop_ShrN32x8 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   26123 
   case 0xD3:
      /* VPSRLQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D3 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
                                        "vpsrlq", Iop_ShrN64x2 );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPSRLQ xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D3 /r */
      /* NB: even in the 256-bit form the shift count (E) is an
         xmm/m128 operand. */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
                                        "vpsrlq", Iop_ShrN64x4 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   26140 
   case 0xD4:
      /* 64-bit lane-wise add; uses_vvvv is passed into the helper. */
      /* VPADDQ r/m, rV, r ::: r = rV + r/m */
      /* VPADDQ = VEX.NDS.128.66.0F.WIG D4 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpaddq", Iop_Add64x2 );
         goto decode_success;
      }
      /* VPADDQ r/m, rV, r ::: r = rV + r/m */
      /* VPADDQ = VEX.NDS.256.66.0F.WIG D4 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpaddq", Iop_Add64x4 );
         goto decode_success;
      }
      break;

   case 0xD5:
      /* 16-bit lane-wise multiply, keeping the low 16 bits of each
         product (Iop_Mul16xN). */
      /* VPMULLW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D5 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpmullw", Iop_Mul16x8 );
         goto decode_success;
      }
      /* VPMULLW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D5 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpmullw", Iop_Mul16x16 );
         goto decode_success;
      }
      break;
   26172 
   case 0xD6:
      /* I can't even find any Intel docs for this one. */
      /* Basically: 66 0F D6 = MOVQ -- move 64 bits from G (lo half
         xmm) to E (mem or lo half xmm).  Looks like L==0(128), W==0
         (WIG, maybe?) */
      /* Only the store-to-memory form is decoded; the reg-reg form
         deliberately falls out of this case (no test case yet). */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0==getRexW(pfx)/*this might be redundant, dunno*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         if (epartIsReg(modrm)) {
            /* fall through, awaiting test case */
            /* dst: lo half copied, hi half zeroed */
         } else {
            /* Store only lane 0 (the low 64 bits) of rG to memory. */
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            storeLE( mkexpr(addr), getXMMRegLane64( rG, 0 ));
            DIP("vmovq %s,%s\n", nameXMMReg(rG), dis_buf );
            delta += alen;
            goto decode_success;
         }
      }
      break;
   26194 
   case 0xD7:
      /* Extract byte sign-mask into a GPR.  This is a 2-operand insn
         (no vvvv source), so *uses_vvvv is not set here. */
      /* VEX.128.66.0F.WIG D7 /r = VPMOVMSKB xmm1, r32 */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVMSKB_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VEX.256.66.0F.WIG D7 /r = VPMOVMSKB ymm1, r32 */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVMSKB_256( vbi, pfx, delta );
         goto decode_success;
      }
      break;
   26207 
   case 0xD8:
      /* Unsigned saturating subtract, 8-bit lanes. */
      /* VPSUBUSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D8 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpsubusb", Iop_QSub8Ux16 );
         goto decode_success;
      }
      /* VPSUBUSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D8 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpsubusb", Iop_QSub8Ux32 );
         goto decode_success;
      }
      break;

   case 0xD9:
      /* Unsigned saturating subtract, 16-bit lanes. */
      /* VPSUBUSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D9 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpsubusw", Iop_QSub16Ux8 );
         goto decode_success;
      }
      /* VPSUBUSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D9 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpsubusw", Iop_QSub16Ux16 );
         goto decode_success;
      }
      break;

   case 0xDA:
      /* Unsigned minimum, 8-bit lanes. */
      /* VPMINUB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DA /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpminub", Iop_Min8Ux16 );
         goto decode_success;
      }
      /* VPMINUB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DA /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpminub", Iop_Min8Ux32 );
         goto decode_success;
      }
      break;
   26252 
   case 0xDB:
      /* Bitwise AND over the whole vector. */
      /* VPAND r/m, rV, r ::: r = rV & r/m */
      /* VEX.NDS.128.66.0F.WIG DB /r = VPAND xmm3/m128, xmm2, xmm1 */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpand", Iop_AndV128 );
         goto decode_success;
      }
      /* VPAND r/m, rV, r ::: r = rV & r/m */
      /* VEX.NDS.256.66.0F.WIG DB /r = VPAND ymm3/m256, ymm2, ymm1 */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpand", Iop_AndV256 );
         goto decode_success;
      }
      break;

   case 0xDC:
      /* Unsigned saturating add, 8-bit lanes. */
      /* VPADDUSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DC /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpaddusb", Iop_QAdd8Ux16 );
         goto decode_success;
      }
      /* VPADDUSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DC /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpaddusb", Iop_QAdd8Ux32 );
         goto decode_success;
      }
      break;

   case 0xDD:
      /* Unsigned saturating add, 16-bit lanes. */
      /* VPADDUSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DD /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpaddusw", Iop_QAdd16Ux8 );
         goto decode_success;
      }
      /* VPADDUSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DD /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpaddusw", Iop_QAdd16Ux16 );
         goto decode_success;
      }
      break;

   case 0xDE:
      /* Unsigned maximum, 8-bit lanes. */
      /* VPMAXUB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DE /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpmaxub", Iop_Max8Ux16 );
         goto decode_success;
      }
      /* VPMAXUB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DE /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpmaxub", Iop_Max8Ux32 );
         goto decode_success;
      }
      break;

   case 0xDF:
      /* AND-NOT: the invertLeftArg flag makes the helper complement
         the E operand before the AND. */
      /* VPANDN r/m, rV, r ::: r = rV & ~r/m (is that correct, re the ~ ?) */
      /* VEX.NDS.128.66.0F.WIG DF /r = VPANDN xmm3/m128, xmm2, xmm1 */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpandn", Iop_AndV128,
                    NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
         goto decode_success;
      }
      /* VPANDN r/m, rV, r ::: r = rV & ~r/m (is that correct, re the ~ ?) */
      /* VEX.NDS.256.66.0F.WIG DF /r = VPANDN ymm3/m256, ymm2, ymm1 */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpandn", Iop_AndV256,
                    NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
         goto decode_success;
      }
      break;
   26333 
   case 0xE0:
      /* Unsigned rounding average, 8-bit lanes. */
      /* VPAVGB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E0 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpavgb", Iop_Avg8Ux16 );
         goto decode_success;
      }
      /* VPAVGB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E0 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpavgb", Iop_Avg8Ux32 );
         goto decode_success;
      }
      break;

   case 0xE1:
      /* Arithmetic right shift of 16-bit lanes, count from E. */
      /* VPSRAW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E1 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
                                        "vpsraw", Iop_SarN16x8 );
         /* shiftV_byE helpers don't take uses_vvvv; record it here. */
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPSRAW xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E1 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
                                        "vpsraw", Iop_SarN16x16 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0xE2:
      /* Arithmetic right shift of 32-bit lanes, count from E. */
      /* VPSRAD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E2 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
                                        "vpsrad", Iop_SarN32x4 );
         /* shiftV_byE helpers don't take uses_vvvv; record it here. */
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPSRAD xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E2 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
                                        "vpsrad", Iop_SarN32x8 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0xE3:
      /* Unsigned rounding average, 16-bit lanes. */
      /* VPAVGW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E3 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpavgw", Iop_Avg16Ux8 );
         goto decode_success;
      }
      /* VPAVGW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E3 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpavgw", Iop_Avg16Ux16 );
         goto decode_success;
      }
      break;
   26397 
   case 0xE4:
      /* 16-bit multiply keeping the high half of each product,
         unsigned. */
      /* VPMULHUW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E4 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpmulhuw", Iop_MulHi16Ux8 );
         goto decode_success;
      }
      /* VPMULHUW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E4 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpmulhuw", Iop_MulHi16Ux16 );
         goto decode_success;
      }
      break;

   case 0xE5:
      /* 16-bit multiply keeping the high half of each product,
         signed. */
      /* VPMULHW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E5 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpmulhw", Iop_MulHi16Sx8 );
         goto decode_success;
      }
      /* VPMULHW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E5 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpmulhw", Iop_MulHi16Sx16 );
         goto decode_success;
      }
      break;
   26427 
   case 0xE6:
      /* Opcode E6 encodes three different conversions, selected by
         the SIMD prefix: F3 = VCVTDQ2PD, 66 = VCVTTPD2DQ (truncating),
         F2 = VCVTPD2DQ (current rounding mode). */
      /* VCVTDQ2PD xmm2/m64, xmm1 = VEX.128.F3.0F.WIG E6 /r */
      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTDQ2PD_128(vbi, pfx, delta, True/*isAvx*/);
         goto decode_success;
      }
      /* VCVTDQ2PD xmm2/m128, ymm1 = VEX.256.F3.0F.WIG E6 /r */
      if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTDQ2PD_256(vbi, pfx, delta);
         goto decode_success;
      }
      /* VCVTTPD2DQ xmm2/m128, xmm1 = VEX.128.66.0F.WIG E6 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTxPD2DQ_128(vbi, pfx, delta, True/*isAvx*/,
                                   True/*r2zero*/);
         goto decode_success;
      }
      /* VCVTTPD2DQ ymm2/m256, xmm1 = VEX.256.66.0F.WIG E6 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTxPD2DQ_256(vbi, pfx, delta, True/*r2zero*/);
         goto decode_success;
      }
      /* VCVTPD2DQ xmm2/m128, xmm1 = VEX.128.F2.0F.WIG E6 /r */
      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTxPD2DQ_128(vbi, pfx, delta, True/*isAvx*/,
                                   False/*!r2zero*/);
         goto decode_success;
      }
      /* VCVTPD2DQ ymm2/m256, xmm1 = VEX.256.F2.0F.WIG E6 /r */
      if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTxPD2DQ_256(vbi, pfx, delta, False/*!r2zero*/);
         goto decode_success;
      }
      break;
   26462 
   26463    case 0xE7:
   26464       /* VMOVNTDQ xmm1, m128 = VEX.128.66.0F.WIG E7 /r */
   26465       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26466          UChar modrm = getUChar(delta);
   26467          UInt rG     = gregOfRexRM(pfx,modrm);
   26468          if (!epartIsReg(modrm)) {
   26469             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   26470             gen_SEGV_if_not_16_aligned( addr );
   26471             storeLE( mkexpr(addr), getXMMReg(rG) );
   26472             DIP("vmovntdq %s,%s\n", dis_buf, nameXMMReg(rG));
   26473             delta += alen;
   26474             goto decode_success;
   26475          }
   26476          /* else fall through */
   26477       }
   26478       /* VMOVNTDQ ymm1, m256 = VEX.256.66.0F.WIG E7 /r */
   26479       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26480          UChar modrm = getUChar(delta);
   26481          UInt rG     = gregOfRexRM(pfx,modrm);
   26482          if (!epartIsReg(modrm)) {
   26483             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   26484             gen_SEGV_if_not_32_aligned( addr );
   26485             storeLE( mkexpr(addr), getYMMReg(rG) );
   26486             DIP("vmovntdq %s,%s\n", dis_buf, nameYMMReg(rG));
   26487             delta += alen;
   26488             goto decode_success;
   26489          }
   26490          /* else fall through */
   26491       }
   26492       break;
   26493 
   case 0xE8:
      /* Signed saturating subtract, 8-bit lanes. */
      /* VPSUBSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E8 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpsubsb", Iop_QSub8Sx16 );
         goto decode_success;
      }
      /* VPSUBSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E8 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpsubsb", Iop_QSub8Sx32 );
         goto decode_success;
      }
      break;

   case 0xE9:
      /* Signed saturating subtract, 16-bit lanes. */
      /* VPSUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E9 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpsubsw", Iop_QSub16Sx8 );
         goto decode_success;
      }
      /* VPSUBSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E9 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpsubsw", Iop_QSub16Sx16 );
         goto decode_success;
      }
      break;

   case 0xEA:
      /* Signed minimum, 16-bit lanes. */
      /* VPMINSW r/m, rV, r ::: r = min-signed16s(rV, r/m) */
      /* VPMINSW = VEX.NDS.128.66.0F.WIG EA /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpminsw", Iop_Min16Sx8 );
         goto decode_success;
      }
      /* VPMINSW r/m, rV, r ::: r = min-signed16s(rV, r/m) */
      /* VPMINSW = VEX.NDS.256.66.0F.WIG EA /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpminsw", Iop_Min16Sx16 );
         goto decode_success;
      }
      break;

   case 0xEB:
      /* Bitwise OR over the whole vector. */
      /* VPOR r/m, rV, r ::: r = rV | r/m */
      /* VPOR = VEX.NDS.128.66.0F.WIG EB /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpor", Iop_OrV128 );
         goto decode_success;
      }
      /* VPOR r/m, rV, r ::: r = rV | r/m */
      /* VPOR = VEX.NDS.256.66.0F.WIG EB /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpor", Iop_OrV256 );
         goto decode_success;
      }
      break;
   26557 
   case 0xEC:
      /* Signed saturating add, 8-bit lanes. */
      /* VPADDSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG EC /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpaddsb", Iop_QAdd8Sx16 );
         goto decode_success;
      }
      /* VPADDSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG EC /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpaddsb", Iop_QAdd8Sx32 );
         goto decode_success;
      }
      break;

   case 0xED:
      /* Signed saturating add, 16-bit lanes. */
      /* VPADDSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG ED /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpaddsw", Iop_QAdd16Sx8 );
         goto decode_success;
      }
      /* VPADDSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG ED /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpaddsw", Iop_QAdd16Sx16 );
         goto decode_success;
      }
      break;

   case 0xEE:
      /* Signed maximum, 16-bit lanes. */
      /* VPMAXSW r/m, rV, r ::: r = max-signed16s(rV, r/m) */
      /* VPMAXSW = VEX.NDS.128.66.0F.WIG EE /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpmaxsw", Iop_Max16Sx8 );
         goto decode_success;
      }
      /* VPMAXSW r/m, rV, r ::: r = max-signed16s(rV, r/m) */
      /* VPMAXSW = VEX.NDS.256.66.0F.WIG EE /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpmaxsw", Iop_Max16Sx16 );
         goto decode_success;
      }
      break;

   case 0xEF:
      /* Bitwise XOR over the whole vector. */
      /* VPXOR r/m, rV, r ::: r = rV ^ r/m */
      /* VPXOR = VEX.NDS.128.66.0F.WIG EF /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpxor", Iop_XorV128 );
         goto decode_success;
      }
      /* VPXOR r/m, rV, r ::: r = rV ^ r/m */
      /* VPXOR = VEX.NDS.256.66.0F.WIG EF /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpxor", Iop_XorV256 );
         goto decode_success;
      }
      break;
   26621 
   case 0xF0:
      /* Unaligned vector load.  Note: unlike vmovntdq (case 0xE7),
         no alignment check is generated.  A register E operand is
         invalid and breaks out undecoded. */
      /* VLDDQU m256, ymm1 = VEX.256.F2.0F.WIG F0 /r */
      if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar  modrm = getUChar(delta);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp tD    = newTemp(Ity_V256);
         if (epartIsReg(modrm)) break;
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         assign(tD, loadLE(Ity_V256, mkexpr(addr)));
         DIP("vlddqu %s,%s\n", dis_buf, nameYMMReg(rD));
         putYMMReg(rD, mkexpr(tD));
         goto decode_success;
      }
      /* VLDDQU m128, xmm1 = VEX.128.F2.0F.WIG F0 /r */
      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp tD    = newTemp(Ity_V128);
         if (epartIsReg(modrm)) break;
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         assign(tD, loadLE(Ity_V128, mkexpr(addr)));
         DIP("vlddqu %s,%s\n", dis_buf, nameXMMReg(rD));
         /* 128-bit form zeroes the upper half of the YMM register. */
         putYMMRegLoAndZU(rD, mkexpr(tD));
         goto decode_success;
      }
      break;
   26650 
   26651    case 0xF1:
   26652       /* VPSLLW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F1 /r */
   26653       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26654          delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
   26655                                         "vpsllw", Iop_ShlN16x8 );
   26656          *uses_vvvv = True;
   26657          goto decode_success;
   26658 
   26659       }
   26660       /* VPSLLW xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F1 /r */
   26661       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26662          delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
   26663                                         "vpsllw", Iop_ShlN16x16 );
   26664          *uses_vvvv = True;
   26665          goto decode_success;
   26666 
   26667       }
   26668       break;
   26669 
   case 0xF2:
      /* Logical left shift of 32-bit lanes, count from E. */
      /* VPSLLD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F2 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
                                        "vpslld", Iop_ShlN32x4 );
         /* shiftV_byE helpers don't take uses_vvvv; record it here. */
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPSLLD xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F2 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
                                        "vpslld", Iop_ShlN32x8 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0xF3:
      /* Logical left shift of 64-bit lanes, count from E. */
      /* VPSLLQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F3 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
                                        "vpsllq", Iop_ShlN64x2 );
         /* shiftV_byE helpers don't take uses_vvvv; record it here. */
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPSLLQ xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F3 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
                                        "vpsllq", Iop_ShlN64x4 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   26703 
   case 0xF4:
      /* Widening unsigned multiply; semantics live in the
         math_PMULUDQ_* helpers. */
      /* VPMULUDQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F4 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vpmuludq", math_PMULUDQ_128 );
         goto decode_success;
      }
      /* VPMULUDQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F4 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vpmuludq", math_PMULUDQ_256 );
         goto decode_success;
      }
      break;

   case 0xF5:
      /* Multiply-and-add of word pairs; semantics live in the
         math_PMADDWD_* helpers. */
      /* VPMADDWD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F5 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vpmaddwd", math_PMADDWD_128 );
         goto decode_success;
      }
      /* VPMADDWD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F5 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vpmaddwd", math_PMADDWD_256 );
         goto decode_success;
      }
      break;

   case 0xF6:
      /* Sum of absolute differences; semantics live in the
         math_PSADBW_* helpers. */
      /* VPSADBW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F6 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vpsadbw", math_PSADBW_128 );
         goto decode_success;
      }
      /* VPSADBW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F6 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vpsadbw", math_PSADBW_256 );
         goto decode_success;
      }
      break;
   26754 
   case 0xF7:
      /* Masked byte store.  Only the register-register encoding is
         valid, hence the epartIsReg requirement; 128-bit form only. */
      /* VMASKMOVDQU xmm2, xmm1 = VEX.128.66.0F.WIG F7 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && epartIsReg(getUChar(delta))) {
         delta = dis_MASKMOVDQU( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      break;
   26763 
   case 0xF8:
      /* Wrapping subtract, 8-bit lanes. */
      /* VPSUBB r/m, rV, r ::: r = rV - r/m */
      /* VPSUBB = VEX.NDS.128.66.0F.WIG F8 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpsubb", Iop_Sub8x16 );
         goto decode_success;
      }
      /* VPSUBB r/m, rV, r ::: r = rV - r/m */
      /* VPSUBB = VEX.NDS.256.66.0F.WIG F8 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpsubb", Iop_Sub8x32 );
         goto decode_success;
      }
      break;

   case 0xF9:
      /* Wrapping subtract, 16-bit lanes. */
      /* VPSUBW r/m, rV, r ::: r = rV - r/m */
      /* VPSUBW = VEX.NDS.128.66.0F.WIG F9 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpsubw", Iop_Sub16x8 );
         goto decode_success;
      }
      /* VPSUBW r/m, rV, r ::: r = rV - r/m */
      /* VPSUBW = VEX.NDS.256.66.0F.WIG F9 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpsubw", Iop_Sub16x16 );
         goto decode_success;
      }
      break;

   case 0xFA:
      /* Wrapping subtract, 32-bit lanes. */
      /* VPSUBD r/m, rV, r ::: r = rV - r/m */
      /* VPSUBD = VEX.NDS.128.66.0F.WIG FA /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpsubd", Iop_Sub32x4 );
         goto decode_success;
      }
      /* VPSUBD r/m, rV, r ::: r = rV - r/m */
      /* VPSUBD = VEX.NDS.256.66.0F.WIG FA /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpsubd", Iop_Sub32x8 );
         goto decode_success;
      }
      break;

   case 0xFB:
      /* Wrapping subtract, 64-bit lanes. */
      /* VPSUBQ r/m, rV, r ::: r = rV - r/m */
      /* VPSUBQ = VEX.NDS.128.66.0F.WIG FB /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpsubq", Iop_Sub64x2 );
         goto decode_success;
      }
      /* VPSUBQ r/m, rV, r ::: r = rV - r/m */
      /* VPSUBQ = VEX.NDS.256.66.0F.WIG FB /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpsubq", Iop_Sub64x4 );
         goto decode_success;
      }
      break;
   26831 
   26832    case 0xFC:
   26833       /* VPADDB r/m, rV, r ::: r = rV + r/m */
   26834       /* VPADDB = VEX.NDS.128.66.0F.WIG FC /r */
   26835       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26836          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   26837                     uses_vvvv, vbi, pfx, delta, "vpaddb", Iop_Add8x16 );
   26838          goto decode_success;
   26839       }
   26840       /* VPADDB r/m, rV, r ::: r = rV + r/m */
   26841       /* VPADDB = VEX.NDS.256.66.0F.WIG FC /r */
   26842       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26843          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   26844                     uses_vvvv, vbi, pfx, delta, "vpaddb", Iop_Add8x32 );
   26845          goto decode_success;
   26846       }
   26847       break;
   26848 
   26849    case 0xFD:
   26850       /* VPADDW r/m, rV, r ::: r = rV + r/m */
   26851       /* VPADDW = VEX.NDS.128.66.0F.WIG FD /r */
   26852       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26853          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   26854                     uses_vvvv, vbi, pfx, delta, "vpaddw", Iop_Add16x8 );
   26855          goto decode_success;
   26856       }
   26857       /* VPADDW r/m, rV, r ::: r = rV + r/m */
   26858       /* VPADDW = VEX.NDS.256.66.0F.WIG FD /r */
   26859       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26860          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   26861                     uses_vvvv, vbi, pfx, delta, "vpaddw", Iop_Add16x16 );
   26862          goto decode_success;
   26863       }
   26864       break;
   26865 
   26866    case 0xFE:
   26867       /* VPADDD r/m, rV, r ::: r = rV + r/m */
   26868       /* VPADDD = VEX.NDS.128.66.0F.WIG FE /r */
   26869       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   26870          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
   26871                     uses_vvvv, vbi, pfx, delta, "vpaddd", Iop_Add32x4 );
   26872          goto decode_success;
   26873       }
   26874       /* VPADDD r/m, rV, r ::: r = rV + r/m */
   26875       /* VPADDD = VEX.NDS.256.66.0F.WIG FE /r */
   26876       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   26877          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
   26878                     uses_vvvv, vbi, pfx, delta, "vpaddd", Iop_Add32x8 );
   26879          goto decode_success;
   26880       }
   26881       break;
   26882 
   26883    default:
   26884       break;
   26885 
   26886    }
   26887 
   26888   //decode_failure:
   26889    return deltaIN;
   26890 
   26891   decode_success:
   26892    return delta;
   26893 }
   26894 
   26895 
   26896 /*------------------------------------------------------------*/
   26897 /*---                                                      ---*/
   26898 /*--- Top-level post-escape decoders: dis_ESC_0F38__VEX    ---*/
   26899 /*---                                                      ---*/
   26900 /*------------------------------------------------------------*/
   26901 
   26902 static IRTemp math_PERMILPS_VAR_128 ( IRTemp dataV, IRTemp ctrlV )
   26903 {
   26904    /* In the control vector, zero out all but the bottom two bits of
   26905       each 32-bit lane. */
   26906    IRExpr* cv1 = binop(Iop_ShrN32x4,
   26907                        binop(Iop_ShlN32x4, mkexpr(ctrlV), mkU8(30)),
   26908                        mkU8(30));
   26909    /* And use the resulting cleaned-up control vector as steering
   26910       in a Perm operation. */
   26911    IRTemp res = newTemp(Ity_V128);
   26912    assign(res, binop(Iop_Perm32x4, mkexpr(dataV), cv1));
   26913    return res;
   26914 }
   26915 
   26916 static IRTemp math_PERMILPS_VAR_256 ( IRTemp dataV, IRTemp ctrlV )
   26917 {
   26918    IRTemp dHi, dLo, cHi, cLo;
   26919    dHi = dLo = cHi = cLo = IRTemp_INVALID;
   26920    breakupV256toV128s( dataV, &dHi, &dLo );
   26921    breakupV256toV128s( ctrlV, &cHi, &cLo );
   26922    IRTemp rHi = math_PERMILPS_VAR_128( dHi, cHi );
   26923    IRTemp rLo = math_PERMILPS_VAR_128( dLo, cLo );
   26924    IRTemp res = newTemp(Ity_V256);
   26925    assign(res, binop(Iop_V128HLtoV256, mkexpr(rHi), mkexpr(rLo)));
   26926    return res;
   26927 }
   26928 
   26929 static IRTemp math_PERMILPD_VAR_128 ( IRTemp dataV, IRTemp ctrlV )
   26930 {
   26931    /* No cleverness here .. */
   26932    IRTemp dHi, dLo, cHi, cLo;
   26933    dHi = dLo = cHi = cLo = IRTemp_INVALID;
   26934    breakupV128to64s( dataV, &dHi, &dLo );
   26935    breakupV128to64s( ctrlV, &cHi, &cLo );
   26936    IRExpr* rHi
   26937       = IRExpr_ITE( unop(Iop_64to1,
   26938                          binop(Iop_Shr64, mkexpr(cHi), mkU8(1))),
   26939                     mkexpr(dHi), mkexpr(dLo) );
   26940    IRExpr* rLo
   26941       = IRExpr_ITE( unop(Iop_64to1,
   26942                          binop(Iop_Shr64, mkexpr(cLo), mkU8(1))),
   26943                     mkexpr(dHi), mkexpr(dLo) );
   26944    IRTemp res = newTemp(Ity_V128);
   26945    assign(res, binop(Iop_64HLtoV128, rHi, rLo));
   26946    return res;
   26947 }
   26948 
   26949 static IRTemp math_PERMILPD_VAR_256 ( IRTemp dataV, IRTemp ctrlV )
   26950 {
   26951    IRTemp dHi, dLo, cHi, cLo;
   26952    dHi = dLo = cHi = cLo = IRTemp_INVALID;
   26953    breakupV256toV128s( dataV, &dHi, &dLo );
   26954    breakupV256toV128s( ctrlV, &cHi, &cLo );
   26955    IRTemp rHi = math_PERMILPD_VAR_128( dHi, cHi );
   26956    IRTemp rLo = math_PERMILPD_VAR_128( dLo, cLo );
   26957    IRTemp res = newTemp(Ity_V256);
   26958    assign(res, binop(Iop_V128HLtoV256, mkexpr(rHi), mkexpr(rLo)));
   26959    return res;
   26960 }
   26961 
   26962 static IRTemp math_VPERMD ( IRTemp ctrlV, IRTemp dataV )
   26963 {
   26964    /* In the control vector, zero out all but the bottom three bits of
   26965       each 32-bit lane. */
   26966    IRExpr* cv1 = binop(Iop_ShrN32x8,
   26967                        binop(Iop_ShlN32x8, mkexpr(ctrlV), mkU8(29)),
   26968                        mkU8(29));
   26969    /* And use the resulting cleaned-up control vector as steering
   26970       in a Perm operation. */
   26971    IRTemp res = newTemp(Ity_V256);
   26972    assign(res, binop(Iop_Perm32x8, mkexpr(dataV), cv1));
   26973    return res;
   26974 }
   26975 
   26976 static Long dis_SHIFTX ( /*OUT*/Bool* uses_vvvv,
   26977                          VexAbiInfo* vbi, Prefix pfx, Long delta,
   26978                          const HChar* opname, IROp op8 )
   26979 {
   26980    HChar   dis_buf[50];
   26981    Int     alen;
   26982    Int     size = getRexW(pfx) ? 8 : 4;
   26983    IRType  ty   = szToITy(size);
   26984    IRTemp  src  = newTemp(ty);
   26985    IRTemp  amt  = newTemp(ty);
   26986    UChar   rm   = getUChar(delta);
   26987 
   26988    assign( amt, getIRegV(size,pfx) );
   26989    if (epartIsReg(rm)) {
   26990       assign( src, getIRegE(size,pfx,rm) );
   26991       DIP("%s %s,%s,%s\n", opname, nameIRegV(size,pfx),
   26992                            nameIRegE(size,pfx,rm), nameIRegG(size,pfx,rm));
   26993       delta++;
   26994    } else {
   26995       IRTemp addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   26996       assign( src, loadLE(ty, mkexpr(addr)) );
   26997       DIP("%s %s,%s,%s\n", opname, nameIRegV(size,pfx), dis_buf,
   26998                            nameIRegG(size,pfx,rm));
   26999       delta += alen;
   27000    }
   27001 
   27002    putIRegG( size, pfx, rm,
   27003              binop(mkSizedOp(ty,op8), mkexpr(src),
   27004                    narrowTo(Ity_I8, binop(mkSizedOp(ty,Iop_And8), mkexpr(amt),
   27005                                           mkU(ty,8*size-1)))) );
   27006    /* Flags aren't modified.  */
   27007    *uses_vvvv = True;
   27008    return delta;
   27009 }
   27010 
   27011 
/* Decode and translate one FMA3 instruction: the VF{N}M{ADD,SUB}
   and VFM{ADDSUB,SUBADD} families in their 132/213/231 orderings,
   packed or scalar, F32 or F64.  The low nibble of 'opc' selects
   the operation; the high nibble selects the operand ordering.
   Computes, lane-wise, G = +/-(X*Y +/- Z) and writes register G. */
static Long dis_FMA ( VexAbiInfo* vbi, Prefix pfx, Long delta, UChar opc )
{
   UChar  modrm   = getUChar(delta);
   UInt   rG      = gregOfRexRM(pfx, modrm);
   UInt   rV      = getVexNvvvv(pfx);
   /* The scalar forms are the odd opcodes with low nibble > 7,
      i.e. x9/xB/xD/xF. */
   Bool   scalar  = (opc & 0xF) > 7 && (opc & 1);
   IRType ty      = getRexW(pfx) ? Ity_F64 : Ity_F32;
   IRType vty     = scalar ? ty : getVexL(pfx) ? Ity_V256 : Ity_V128;
   IRTemp vX      = newTemp(vty);
   IRTemp vY      = newTemp(vty);
   IRTemp vZ      = newTemp(vty);
   /* Per-lane operand expressions; at most 8 lanes (V256 of F32). */
   IRExpr *x[8], *y[8], *z[8];
   IRTemp addr    = IRTemp_INVALID;
   HChar  dis_buf[50];
   Int    alen    = 0;
   const HChar *name;    /* "add", "sub", "addsub" or "subadd" */
   const HChar *suffix;  /* "ss"/"sd"/"ps"/"pd" */
   const HChar *order;   /* "132", "213" or "231" */
   Bool   negateRes   = False;  /* negate the final result (VFNM..) */
   Bool   negateZeven = False;  /* negate Z in even-numbered lanes */
   Bool   negateZodd  = False;  /* negate Z in odd-numbered lanes */
   Int    i, j;
   Int    count;
   /* 64-bit lane extractors: entries [0..3] pull lanes out of a
      V256 source, entries [4..5] out of a V128 source; 'j' below
      selects which group is used. */
   static IROp ops[] = { Iop_V256to64_0, Iop_V256to64_1,
                         Iop_V256to64_2, Iop_V256to64_3,
                         Iop_V128to64, Iop_V128HIto64 };

   /* Low opcode nibble: which add/sub/negate combination. */
   switch (opc & 0xF) {
   case 0x6:
      name = "addsub";
      negateZeven = True;
      break;
   case 0x7:
      name = "subadd";
      negateZodd = True;
      break;
   case 0x8:
   case 0x9:
      name = "add";
      break;
   case 0xA:
   case 0xB:
      name = "sub";
      negateZeven = True;
      negateZodd = True;
      break;
   case 0xC:
   case 0xD:
      name = "add";
      negateRes = True;
      negateZeven = True;
      negateZodd = True;
      break;
   case 0xE:
   case 0xF:
      name = "sub";
      negateRes = True;
      break;
   default:
      vpanic("dis_FMA(amd64)");
      break;
   }
   /* High opcode nibble: operand ordering. */
   switch (opc & 0xF0) {
   case 0x90: order = "132"; break;
   case 0xA0: order = "213"; break;
   case 0xB0: order = "231"; break;
   default: vpanic("dis_FMA(amd64)"); break;
   }
   if (scalar)
      suffix = ty == Ity_F64 ? "sd" : "ss";
   else
      suffix = ty == Ity_F64 ? "pd" : "ps";

   /* X is fetched from the destination register G, Z from the vvvv
      register V; Y (the r/m operand) is fetched below. */
   if (scalar) {
      assign( vX, ty == Ity_F64
                  ? getXMMRegLane64F(rG, 0) : getXMMRegLane32F(rG, 0) );
      assign( vZ, ty == Ity_F64
                  ? getXMMRegLane64F(rV, 0) : getXMMRegLane32F(rV, 0) );
   } else {
      assign( vX, vty == Ity_V256 ? getYMMReg(rG) : getXMMReg(rG) );
      assign( vZ, vty == Ity_V256 ? getYMMReg(rV) : getXMMReg(rV) );
   }

   /* Fetch Y from register or memory, and print the insn. */
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx, modrm);
      delta += 1;
      if (scalar)
         assign( vY, ty == Ity_F64
                     ? getXMMRegLane64F(rE, 0) : getXMMRegLane32F(rE, 0) );
      else
         assign( vY, vty == Ity_V256 ? getYMMReg(rE) : getXMMReg(rE) );
      if (vty == Ity_V256) {
         DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "",
             name, order, suffix, nameYMMReg(rE), nameYMMReg(rV),
             nameYMMReg(rG));
      } else {
         DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "",
             name, order, suffix, nameXMMReg(rE), nameXMMReg(rV),
             nameXMMReg(rG));
      }
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      delta += alen;
      assign(vY, loadLE(vty, mkexpr(addr)));
      if (vty == Ity_V256) {
         DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "",
             name, order, suffix, dis_buf, nameYMMReg(rV),
             nameYMMReg(rG));
      } else {
         DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "",
             name, order, suffix, dis_buf, nameXMMReg(rV),
             nameXMMReg(rG));
      }
   }

   /* vX/vY/vZ now in 132 order.  If it is different order, swap the
      arguments.  */
   if ((opc & 0xF0) != 0x90) {
      IRTemp tem = vX;
      if ((opc & 0xF0) == 0xA0) {
         /* 213: rotate (X,Y,Z) -> (Z,X,Y)'s inverse so that the
            multiplicands and addend land in the right slots. */
         vX = vZ;
         vZ = vY;
         vY = tem;
      } else {
         /* 231: just exchange X and Z. */
         vX = vZ;
         vZ = tem;
      }
   }

   /* Break the vector operands into per-lane scalar expressions. */
   if (scalar) {
      count = 1;
      x[0] = mkexpr(vX);
      y[0] = mkexpr(vY);
      z[0] = mkexpr(vZ);
   } else if (ty == Ity_F32) {
      /* F32: pull out 64-bit chunks, then split each into two
         32-bit lanes.  j offsets into ops[]: 0 for V256 extractors,
         4 for V128. */
      count = vty == Ity_V256 ? 8 : 4;
      j = vty == Ity_V256 ? 0 : 4;
      for (i = 0; i < count; i += 2) {
         IRTemp tem = newTemp(Ity_I64);
         assign(tem, unop(ops[i / 2 + j], mkexpr(vX)));
         x[i] = unop(Iop_64to32, mkexpr(tem));
         x[i + 1] = unop(Iop_64HIto32, mkexpr(tem));
         tem = newTemp(Ity_I64);
         assign(tem, unop(ops[i / 2 + j], mkexpr(vY)));
         y[i] = unop(Iop_64to32, mkexpr(tem));
         y[i + 1] = unop(Iop_64HIto32, mkexpr(tem));
         tem = newTemp(Ity_I64);
         assign(tem, unop(ops[i / 2 + j], mkexpr(vZ)));
         z[i] = unop(Iop_64to32, mkexpr(tem));
         z[i + 1] = unop(Iop_64HIto32, mkexpr(tem));
      }
   } else {
      /* F64: one 64-bit extraction per lane. */
      count = vty == Ity_V256 ? 4 : 2;
      j = vty == Ity_V256 ? 0 : 4;
      for (i = 0; i < count; i++) {
         x[i] = unop(ops[i + j], mkexpr(vX));
         y[i] = unop(ops[i + j], mkexpr(vY));
         z[i] = unop(ops[i + j], mkexpr(vZ));
      }
   }
   /* Reinterpret the raw lane bits as floats (scalar lanes were
      already fetched as F32/F64 above). */
   if (!scalar)
      for (i = 0; i < count; i++) {
         IROp op = ty == Ity_F64
                   ? Iop_ReinterpI64asF64 : Iop_ReinterpI32asF32;
         x[i] = unop(op, x[i]);
         y[i] = unop(op, y[i]);
         z[i] = unop(op, z[i]);
      }
   /* Per lane: optionally negate Z, fuse-multiply-add, optionally
      negate the result, and write it back.  NOTE: as the helper's
      name suggests, the true rounding mode is not honoured here. */
   for (i = 0; i < count; i++) {
      if ((i & 1) ? negateZodd : negateZeven)
         z[i] = unop(ty == Ity_F64 ? Iop_NegF64 : Iop_NegF32, z[i]);
      x[i] = IRExpr_Qop(ty == Ity_F64 ? Iop_MAddF64 : Iop_MAddF32,
                        get_FAKE_roundingmode(), x[i], y[i], z[i]);
      if (negateRes)
         x[i] = unop(ty == Ity_F64 ? Iop_NegF64 : Iop_NegF32, x[i]);
      if (ty == Ity_F64)
         putYMMRegLane64F( rG, i, x[i] );
      else
         putYMMRegLane32F( rG, i, x[i] );
   }
   /* For the 128-bit and scalar forms, zero the upper 128 bits of
      the destination. */
   if (vty != Ity_V256)
      putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}
   27197 
   27198 
   27199 /* Masked load.  */
/* Decode a VEX masked vector load (the VMASKMOVPS/PD and
   VPMASKMOVD/Q style forms): for each lane, if the sign bit of the
   corresponding lane of mask register rV is set, load the lane from
   memory; otherwise set it to zero.  'ty' is the lane type (Ity_I32
   or Ity_I64) and 'isYMM' selects the 256-bit form. */
static ULong dis_VMASKMOV_load ( Bool *uses_vvvv, VexAbiInfo* vbi,
                                 Prefix pfx, Long delta,
                                 const HChar* opname, Bool isYMM, IRType ty )
{
   HChar   dis_buf[50];
   Int     alen, i;
   IRTemp  addr;
   UChar   modrm = getUChar(delta);
   UInt    rG    = gregOfRexRM(pfx,modrm);
   UInt    rV    = getVexNvvvv(pfx);
   IRTemp  res[8], cond;
   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   if (isYMM) {
      DIP("%s %s,%s,%s\n", opname, dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
   } else {
      DIP("%s %s,%s,%s\n", opname, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
   }
   delta += alen;

   /* Lane count is 2, 4 or 8 depending on vector width and lane
      size. */
   for (i = 0; i < 2 * (isYMM ? 2 : 1) * (ty == Ity_I32 ? 2 : 1); i++) {
      res[i] = newTemp(ty);
      cond = newTemp(Ity_I1);
      /* cond <- sign bit of mask lane i of rV. */
      assign( cond,
              binop(ty == Ity_I32 ? Iop_CmpLT32S : Iop_CmpLT64S,
                    ty == Ity_I32 ? getYMMRegLane32( rV, i )
                                  : getYMMRegLane64( rV, i ),
                    mkU(ty, 0) ));
      /* When the lane is masked off, the inner ITE steers the load
         address to RSP instead of the real address -- presumably so
         a masked-off lane can never fault (NOTE(review): this relies
         on RSP pointing at mappable memory; confirm) -- and the
         outer ITE then discards the loaded value, yielding zero. */
      assign( res[i],
              IRExpr_ITE(
                 mkexpr(cond),
                 loadLE(ty, IRExpr_ITE(
                               mkexpr(cond),
                               binop(Iop_Add64, mkexpr(addr),
                                     mkU64(i*(ty == Ity_I32 ? 4 : 8))),
                               getIReg64(R_RSP)
                            )
                       ),
                 mkU(ty, 0)
              )
            );
   }
   /* Write the loaded lanes to rG; for the 128-bit forms the upper
      lanes of the YMM register are zeroed. */
   switch (ty) {
      case Ity_I32:
         for (i = 0; i < 8; i++)
            putYMMRegLane32( rG, i, (i < 4 || isYMM)
                                    ? mkexpr(res[i]) : mkU32(0) );
         break;
      case Ity_I64:
         for (i = 0; i < 4; i++)
            putYMMRegLane64( rG, i, (i < 2 || isYMM)
                                    ? mkexpr(res[i]) : mkU64(0) );
         break;
      default: vassert(0);
   }

   *uses_vvvv = True;
   return delta;
}
   27258 
   27259 
   27260 /* Gather.  */
/* Decode a VEX AVX2 gather (the VGATHERDPS/QPS/DPD/QPD and
   VPGATHERDD/QD/DQ/QQ style forms): load elements from addresses
   formed from a vector index register (VSIB addressing), under
   control of mask register rV, whose lanes are cleared as elements
   are gathered.  'ty' is the element type (Ity_I32 or Ity_I64),
   'isYMM' selects the 256-bit form, and 'isVM64x' selects 64-bit
   indices. */
static ULong dis_VGATHER ( Bool *uses_vvvv, VexAbiInfo* vbi,
                           Prefix pfx, Long delta,
                           const HChar* opname, Bool isYMM,
                           Bool isVM64x, IRType ty )
{
   HChar  dis_buf[50];
   Int    alen, i, vscale, count1, count2;
   IRTemp addr;
   UChar  modrm = getUChar(delta);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   UInt   rV    = getVexNvvvv(pfx);
   UInt   rI;
   /* The widths of the destination and of the index vector each
      depend on the element-size / index-size combination. */
   IRType dstTy = (isYMM && (ty == Ity_I64 || !isVM64x)) ? Ity_V256 : Ity_V128;
   IRType idxTy = (isYMM && (ty == Ity_I32 || isVM64x)) ? Ity_V256 : Ity_V128;
   IRTemp cond;
   addr = disAVSIBMode ( &alen, vbi, pfx, delta, dis_buf, &rI,
                         idxTy, &vscale );
   /* Refuse to decode when the VSIB mode is invalid or when the
      destination, index and mask registers are not all distinct;
      returning the unadvanced delta signals decode failure to the
      caller.  (NOTE(review): overlapping registers are presumably
      #UD per the architecture -- confirm against the spec.) */
   if (addr == IRTemp_INVALID || rI == rG || rI == rV || rG == rV)
      return delta;
   if (dstTy == Ity_V256) {
      DIP("%s %s,%s,%s\n", opname, nameYMMReg(rV), dis_buf, nameYMMReg(rG) );
   } else {
      DIP("%s %s,%s,%s\n", opname, nameXMMReg(rV), dis_buf, nameXMMReg(rG) );
   }
   delta += alen;

   /* count1 = number of mask lanes to update; count2 = number of
      elements actually gathered (32-bit data with 64-bit indices
      only fills the lower half). */
   if (ty == Ity_I32) {
      count1 = isYMM ? 8 : 4;
      count2 = isVM64x ? count1 / 2 : count1;
   } else {
      count1 = count2 = isYMM ? 4 : 2;
   }

   /* First update the mask register to copies of the sign bit.  */
   if (ty == Ity_I32) {
      if (isYMM)
         putYMMReg( rV, binop(Iop_SarN32x8, getYMMReg( rV ), mkU8(31)) );
      else
         putYMMRegLoAndZU( rV, binop(Iop_SarN32x4, getXMMReg( rV ), mkU8(31)) );
   } else {
      for (i = 0; i < count1; i++) {
         putYMMRegLane64( rV, i, binop(Iop_Sar64, getYMMRegLane64( rV, i ),
                                       mkU8(63)) );
      }
   }

   /* Next gather the individual elements.  If any fault occurs, the
      corresponding mask element will be set and the loop stops.  */
   for (i = 0; i < count2; i++) {
      IRExpr *expr, *addr_expr;
      cond = newTemp(Ity_I1);
      /* cond <- sign bit of mask lane i of rV. */
      assign( cond,
              binop(ty == Ity_I32 ? Iop_CmpLT32S : Iop_CmpLT64S,
                    ty == Ity_I32 ? getYMMRegLane32( rV, i )
                                  : getYMMRegLane64( rV, i ),
                    mkU(ty, 0)) );
      /* Fallback value if the lane is masked off: the destination's
         existing lane value. */
      expr = ty == Ity_I32 ? getYMMRegLane32( rG, i )
                           : getYMMRegLane64( rG, i );
      /* Element address = base + (sign-extended) index * vscale. */
      addr_expr = isVM64x ? getYMMRegLane64( rI, i )
                          : unop(Iop_32Sto64, getYMMRegLane32( rI, i ));
      switch (vscale) {
         case 2: addr_expr = binop(Iop_Shl64, addr_expr, mkU8(1)); break;
         case 4: addr_expr = binop(Iop_Shl64, addr_expr, mkU8(2)); break;
         case 8: addr_expr = binop(Iop_Shl64, addr_expr, mkU8(3)); break;
         default: break;
      }
      addr_expr = binop(Iop_Add64, mkexpr(addr), addr_expr);
      addr_expr = handleAddrOverrides(vbi, pfx, addr_expr);
      /* Steer masked-off loads to RSP -- presumably so they cannot
         fault (NOTE(review): assumes RSP points at mappable memory;
         confirm). */
      addr_expr = IRExpr_ITE(mkexpr(cond), addr_expr, getIReg64(R_RSP));
      expr = IRExpr_ITE(mkexpr(cond), loadLE(ty, addr_expr), expr);
      /* Write the gathered (or retained) lane, and clear the
         corresponding mask lane. */
      if (ty == Ity_I32) {
         putYMMRegLane32( rG, i, expr );
         putYMMRegLane32( rV, i, mkU32(0) );
      } else {
         putYMMRegLane64( rG, i, expr);
         putYMMRegLane64( rV, i, mkU64(0) );
      }
   }

   /* Finally, zero whatever upper lanes of rG and rV were not
      covered by the gather (128-bit forms, and the 32-bit-data /
      64-bit-index combinations). */
   if (!isYMM || (ty == Ity_I32 && isVM64x)) {
      if (ty == Ity_I64 || isYMM)
         putYMMRegLane128( rV, 1, mkV128(0) );
      else if (ty == Ity_I32 && count2 == 2) {
         putYMMRegLane64( rV, 1, mkU64(0) );
         putYMMRegLane64( rG, 1, mkU64(0) );
      }
      putYMMRegLane128( rG, 1, mkV128(0) );
   }

   *uses_vvvv = True;
   return delta;
}
   27353 
   27354 
   27355 __attribute__((noinline))
   27356 static
   27357 Long dis_ESC_0F38__VEX (
   27358         /*MB_OUT*/DisResult* dres,
   27359         /*OUT*/   Bool*      uses_vvvv,
   27360         Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
   27361         Bool         resteerCisOk,
   27362         void*        callback_opaque,
   27363         VexArchInfo* archinfo,
   27364         VexAbiInfo*  vbi,
   27365         Prefix pfx, Int sz, Long deltaIN
   27366      )
   27367 {
   27368    IRTemp addr  = IRTemp_INVALID;
   27369    Int    alen  = 0;
   27370    HChar  dis_buf[50];
   27371    Long   delta = deltaIN;
   27372    UChar  opc   = getUChar(delta);
   27373    delta++;
   27374    *uses_vvvv = False;
   27375 
   27376    switch (opc) {
   27377 
   27378    case 0x00:
   27379       /* VPSHUFB r/m, rV, r ::: r = shuf(rV, r/m) */
   27380       /* VPSHUFB = VEX.NDS.128.66.0F38.WIG 00 /r */
   27381       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27382          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
   27383                     uses_vvvv, vbi, pfx, delta, "vpshufb", math_PSHUFB_XMM );
   27384          goto decode_success;
   27385       }
   27386       /* VPSHUFB r/m, rV, r ::: r = shuf(rV, r/m) */
   27387       /* VPSHUFB = VEX.NDS.256.66.0F38.WIG 00 /r */
   27388       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27389          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   27390                     uses_vvvv, vbi, pfx, delta, "vpshufb", math_PSHUFB_YMM );
   27391          goto decode_success;
   27392       }
   27393       break;
   27394 
   27395    case 0x01:
   27396    case 0x02:
   27397    case 0x03:
   27398       /* VPHADDW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 01 /r */
   27399       /* VPHADDD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 02 /r */
   27400       /* VPHADDSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 03 /r */
   27401       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27402          delta = dis_PHADD_128( vbi, pfx, delta, True/*isAvx*/, opc );
   27403          *uses_vvvv = True;
   27404          goto decode_success;
   27405       }
   27406       /* VPHADDW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 01 /r */
   27407       /* VPHADDD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 02 /r */
   27408       /* VPHADDSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 03 /r */
   27409       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27410          delta = dis_PHADD_256( vbi, pfx, delta, opc );
   27411          *uses_vvvv = True;
   27412          goto decode_success;
   27413       }
   27414       break;
   27415 
   27416    case 0x04:
   27417       /* VPMADDUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 04 /r */
   27418       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27419          delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
   27420                     uses_vvvv, vbi, pfx, delta, "vpmaddubsw",
   27421                     math_PMADDUBSW_128 );
   27422          goto decode_success;
   27423       }
   27424       /* VPMADDUBSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 04 /r */
   27425       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27426          delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
   27427                     uses_vvvv, vbi, pfx, delta, "vpmaddubsw",
   27428                     math_PMADDUBSW_256 );
   27429          goto decode_success;
   27430       }
   27431       break;
   27432 
   27433    case 0x05:
   27434    case 0x06:
   27435    case 0x07:
   27436       /* VPHSUBW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 05 /r */
   27437       /* VPHSUBD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 06 /r */
   27438       /* VPHSUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 07 /r */
   27439       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27440          delta = dis_PHADD_128( vbi, pfx, delta, True/*isAvx*/, opc );
   27441          *uses_vvvv = True;
   27442          goto decode_success;
   27443       }
   27444       /* VPHSUBW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 05 /r */
   27445       /* VPHSUBD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 06 /r */
   27446       /* VPHSUBSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 07 /r */
   27447       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   27448          delta = dis_PHADD_256( vbi, pfx, delta, opc );
   27449          *uses_vvvv = True;
   27450          goto decode_success;
   27451       }
   27452       break;
   27453 
   27454    case 0x08:
   27455    case 0x09:
   27456    case 0x0A:
   27457       /* VPSIGNB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 08 /r */
   27458       /* VPSIGNW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 09 /r */
   27459       /* VPSIGND xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 0A /r */
   27460       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   27461          IRTemp sV      = newTemp(Ity_V128);
   27462          IRTemp dV      = newTemp(Ity_V128);
   27463          IRTemp sHi, sLo, dHi, dLo;
   27464          sHi = sLo = dHi = dLo = IRTemp_INVALID;
   27465          HChar  ch      = '?';
   27466          Int    laneszB = 0;
   27467          UChar  modrm   = getUChar(delta);
   27468          UInt   rG      = gregOfRexRM(pfx,modrm);
   27469          UInt   rV      = getVexNvvvv(pfx);
   27470 
   27471          switch (opc) {
   27472             case 0x08: laneszB = 1; ch = 'b'; break;
   27473             case 0x09: laneszB = 2; ch = 'w'; break;
   27474             case 0x0A: laneszB = 4; ch = 'd'; break;
   27475             default: vassert(0);
   27476          }
   27477 
   27478          assign( dV, getXMMReg(rV) );
   27479 
   27480          if (epartIsReg(modrm)) {
   27481             UInt rE = eregOfRexRM(pfx,modrm);
   27482             assign( sV, getXMMReg(rE) );
   27483             delta += 1;
   27484             DIP("vpsign%c %s,%s,%s\n", ch, nameXMMReg(rE),
   27485                 nameXMMReg(rV), nameXMMReg(rG));
   27486          } else {
   27487             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   27488             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   27489             delta += alen;
   27490             DIP("vpsign%c %s,%s,%s\n", ch, dis_buf,
   27491                 nameXMMReg(rV), nameXMMReg(rG));
   27492          }
   27493 
   27494          breakupV128to64s( dV, &dHi, &dLo );
   27495          breakupV128to64s( sV, &sHi, &sLo );
   27496 
   27497          putYMMRegLoAndZU(
   27498             rG,
   27499             binop(Iop_64HLtoV128,
   27500                   dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
   27501                   dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
   27502             )
   27503          );
   27504          *uses_vvvv = True;
   27505          goto decode_success;
   27506       }
      /* VPSIGNB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 08 /r */
      /* VPSIGNW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 09 /r */
      /* VPSIGND ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 0A /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         /* 256-bit PSIGN: per-lane sign-adjust of the vvvv operand by the
            E operand, computed as four independent 64-bit chunks through
            dis_PSIGN_helper. */
         IRTemp sV      = newTemp(Ity_V256);
         IRTemp dV      = newTemp(Ity_V256);
         IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
         s3 = s2 = s1 = s0 = IRTemp_INVALID;
         d3 = d2 = d1 = d0 = IRTemp_INVALID;
         UChar  ch      = '?';
         Int    laneszB = 0;
         UChar  modrm   = getUChar(delta);
         UInt   rG      = gregOfRexRM(pfx,modrm);  /* destination register */
         UInt   rV      = getVexNvvvv(pfx);        /* source from VEX.vvvv */

         /* Opcode byte picks the lane width and the mnemonic suffix. */
         switch (opc) {
            case 0x08: laneszB = 1; ch = 'b'; break;
            case 0x09: laneszB = 2; ch = 'w'; break;
            case 0x0A: laneszB = 4; ch = 'd'; break;
            default: vassert(0);
         }

         assign( dV, getYMMReg(rV) );

         /* Second source is either a ymm register or a 256-bit memory
            operand; delta is advanced past the modrm (and any amode). */
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( sV, getYMMReg(rE) );
            delta += 1;
            DIP("vpsign%c %s,%s,%s\n", ch, nameYMMReg(rE),
                nameYMMReg(rV), nameYMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
            delta += alen;
            DIP("vpsign%c %s,%s,%s\n", ch, dis_buf,
                nameYMMReg(rV), nameYMMReg(rG));
         }

         /* Split both 256-bit values into 64-bit pieces, transform each
            piece, and reassemble the result high-to-low. */
         breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
         breakupV256to64s( sV, &s3, &s2, &s1, &s0 );

         putYMMReg(
            rG,
            binop( Iop_V128HLtoV256,
                   binop(Iop_64HLtoV128,
                         dis_PSIGN_helper( mkexpr(s3), mkexpr(d3), laneszB ),
                         dis_PSIGN_helper( mkexpr(s2), mkexpr(d2), laneszB )
                   ),
                   binop(Iop_64HLtoV128,
                         dis_PSIGN_helper( mkexpr(s1), mkexpr(d1), laneszB ),
                         dis_PSIGN_helper( mkexpr(s0), mkexpr(d0), laneszB )
                   )
            )
         );
         *uses_vvvv = True;  /* caller must not flag vvvv as unused */
         goto decode_success;
      }
      break;
   27565 
   case 0x0B:
      /* VPMULHRSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 0B /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         /* 128-bit form: compute on two 64-bit halves via
            dis_PMULHRSW_helper, then write the xmm result with the upper
            ymm half zeroed (putYMMRegLoAndZU == "lo, and zero upper"). */
         IRTemp sV      = newTemp(Ity_V128);
         IRTemp dV      = newTemp(Ity_V128);
         IRTemp sHi, sLo, dHi, dLo;
         sHi = sLo = dHi = dLo = IRTemp_INVALID;
         UChar  modrm   = getUChar(delta);
         UInt   rG      = gregOfRexRM(pfx,modrm);  /* destination register */
         UInt   rV      = getVexNvvvv(pfx);        /* source from VEX.vvvv */

         assign( dV, getXMMReg(rV) );

         /* Second source: xmm register or 128-bit memory operand. */
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( sV, getXMMReg(rE) );
            delta += 1;
            DIP("vpmulhrsw %s,%s,%s\n", nameXMMReg(rE),
                nameXMMReg(rV), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            delta += alen;
            DIP("vpmulhrsw %s,%s,%s\n", dis_buf,
                nameXMMReg(rV), nameXMMReg(rG));
         }

         breakupV128to64s( dV, &dHi, &dLo );
         breakupV128to64s( sV, &sHi, &sLo );

         putYMMRegLoAndZU(
            rG,
            binop(Iop_64HLtoV128,
                  dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
                  dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
            )
         );
         *uses_vvvv = True;  /* caller must not flag vvvv as unused */
         goto decode_success;
      }
      /* VPMULHRSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 0B /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         /* 256-bit form: same computation, done as four 64-bit chunks. */
         IRTemp sV      = newTemp(Ity_V256);
         IRTemp dV      = newTemp(Ity_V256);
         IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
         s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
         UChar  modrm   = getUChar(delta);
         UInt   rG      = gregOfRexRM(pfx,modrm);
         UInt   rV      = getVexNvvvv(pfx);

         assign( dV, getYMMReg(rV) );

         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( sV, getYMMReg(rE) );
            delta += 1;
            DIP("vpmulhrsw %s,%s,%s\n", nameYMMReg(rE),
                nameYMMReg(rV), nameYMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
            delta += alen;
            DIP("vpmulhrsw %s,%s,%s\n", dis_buf,
                nameYMMReg(rV), nameYMMReg(rG));
         }

         breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
         breakupV256to64s( sV, &s3, &s2, &s1, &s0 );

         putYMMReg(
            rG,
            binop(Iop_V128HLtoV256,
                  binop(Iop_64HLtoV128,
                        dis_PMULHRSW_helper( mkexpr(s3), mkexpr(d3) ),
                        dis_PMULHRSW_helper( mkexpr(s2), mkexpr(d2) ) ),
                  binop(Iop_64HLtoV128,
                        dis_PMULHRSW_helper( mkexpr(s1), mkexpr(d1) ),
                        dis_PMULHRSW_helper( mkexpr(s0), mkexpr(d0) ) )
            )
         );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   27650 
   case 0x0C:
      /* VPERMILPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 0C /r */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         /* Variable-control VPERMILPS: data comes from the vvvv register,
            the per-lane selector vector from the E operand.  The actual
            permute is built by math_PERMILPS_VAR_128. */
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);  /* destination */
         UInt   rV    = getVexNvvvv(pfx);         /* data source  */
         IRTemp ctrlV = newTemp(Ity_V128);        /* lane selectors */
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            DIP("vpermilps %s,%s,%s\n",
                nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
            assign(ctrlV, getXMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            DIP("vpermilps %s,%s,%s\n",
                dis_buf, nameXMMReg(rV), nameXMMReg(rG));
            assign(ctrlV, loadLE(Ity_V128, mkexpr(addr)));
         }
         IRTemp dataV = newTemp(Ity_V128);
         assign(dataV, getXMMReg(rV));
         IRTemp resV = math_PERMILPS_VAR_128(dataV, ctrlV);
         /* 128-bit VEX op: write low half, zero the upper ymm half. */
         putYMMRegLoAndZU(rG, mkexpr(resV));
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPERMILPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 0C /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
         /* 256-bit variable-control form; same shape as above. */
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp ctrlV = newTemp(Ity_V256);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            DIP("vpermilps %s,%s,%s\n",
                nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
            assign(ctrlV, getYMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            DIP("vpermilps %s,%s,%s\n",
                dis_buf, nameYMMReg(rV), nameYMMReg(rG));
            assign(ctrlV, loadLE(Ity_V256, mkexpr(addr)));
         }
         IRTemp dataV = newTemp(Ity_V256);
         assign(dataV, getYMMReg(rV));
         IRTemp resV = math_PERMILPS_VAR_256(dataV, ctrlV);
         putYMMReg(rG, mkexpr(resV));
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   27707 
   case 0x0D:
      /* VPERMILPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 0D /r */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         /* Variable-control VPERMILPD: data from vvvv, 64-bit lane
            selectors from the E operand; permute built by
            math_PERMILPD_VAR_128.  Mirrors the VPERMILPS case above. */
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);  /* destination */
         UInt   rV    = getVexNvvvv(pfx);         /* data source  */
         IRTemp ctrlV = newTemp(Ity_V128);        /* lane selectors */
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            DIP("vpermilpd %s,%s,%s\n",
                nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
            assign(ctrlV, getXMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            DIP("vpermilpd %s,%s,%s\n",
                dis_buf, nameXMMReg(rV), nameXMMReg(rG));
            assign(ctrlV, loadLE(Ity_V128, mkexpr(addr)));
         }
         IRTemp dataV = newTemp(Ity_V128);
         assign(dataV, getXMMReg(rV));
         IRTemp resV = math_PERMILPD_VAR_128(dataV, ctrlV);
         /* 128-bit VEX op: write low half, zero the upper ymm half. */
         putYMMRegLoAndZU(rG, mkexpr(resV));
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPERMILPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 0D /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
         /* 256-bit variable-control form; same shape as above. */
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp ctrlV = newTemp(Ity_V256);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            DIP("vpermilpd %s,%s,%s\n",
                nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
            assign(ctrlV, getYMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            DIP("vpermilpd %s,%s,%s\n",
                dis_buf, nameYMMReg(rV), nameYMMReg(rG));
            assign(ctrlV, loadLE(Ity_V256, mkexpr(addr)));
         }
         IRTemp dataV = newTemp(Ity_V256);
         assign(dataV, getYMMReg(rV));
         IRTemp resV = math_PERMILPD_VAR_256(dataV, ctrlV);
         putYMMReg(rG, mkexpr(resV));
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   27764 
   case 0x0E:
      /* VTESTPS: delegated entirely to the shared xTESTy helpers.  The
         last argument selects the variant: 32 for the PS (32-bit lane)
         form -- NOTE(review): presumed to be the lane/sign width the
         helper tests; confirm against dis_xTESTy_128's definition. */
      /* VTESTPS xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 0E /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_xTESTy_128( vbi, pfx, delta, True/*isAvx*/, 32 );
         goto decode_success;
      }
      /* VTESTPS ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 0E /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_xTESTy_256( vbi, pfx, delta, 32 );
         goto decode_success;
      }
      break;

   case 0x0F:
      /* VTESTPD: same helpers as VTESTPS, with 64 selecting the PD
         (64-bit lane) form. */
      /* VTESTPD xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 0F /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_xTESTy_128( vbi, pfx, delta, True/*isAvx*/, 64 );
         goto decode_success;
      }
      /* VTESTPD ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 0F /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_xTESTy_256( vbi, pfx, delta, 64 );
         goto decode_success;
      }
      break;
   27790 
   case 0x16:
      /* VPERMPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 16 /r */
      /* 256-bit only; reuses math_VPERMD since VPERMPS and VPERMD do the
         same 32-bit-element permutation, just FP- vs int-named. */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpermps", math_VPERMD );
         goto decode_success;
      }
      break;

   case 0x17:
      /* VPTEST: same xTESTy helpers as VTESTPS/PD; the final 0 selects
         the whole-vector (integer PTEST) variant rather than a 32/64-bit
         sign-lane test. */
      /* VPTEST xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 17 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_xTESTy_128( vbi, pfx, delta, True/*isAvx*/, 0 );
         goto decode_success;
      }
      /* VPTEST ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 17 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_xTESTy_256( vbi, pfx, delta, 0 );
         goto decode_success;
      }
      break;
   27813 
   case 0x18:
      /* VBROADCASTSS: four forms, split on L (128/256 destination) and
         on whether the source is memory or an xmm register.  In every
         form the 32-bit scalar is first doubled into an I64 with
         Iop_32HLto64, then that I64 is replicated to fill the vector. */
      /* VBROADCASTSS m32, xmm1 = VEX.128.66.0F38.WIG 18 /r */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/
          && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         DIP("vbroadcastss %s,%s\n", dis_buf, nameXMMReg(rG));
         IRTemp t32 = newTemp(Ity_I32);
         assign(t32, loadLE(Ity_I32, mkexpr(addr)));
         IRTemp t64 = newTemp(Ity_I64);
         /* t64 = t32:t32 (two copies of the scalar) */
         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
         IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
         putYMMRegLoAndZU(rG, res);  /* 128-bit dest: zero upper half */
         goto decode_success;
      }
      /* VBROADCASTSS m32, ymm1 = VEX.256.66.0F38.WIG 18 /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/
          && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         DIP("vbroadcastss %s,%s\n", dis_buf, nameYMMReg(rG));
         IRTemp t32 = newTemp(Ity_I32);
         assign(t32, loadLE(Ity_I32, mkexpr(addr)));
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
         /* Four copies of the I64 pair fill all 8 ymm lanes. */
         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
                                                  mkexpr(t64), mkexpr(t64));
         putYMMReg(rG, res);
         goto decode_success;
      }
      /* VBROADCASTSS xmm2, xmm1 = VEX.128.66.0F38.WIG 18 /r */
      /* Register-source form (AVX2): broadcast lane 0 of xmm2. */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/
          && epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         UInt  rE    = eregOfRexRM(pfx, modrm);
         DIP("vbroadcastss %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
         IRTemp t32 = newTemp(Ity_I32);
         assign(t32, getXMMRegLane32(rE, 0));
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
         IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
         putYMMRegLoAndZU(rG, res);
         delta++;  /* just the modrm byte; no amode */
         goto decode_success;
      }
      /* VBROADCASTSS xmm2, ymm1 = VEX.256.66.0F38.WIG 18 /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/
          && epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         UInt  rE    = eregOfRexRM(pfx, modrm);
         DIP("vbroadcastss %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
         IRTemp t32 = newTemp(Ity_I32);
         assign(t32, getXMMRegLane32(rE, 0));
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
                                                  mkexpr(t64), mkexpr(t64));
         putYMMReg(rG, res);
         delta++;
         goto decode_success;
      }
      break;
   27886 
   case 0x19:
      /* VBROADCASTSD: 256-bit destination only.  The 64-bit scalar is
         replicated into all four qword lanes with Iop_64x4toV256. */
      /* VBROADCASTSD m64, ymm1 = VEX.256.66.0F38.WIG 19 /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/
          && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         DIP("vbroadcastsd %s,%s\n", dis_buf, nameYMMReg(rG));
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, loadLE(Ity_I64, mkexpr(addr)));
         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
                                                  mkexpr(t64), mkexpr(t64));
         putYMMReg(rG, res);
         goto decode_success;
      }
      /* VBROADCASTSD xmm2, ymm1 = VEX.256.66.0F38.WIG 19 /r */
      /* Register-source form (AVX2): broadcast lane 0 of xmm2. */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/
          && epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         UInt  rE    = eregOfRexRM(pfx, modrm);
         DIP("vbroadcastsd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, getXMMRegLane64(rE, 0));
         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
                                                  mkexpr(t64), mkexpr(t64));
         putYMMReg(rG, res);
         delta++;  /* just the modrm byte; no amode */
         goto decode_success;
      }
      break;

   case 0x1A:
      /* VBROADCASTF128 m128, ymm1 = VEX.256.66.0F38.WIG 1A /r */
      /* Memory-source only: load 128 bits and duplicate into both
         halves of the ymm destination. */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/
          && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         DIP("vbroadcastf128 %s,%s\n", dis_buf, nameYMMReg(rG));
         IRTemp t128 = newTemp(Ity_V128);
         assign(t128, loadLE(Ity_V128, mkexpr(addr)));
         putYMMReg( rG, binop(Iop_V128HLtoV256, mkexpr(t128), mkexpr(t128)) );
         goto decode_success;
      }
      break;
   27938 
   case 0x1C:
      /* VPABSB/W/D: unary per-lane absolute value.  All six variants
         delegate to the shared E-to-G unary framework; the math_PABS_*
         callback (pap1/pap2/pap4 = 1-, 2-, 4-byte lanes) does the work. */
      /* VPABSB xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 1C /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary(
                    uses_vvvv, vbi, pfx, delta,
                    "vpabsb", math_PABS_XMM_pap1 );
         goto decode_success;
      }
      /* VPABSB ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 1C /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary(
                    uses_vvvv, vbi, pfx, delta,
                    "vpabsb", math_PABS_YMM_pap1 );
         goto decode_success;
      }
      break;

   case 0x1D:
      /* VPABSW xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 1D /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary(
                    uses_vvvv, vbi, pfx, delta,
                    "vpabsw", math_PABS_XMM_pap2 );
         goto decode_success;
      }
      /* VPABSW ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 1D /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary(
                    uses_vvvv, vbi, pfx, delta,
                    "vpabsw", math_PABS_YMM_pap2 );
         goto decode_success;
      }
      break;

   case 0x1E:
      /* VPABSD xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 1E /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary(
                    uses_vvvv, vbi, pfx, delta,
                    "vpabsd", math_PABS_XMM_pap4 );
         goto decode_success;
      }
      /* VPABSD ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 1E /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary(
                    uses_vvvv, vbi, pfx, delta,
                    "vpabsd", math_PABS_YMM_pap4 );
         goto decode_success;
      }
      break;
   27989 
   case 0x20:
      /* VPMOVSX* family (0x20..0x25): widening sign-extend moves.  Each
         variant delegates to a shared dis_PMOVxX*/dis_PMOVSX* helper;
         where the helper is shared with the zero-extend (VPMOVZX*)
         decode, xIsZ=False selects sign extension. */
      /* VPMOVSXBW xmm2/m64, xmm1 */
      /* VPMOVSXBW = VEX.128.66.0F38.WIG 20 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVxXBW_128( vbi, pfx, delta,
                                   True/*isAvx*/, False/*!xIsZ*/ );
         goto decode_success;
      }
      /* VPMOVSXBW xmm2/m128, ymm1 */
      /* VPMOVSXBW = VEX.256.66.0F38.WIG 20 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVxXBW_256( vbi, pfx, delta, False/*!xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x21:
      /* VPMOVSXBD xmm2/m32, xmm1 */
      /* VPMOVSXBD = VEX.128.66.0F38.WIG 21 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVxXBD_128( vbi, pfx, delta,
                                   True/*isAvx*/, False/*!xIsZ*/ );
         goto decode_success;
      }
      /* VPMOVSXBD xmm2/m64, ymm1 */
      /* VPMOVSXBD = VEX.256.66.0F38.WIG 21 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVxXBD_256( vbi, pfx, delta, False/*!xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x22:
      /* VPMOVSXBQ xmm2/m16, xmm1 */
      /* VPMOVSXBQ = VEX.128.66.0F38.WIG 22 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVSXBQ_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VPMOVSXBQ xmm2/m32, ymm1 */
      /* VPMOVSXBQ = VEX.256.66.0F38.WIG 22 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVSXBQ_256( vbi, pfx, delta );
         goto decode_success;
      }
      break;

   case 0x23:
      /* VPMOVSXWD xmm2/m64, xmm1 = VEX.128.66.0F38.WIG 23 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVxXWD_128( vbi, pfx, delta,
                                   True/*isAvx*/, False/*!xIsZ*/ );
         goto decode_success;
      }
      /* VPMOVSXWD xmm2/m128, ymm1 = VEX.256.66.0F38.WIG 23 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVxXWD_256( vbi, pfx, delta, False/*!xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x24:
      /* VPMOVSXWQ xmm2/m32, xmm1 = VEX.128.66.0F38.WIG 24 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVSXWQ_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VPMOVSXWQ xmm2/m64, ymm1 = VEX.256.66.0F38.WIG 24 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVSXWQ_256( vbi, pfx, delta );
         goto decode_success;
      }
      break;

   case 0x25:
      /* VPMOVSXDQ xmm2/m64, xmm1 = VEX.128.66.0F38.WIG 25 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
                                   True/*isAvx*/, False/*!xIsZ*/ );
         goto decode_success;
      }
      /* VPMOVSXDQ xmm2/m128, ymm1 = VEX.256.66.0F38.WIG 25 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVxXDQ_256( vbi, pfx, delta, False/*!xIsZ*/ );
         goto decode_success;
      }
      break;
   28077 
   case 0x28:
      /* VPMULDQ: signed 32x32->64 multiply of the even lanes; delegated
         to the NDS "complex" framework with a math_PMULDQ_* callback. */
      /* VPMULDQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 28 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vpmuldq", math_PMULDQ_128 );
         goto decode_success;
      }
      /* VPMULDQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 28 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vpmuldq", math_PMULDQ_256 );
         goto decode_success;
      }
      break;

   case 0x29:
      /* VPCMPEQQ: simple enough to map directly onto a single IR binop
         (CmpEQ64x2 / CmpEQ64x4) via the NDS "simple" framework. */
      /* VPCMPEQQ r/m, rV, r ::: r = rV `eq-by-64s` r/m */
      /* VPCMPEQQ = VEX.NDS.128.66.0F38.WIG 29 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpeqq", Iop_CmpEQ64x2 );
         goto decode_success;
      }
      /* VPCMPEQQ r/m, rV, r ::: r = rV `eq-by-64s` r/m */
      /* VPCMPEQQ = VEX.NDS.256.66.0F38.WIG 29 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpeqq", Iop_CmpEQ64x4 );
         goto decode_success;
      }
      break;
   28111 
   case 0x2A:
      /* VMOVNTDQA: non-temporal aligned load.  The non-temporal hint is
         not modelled -- it is translated as an ordinary load, but the
         architectural 16/32-byte alignment requirement is enforced by
         raising SIGSEGV on a misaligned address, before the load. */
      /* VMOVNTDQA m128, xmm1 = VEX.128.66.0F38.WIG 2A /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && !epartIsReg(getUChar(delta))) {
         UChar  modrm = getUChar(delta);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp tD    = newTemp(Ity_V128);
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         gen_SEGV_if_not_16_aligned(addr);
         assign(tD, loadLE(Ity_V128, mkexpr(addr)));
         DIP("vmovntdqa %s,%s\n", dis_buf, nameXMMReg(rD));
         putYMMRegLoAndZU(rD, mkexpr(tD));  /* 128-bit dest: zero upper */
         goto decode_success;
      }
      /* VMOVNTDQA m256, ymm1 = VEX.256.66.0F38.WIG 2A /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && !epartIsReg(getUChar(delta))) {
         UChar  modrm = getUChar(delta);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp tD    = newTemp(Ity_V256);
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         gen_SEGV_if_not_32_aligned(addr);
         assign(tD, loadLE(Ity_V256, mkexpr(addr)));
         DIP("vmovntdqa %s,%s\n", dis_buf, nameYMMReg(rD));
         putYMMReg(rD, mkexpr(tD));
         goto decode_success;
      }
      break;
   28142 
   case 0x2B:
      /* VPACKUSDW r/m, rV, r ::: r = QNarrowBin32Sto16Ux8(rV, r/m) */
      /* VPACKUSDW = VEX.NDS.128.66.0F38.WIG 2B /r */
      /* 128-bit form maps onto one IR narrowing op; swapArgs=True puts
         the operands in the order the IR op expects. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpackusdw",
                    Iop_QNarrowBin32Sto16Ux8, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPACKUSDW r/m, rV, r ::: r = QNarrowBin32Sto16Ux8(rV, r/m) */
      /* VPACKUSDW = VEX.NDS.256.66.0F38.WIG 2B /r */
      /* 256-bit form needs a helper (math_VPACKUSDW_YMM) because the
         pack operates within each 128-bit half independently. */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpackusdw",
                    math_VPACKUSDW_YMM );
         goto decode_success;
      }
      break;

   case 0x2C:
      /* VMASKMOVPS (load forms only -- note the !epartIsReg guard):
         conditional element-wise load through dis_VMASKMOV_load, with
         Ity_I32 selecting 32-bit (PS) mask granularity. */
      /* VMASKMOVPS m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 2C /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
                                    /*!isYMM*/False, Ity_I32 );
         goto decode_success;
      }
      /* VMASKMOVPS m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 2C /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
                                    /*isYMM*/True, Ity_I32 );
         goto decode_success;
      }
      break;

   case 0x2D:
      /* VMASKMOVPD (load forms only): as VMASKMOVPS but with Ity_I64
         selecting 64-bit (PD) mask granularity. */
      /* VMASKMOVPD m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 2D /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
                                    /*!isYMM*/False, Ity_I64 );
         goto decode_success;
      }
      /* VMASKMOVPD m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 2D /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
                                    /*isYMM*/True, Ity_I64 );
         goto decode_success;
      }
      break;
   28196 
   case 0x30:
      /* Cases 0x30..0x35: the VPMOVZX* family (packed zero-extending
         moves).  Most share helpers with the sign-extending VPMOVSX
         forms; True/*xIsZ*/ selects the zero-extending behaviour. */
      /* VPMOVZXBW xmm2/m64, xmm1 */
      /* VPMOVZXBW = VEX.128.66.0F38.WIG 30 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVxXBW_128( vbi, pfx, delta,
                                   True/*isAvx*/, True/*xIsZ*/ );
         goto decode_success;
      }
      /* VPMOVZXBW xmm2/m128, ymm1 */
      /* VPMOVZXBW = VEX.256.66.0F38.WIG 30 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVxXBW_256( vbi, pfx, delta, True/*xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x31:
      /* VPMOVZXBD xmm2/m32, xmm1 */
      /* VPMOVZXBD = VEX.128.66.0F38.WIG 31 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVxXBD_128( vbi, pfx, delta,
                                   True/*isAvx*/, True/*xIsZ*/ );
         goto decode_success;
      }
      /* VPMOVZXBD xmm2/m64, ymm1 */
      /* VPMOVZXBD = VEX.256.66.0F38.WIG 31 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVxXBD_256( vbi, pfx, delta, True/*xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x32:
      /* VPMOVZXBQ xmm2/m16, xmm1 */
      /* VPMOVZXBQ = VEX.128.66.0F38.WIG 32 /r */
      /* Byte->quad uses dedicated ZX helpers (no xIsZ parameter). */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVZXBQ_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VPMOVZXBQ xmm2/m32, ymm1 */
      /* VPMOVZXBQ = VEX.256.66.0F38.WIG 32 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVZXBQ_256( vbi, pfx, delta );
         goto decode_success;
      }
      break;

   case 0x33:
      /* VPMOVZXWD xmm2/m64, xmm1 */
      /* VPMOVZXWD = VEX.128.66.0F38.WIG 33 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVxXWD_128( vbi, pfx, delta,
                                   True/*isAvx*/, True/*xIsZ*/ );
         goto decode_success;
      }
      /* VPMOVZXWD xmm2/m128, ymm1 */
      /* VPMOVZXWD = VEX.256.66.0F38.WIG 33 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVxXWD_256( vbi, pfx, delta, True/*xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x34:
      /* VPMOVZXWQ xmm2/m32, xmm1 = VEX.128.66.0F38.WIG 34 /r */
      /* Word->quad also uses dedicated ZX helpers. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVZXWQ_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VPMOVZXWQ xmm2/m64, ymm1 = VEX.256.66.0F38.WIG 34 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVZXWQ_256( vbi, pfx, delta );
         goto decode_success;
      }
      break;

   case 0x35:
      /* VPMOVZXDQ xmm2/m64, xmm1 = VEX.128.66.0F38.WIG 35 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
                                   True/*isAvx*/, True/*xIsZ*/ );
         goto decode_success;
      }
      /* VPMOVZXDQ xmm2/m128, ymm1 = VEX.256.66.0F38.WIG 35 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVxXDQ_256( vbi, pfx, delta, True/*xIsZ*/ );
         goto decode_success;
      }
      break;
   28286 
   case 0x36:
      /* VPERMD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 36 /r */
      /* Note: only the 256-bit, VEX.W0 form is decoded. */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpermd", math_VPERMD );
         goto decode_success;
      }
      break;

   case 0x37:
      /* VPCMPGTQ r/m, rV, r ::: r = rV `>s-by-64s` r/m */
      /* VPCMPGTQ = VEX.NDS.128.66.0F38.WIG 37 /r */
      /* Signed 64-bit greater-than compare; maps directly onto the
         CmpGT64Sx2/x4 IR primops. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpgtq", Iop_CmpGT64Sx2 );
         goto decode_success;
      }
      /* VPCMPGTQ r/m, rV, r ::: r = rV `>s-by-64s` r/m */
      /* VPCMPGTQ = VEX.NDS.256.66.0F38.WIG 37 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpgtq", Iop_CmpGT64Sx4 );
         goto decode_success;
      }
      break;
   28313 
   case 0x38:
      /* Cases 0x38..0x40 all follow the same pattern: a simple
         three-operand NDS binary op, 128-bit (getVexL==0) using the
         narrow IR primop and 256-bit (getVexL==1) using the wide one. */
      /* VPMINSB r/m, rV, r ::: r = min-signed-8s(rV, r/m) */
      /* VPMINSB = VEX.NDS.128.66.0F38.WIG 38 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpminsb", Iop_Min8Sx16 );
         goto decode_success;
      }
      /* VPMINSB r/m, rV, r ::: r = min-signed-8s(rV, r/m) */
      /* VPMINSB = VEX.NDS.256.66.0F38.WIG 38 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpminsb", Iop_Min8Sx32 );
         goto decode_success;
      }
      break;

   case 0x39:
      /* VPMINSD r/m, rV, r ::: r = min-signed-32s(rV, r/m) */
      /* VPMINSD = VEX.NDS.128.66.0F38.WIG 39 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpminsd", Iop_Min32Sx4 );
         goto decode_success;
      }
      /* VPMINSD r/m, rV, r ::: r = min-signed-32s(rV, r/m) */
      /* VPMINSD = VEX.NDS.256.66.0F38.WIG 39 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpminsd", Iop_Min32Sx8 );
         goto decode_success;
      }
      break;

   case 0x3A:
      /* VPMINUW r/m, rV, r ::: r = min-unsigned-16s(rV, r/m) */
      /* VPMINUW = VEX.NDS.128.66.0F38.WIG 3A /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpminuw", Iop_Min16Ux8 );
         goto decode_success;
      }
      /* VPMINUW r/m, rV, r ::: r = min-unsigned-16s(rV, r/m) */
      /* VPMINUW = VEX.NDS.256.66.0F38.WIG 3A /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpminuw", Iop_Min16Ux16 );
         goto decode_success;
      }
      break;

   case 0x3B:
      /* VPMINUD r/m, rV, r ::: r = min-unsigned-32s(rV, r/m) */
      /* VPMINUD = VEX.NDS.128.66.0F38.WIG 3B /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpminud", Iop_Min32Ux4 );
         goto decode_success;
      }
      /* VPMINUD r/m, rV, r ::: r = min-unsigned-32s(rV, r/m) */
      /* VPMINUD = VEX.NDS.256.66.0F38.WIG 3B /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpminud", Iop_Min32Ux8 );
         goto decode_success;
      }
      break;

   case 0x3C:
      /* VPMAXSB r/m, rV, r ::: r = max-signed-8s(rV, r/m) */
      /* VPMAXSB = VEX.NDS.128.66.0F38.WIG 3C /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpmaxsb", Iop_Max8Sx16 );
         goto decode_success;
      }
      /* VPMAXSB r/m, rV, r ::: r = max-signed-8s(rV, r/m) */
      /* VPMAXSB = VEX.NDS.256.66.0F38.WIG 3C /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpmaxsb", Iop_Max8Sx32 );
         goto decode_success;
      }
      break;

   case 0x3D:
      /* VPMAXSD r/m, rV, r ::: r = max-signed-32s(rV, r/m) */
      /* VPMAXSD = VEX.NDS.128.66.0F38.WIG 3D /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpmaxsd", Iop_Max32Sx4 );
         goto decode_success;
      }
      /* VPMAXSD r/m, rV, r ::: r = max-signed-32s(rV, r/m) */
      /* VPMAXSD = VEX.NDS.256.66.0F38.WIG 3D /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpmaxsd", Iop_Max32Sx8 );
         goto decode_success;
      }
      break;

   case 0x3E:
      /* VPMAXUW r/m, rV, r ::: r = max-unsigned-16s(rV, r/m) */
      /* VPMAXUW = VEX.NDS.128.66.0F38.WIG 3E /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpmaxuw", Iop_Max16Ux8 );
         goto decode_success;
      }
      /* VPMAXUW r/m, rV, r ::: r = max-unsigned-16s(rV, r/m) */
      /* VPMAXUW = VEX.NDS.256.66.0F38.WIG 3E /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpmaxuw", Iop_Max16Ux16 );
         goto decode_success;
      }
      break;

   case 0x3F:
      /* VPMAXUD r/m, rV, r ::: r = max-unsigned-32s(rV, r/m) */
      /* VPMAXUD = VEX.NDS.128.66.0F38.WIG 3F /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpmaxud", Iop_Max32Ux4 );
         goto decode_success;
      }
      /* VPMAXUD r/m, rV, r ::: r = max-unsigned-32s(rV, r/m) */
      /* VPMAXUD = VEX.NDS.256.66.0F38.WIG 3F /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpmaxud", Iop_Max32Ux8 );
         goto decode_success;
      }
      break;

   case 0x40:
      /* VPMULLD r/m, rV, r ::: r = mul-32s(rV, r/m) */
      /* VPMULLD = VEX.NDS.128.66.0F38.WIG 40 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpmulld", Iop_Mul32x4 );
         goto decode_success;
      }
      /* VPMULLD r/m, rV, r ::: r = mul-32s(rV, r/m) */
      /* VPMULLD = VEX.NDS.256.66.0F38.WIG 40 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpmulld", Iop_Mul32x8 );
         goto decode_success;
      }
      break;
   28466 
   case 0x41:
      /* VPHMINPOSUW xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 41 /r */
      /* Only the 128-bit form is accepted here (no L=1 decode). */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PHMINPOSUW_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      break;
   28474 
   case 0x45:
      /* Cases 0x45..0x47: AVX2 variable-count shifts.  Unlike most
         arms in this switch, dis_AVX_var_shiftV_byE handles both the
         128- and 256-bit widths itself (selected via the getVexL
         argument), and *uses_vvvv is set explicitly here rather than
         by the helper. */
      /* VPSRLVD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 45 /r */
      /* VPSRLVD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 45 /r */
      if (have66noF2noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
         delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsrlvd",
                                         Iop_Shr32, 1==getVexL(pfx) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPSRLVQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W1 45 /r */
      /* VPSRLVQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W1 45 /r */
      if (have66noF2noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
         delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsrlvq",
                                         Iop_Shr64, 1==getVexL(pfx) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0x46:
      /* VPSRAVD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 46 /r */
      /* VPSRAVD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 46 /r */
      /* Only the W0 (dword) form exists here; there is no W1 arm. */
      if (have66noF2noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
         delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsravd",
                                         Iop_Sar32, 1==getVexL(pfx) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0x47:
      /* VPSLLVD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 47 /r */
      /* VPSLLVD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 47 /r */
      if (have66noF2noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
         delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsllvd",
                                         Iop_Shl32, 1==getVexL(pfx) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPSLLVQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W1 47 /r */
      /* VPSLLVQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W1 47 /r */
      if (have66noF2noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
         delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsllvq",
                                         Iop_Shl64, 1==getVexL(pfx) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   28523 
   case 0x58:
      /* VPBROADCASTD xmm2/m32, xmm1 = VEX.128.66.0F38.W0 58 /r */
      /* Broadcast a 32-bit scalar across all destination lanes.  The
         scalar comes from either XMM lane 0 (register form; note the
         delta++ to consume the modrm byte) or a 32-bit memory load. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0==getRexW(pfx)/*W0*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         IRTemp t32 = newTemp(Ity_I32);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta++;
            DIP("vpbroadcastd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
            assign(t32, getXMMRegLane32(rE, 0));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            DIP("vpbroadcastd %s,%s\n", dis_buf, nameXMMReg(rG));
            assign(t32, loadLE(Ity_I32, mkexpr(addr)));
         }
         /* Duplicate 32->64, then build V128 from two copies; the
            upper YMM half is zeroed by putYMMRegLoAndZU. */
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
         IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
         putYMMRegLoAndZU(rG, res);
         goto decode_success;
      }
      /* VPBROADCASTD xmm2/m32, ymm1 = VEX.256.66.0F38.W0 58 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 0==getRexW(pfx)/*W0*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         IRTemp t32 = newTemp(Ity_I32);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta++;
            DIP("vpbroadcastd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
            assign(t32, getXMMRegLane32(rE, 0));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            DIP("vpbroadcastd %s,%s\n", dis_buf, nameYMMReg(rG));
            assign(t32, loadLE(Ity_I32, mkexpr(addr)));
         }
         /* 256-bit form: four copies of the duplicated 64-bit value. */
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
                                                  mkexpr(t64), mkexpr(t64));
         putYMMReg(rG, res);
         goto decode_success;
      }
      break;

   case 0x59:
      /* VPBROADCASTQ xmm2/m64, xmm1 = VEX.128.66.0F38.W0 59 /r */
      /* As case 0x58 but broadcasting a 64-bit scalar. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0==getRexW(pfx)/*W0*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         IRTemp t64 = newTemp(Ity_I64);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta++;
            DIP("vpbroadcastq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
            assign(t64, getXMMRegLane64(rE, 0));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            DIP("vpbroadcastq %s,%s\n", dis_buf, nameXMMReg(rG));
            assign(t64, loadLE(Ity_I64, mkexpr(addr)));
         }
         IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
         putYMMRegLoAndZU(rG, res);
         goto decode_success;
      }
      /* VPBROADCASTQ xmm2/m64, ymm1 = VEX.256.66.0F38.W0 59 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 0==getRexW(pfx)/*W0*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         IRTemp t64 = newTemp(Ity_I64);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta++;
            DIP("vpbroadcastq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
            assign(t64, getXMMRegLane64(rE, 0));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            DIP("vpbroadcastq %s,%s\n", dis_buf, nameYMMReg(rG));
            assign(t64, loadLE(Ity_I64, mkexpr(addr)));
         }
         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
                                                  mkexpr(t64), mkexpr(t64));
         putYMMReg(rG, res);
         goto decode_success;
      }
      break;
   28619 
   28620    case 0x5A:
   28621       /* VBROADCASTI128 m128, ymm1 = VEX.256.66.0F38.WIG 5A /r */
   28622       if (have66noF2noF3(pfx)
   28623           && 1==getVexL(pfx)/*256*/
   28624           && !epartIsReg(getUChar(delta))) {
   28625          UChar modrm = getUChar(delta);
   28626          UInt  rG    = gregOfRexRM(pfx, modrm);
   28627          addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   28628          delta += alen;
   28629          DIP("vbroadcasti128 %s,%s\n", dis_buf, nameYMMReg(rG));
   28630          IRTemp t128 = newTemp(Ity_V128);
   28631          assign(t128, loadLE(Ity_V128, mkexpr(addr)));
   28632          putYMMReg( rG, binop(Iop_V128HLtoV256, mkexpr(t128), mkexpr(t128)) );
   28633          goto decode_success;
   28634       }
   28635       break;
   28636 
   28637    case 0x78:
   28638       /* VPBROADCASTB xmm2/m8, xmm1 = VEX.128.66.0F38.W0 78 /r */
   28639       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
   28640           && 0==getRexW(pfx)/*W0*/) {
   28641          UChar modrm = getUChar(delta);
   28642          UInt  rG    = gregOfRexRM(pfx, modrm);
   28643          IRTemp t8   = newTemp(Ity_I8);
   28644          if (epartIsReg(modrm)) {
   28645             UInt rE = eregOfRexRM(pfx, modrm);
   28646             DIP("vpbroadcastb %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   28647             assign(t8, unop(Iop_32to8, getXMMRegLane32(rE, 0)));
   28648          } else {
   28649             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   28650             delta += alen;
   28651             DIP("vpbroadcastb %s,%s\n", dis_buf, nameXMMReg(rG));
   28652             assign(t8, loadLE(Ity_I8, mkexpr(addr)));
   28653          }
   28654          IRTemp t16 = newTemp(Ity_I16);
   28655          assign(t16, binop(Iop_8HLto16, mkexpr(t8), mkexpr(t8)));
   28656          IRTemp t32 = newTemp(Ity_I32);
   28657          assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16)));
   28658          IRTemp t64 = newTemp(Ity_I64);
   28659          assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
   28660          IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
   28661          putYMMRegLoAndZU(rG, res);
   28662          goto decode_success;
   28663       }
   28664       /* VPBROADCASTB xmm2/m8, ymm1 = VEX.256.66.0F38.W0 78 /r */
   28665       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
   28666           && 0==getRexW(pfx)/*W0*/) {
   28667          UChar modrm = getUChar(delta);
   28668          UInt  rG    = gregOfRexRM(pfx, modrm);
   28669          IRTemp t8   = newTemp(Ity_I8);
   28670          if (epartIsReg(modrm)) {
   28671             UInt rE = eregOfRexRM(pfx, modrm);
   28672             DIP("vpbroadcastb %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
   28673             assign(t8, unop(Iop_32to8, getXMMRegLane32(rE, 0)));
   28674          } else {
   28675             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   28676             delta += alen;
   28677             DIP("vpbroadcastb %s,%s\n", dis_buf, nameYMMReg(rG));
   28678             assign(t8, loadLE(Ity_I8, mkexpr(addr)));
   28679          }
   28680          IRTemp t16 = newTemp(Ity_I16);
   28681          assign(t16, binop(Iop_8HLto16, mkexpr(t8), mkexpr(t8)));
   28682          IRTemp t32 = newTemp(Ity_I32);
   28683          assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16)));
   28684          IRTemp t64 = newTemp(Ity_I64);
   28685          assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
   28686          IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
   28687                                                   mkexpr(t64), mkexpr(t64));
   28688          putYMMReg(rG, res);
   28689          goto decode_success;
   28690       }
   28691       break;
   28692 
   28693    case 0x79:
   28694       /* VPBROADCASTW xmm2/m16, xmm1 = VEX.128.66.0F38.W0 79 /r */
   28695       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
   28696           && 0==getRexW(pfx)/*W0*/) {
   28697          UChar modrm = getUChar(delta);
   28698          UInt  rG    = gregOfRexRM(pfx, modrm);
   28699          IRTemp t16  = newTemp(Ity_I16);
   28700          if (epartIsReg(modrm)) {
   28701             UInt rE = eregOfRexRM(pfx, modrm);
   28702             DIP("vpbroadcastw %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   28703             assign(t16, unop(Iop_32to16, getXMMRegLane32(rE, 0)));
   28704          } else {
   28705             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   28706             delta += alen;
   28707             DIP("vpbroadcastw %s,%s\n", dis_buf, nameXMMReg(rG));
   28708             assign(t16, loadLE(Ity_I16, mkexpr(addr)));
   28709          }
   28710          IRTemp t32 = newTemp(Ity_I32);
   28711          assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16)));
   28712          IRTemp t64 = newTemp(Ity_I64);
   28713          assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
   28714          IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
   28715          putYMMRegLoAndZU(rG, res);
   28716          goto decode_success;
   28717       }
   28718       /* VPBROADCASTW xmm2/m16, ymm1 = VEX.256.66.0F38.W0 79 /r */
   28719       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
   28720           && 0==getRexW(pfx)/*W0*/) {
   28721          UChar modrm = getUChar(delta);
   28722          UInt  rG    = gregOfRexRM(pfx, modrm);
   28723          IRTemp t16  = newTemp(Ity_I16);
   28724          if (epartIsReg(modrm)) {
   28725             UInt rE = eregOfRexRM(pfx, modrm);
   28726             DIP("vpbroadcastw %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
   28727             assign(t16, unop(Iop_32to16, getXMMRegLane32(rE, 0)));
   28728          } else {
   28729             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   28730             delta += alen;
   28731             DIP("vpbroadcastw %s,%s\n", dis_buf, nameYMMReg(rG));
   28732             assign(t16, loadLE(Ity_I16, mkexpr(addr)));
   28733          }
   28734          IRTemp t32 = newTemp(Ity_I32);
   28735          assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16)));
   28736          IRTemp t64 = newTemp(Ity_I64);
   28737          assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
   28738          IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
   28739                                                   mkexpr(t64), mkexpr(t64));
   28740          putYMMReg(rG, res);
   28741          goto decode_success;
   28742       }
   28743       break;
   28744 
   case 0x8C:
      /* VPMASKMOV loads: VEX.W selects element size (W0 = dword
         VPMASKMOVD, W1 = qword VPMASKMOVQ); memory source only. */
      /* VPMASKMOVD m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 8C /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
                                    /*!isYMM*/False, Ity_I32 );
         goto decode_success;
      }
      /* VPMASKMOVD m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 8C /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
                                    /*isYMM*/True, Ity_I32 );
         goto decode_success;
      }
      /* VPMASKMOVQ m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W1 8C /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
                                    /*!isYMM*/False, Ity_I64 );
         goto decode_success;
      }
      /* VPMASKMOVQ m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W1 8C /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
                                    /*isYMM*/True, Ity_I64 );
         goto decode_success;
      }
      break;
   28775 
   case 0x90:
      /* VPGATHERD{D,Q}: dword-indexed gathers.  dis_VGATHER signals
         an encoding it cannot handle by returning delta unchanged, so
         success is detected via the delta != delta0 check rather than
         unconditionally. */
      /* VPGATHERDD xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W0 90 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdd",
                              /*!isYMM*/False, /*!isVM64x*/False, Ity_I32 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VPGATHERDD ymm2, vm32y, ymm1 = VEX.DDS.256.66.0F38.W0 90 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdd",
                              /*isYMM*/True, /*!isVM64x*/False, Ity_I32 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VPGATHERDQ xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W1 90 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdq",
                              /*!isYMM*/False, /*!isVM64x*/False, Ity_I64 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VPGATHERDQ ymm2, vm32x, ymm1 = VEX.DDS.256.66.0F38.W1 90 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdq",
                              /*isYMM*/True, /*!isVM64x*/False, Ity_I64 );
         if (delta != delta0)
            goto decode_success;
      }
      break;
   28814 
   case 0x91:
      /* AVX2 qword-indexed integer gathers.  All forms require a VSIB
         memory operand, hence the !epartIsReg guard.  dis_VGATHER
         returns delta unchanged when it declines the decode. */
      /* VPGATHERQD xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W0 91 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqd",
                              /*!isYMM*/False, /*isVM64x*/True, Ity_I32 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VPGATHERQD xmm2, vm64y, xmm1 = VEX.DDS.256.66.0F38.W0 91 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqd",
                              /*isYMM*/True, /*isVM64x*/True, Ity_I32 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VPGATHERQQ xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W1 91 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqq",
                              /*!isYMM*/False, /*isVM64x*/True, Ity_I64 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VPGATHERQQ ymm2, vm64y, ymm1 = VEX.DDS.256.66.0F38.W1 91 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqq",
                              /*isYMM*/True, /*isVM64x*/True, Ity_I64 );
         if (delta != delta0)
            goto decode_success;
      }
      break;
   28853 
   case 0x92:
      /* AVX2 dword-indexed FP gathers.  Same structure as case 0x90:
         VSIB memory operand required; an unchanged delta from
         dis_VGATHER means the sub-decoder declined. */
      /* VGATHERDPS xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W0 92 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdps",
                              /*!isYMM*/False, /*!isVM64x*/False, Ity_I32 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VGATHERDPS ymm2, vm32y, ymm1 = VEX.DDS.256.66.0F38.W0 92 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdps",
                              /*isYMM*/True, /*!isVM64x*/False, Ity_I32 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VGATHERDPD xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W1 92 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdpd",
                              /*!isYMM*/False, /*!isVM64x*/False, Ity_I64 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VGATHERDPD xmm2, vm32x, ymm1 = VEX.DDS.256.66.0F38.W1 92 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdpd",
                              /*isYMM*/True, /*!isVM64x*/False, Ity_I64 );
         if (delta != delta0)
            goto decode_success;
      }
      break;
   28892 
   case 0x93:
      /* AVX2 qword-indexed FP gathers.  VSIB memory operand required;
         an unchanged delta from dis_VGATHER means it declined. */
      /* VGATHERQPS xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W0 93 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqps",
                              /*!isYMM*/False, /*isVM64x*/True, Ity_I32 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VGATHERQPS xmm2, vm64y, xmm1 = VEX.DDS.256.66.0F38.W0 93 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqps",
                              /*isYMM*/True, /*isVM64x*/True, Ity_I32 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VGATHERQPD xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W1 93 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqpd",
                              /*!isYMM*/False, /*isVM64x*/True, Ity_I64 );
         if (delta != delta0)
            goto decode_success;
      }
      /* VGATHERQPD ymm2, vm64y, ymm1 = VEX.DDS.256.66.0F38.W1 93 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
         Long delta0 = delta;
         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqpd",
                              /*isYMM*/True, /*isVM64x*/True, Ity_I64 );
         if (delta != delta0)
            goto decode_success;
      }
      break;
   28931 
   case 0x96 ... 0x9F:
   case 0xA6 ... 0xAF:
   case 0xB6 ... 0xBF:
      /* VFMADDSUB132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 96 /r */
      /* VFMADDSUB132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 96 /r */
      /* VFMADDSUB132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 96 /r */
      /* VFMADDSUB132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 96 /r */
      /* VFMSUBADD132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 97 /r */
      /* VFMSUBADD132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 97 /r */
      /* VFMSUBADD132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 97 /r */
      /* VFMSUBADD132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 97 /r */
      /* VFMADD132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 98 /r */
      /* VFMADD132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 98 /r */
      /* VFMADD132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 98 /r */
      /* VFMADD132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 98 /r */
      /* VFMADD132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 99 /r */
      /* VFMADD132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 99 /r */
      /* VFMSUB132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 9A /r */
      /* VFMSUB132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 9A /r */
      /* VFMSUB132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 9A /r */
      /* VFMSUB132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 9A /r */
      /* VFMSUB132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 9B /r */
      /* VFMSUB132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 9B /r */
      /* VFNMADD132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 9C /r */
      /* VFNMADD132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 9C /r */
      /* VFNMADD132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 9C /r */
      /* VFNMADD132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 9C /r */
      /* VFNMADD132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 9D /r */
      /* VFNMADD132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 9D /r */
      /* VFNMSUB132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 9E /r */
      /* VFNMSUB132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 9E /r */
      /* VFNMSUB132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 9E /r */
      /* VFNMSUB132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 9E /r */
      /* VFNMSUB132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 9F /r */
      /* VFNMSUB132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 9F /r */
      /* VFMADDSUB213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 A6 /r */
      /* VFMADDSUB213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 A6 /r */
      /* VFMADDSUB213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 A6 /r */
      /* VFMADDSUB213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 A6 /r */
      /* VFMSUBADD213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 A7 /r */
      /* VFMSUBADD213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 A7 /r */
      /* VFMSUBADD213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 A7 /r */
      /* VFMSUBADD213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 A7 /r */
      /* VFMADD213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 A8 /r */
      /* VFMADD213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 A8 /r */
      /* VFMADD213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 A8 /r */
      /* VFMADD213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 A8 /r */
      /* VFMADD213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 A9 /r */
      /* VFMADD213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 A9 /r */
      /* VFMSUB213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 AA /r */
      /* VFMSUB213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 AA /r */
      /* VFMSUB213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 AA /r */
      /* VFMSUB213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 AA /r */
      /* VFMSUB213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 AB /r */
      /* VFMSUB213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 AB /r */
      /* VFNMADD213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 AC /r */
      /* VFNMADD213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 AC /r */
      /* VFNMADD213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 AC /r */
      /* VFNMADD213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 AC /r */
      /* VFNMADD213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 AD /r */
      /* VFNMADD213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 AD /r */
      /* VFNMSUB213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 AE /r */
      /* VFNMSUB213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 AE /r */
      /* VFNMSUB213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 AE /r */
      /* VFNMSUB213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 AE /r */
      /* VFNMSUB213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 AF /r */
      /* VFNMSUB213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 AF /r */
      /* VFMADDSUB231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 B6 /r */
      /* VFMADDSUB231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 B6 /r */
      /* VFMADDSUB231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 B6 /r */
      /* VFMADDSUB231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 B6 /r */
      /* VFMSUBADD231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 B7 /r */
      /* VFMSUBADD231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 B7 /r */
      /* VFMSUBADD231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 B7 /r */
      /* VFMSUBADD231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 B7 /r */
      /* VFMADD231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 B8 /r */
      /* VFMADD231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 B8 /r */
      /* VFMADD231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 B8 /r */
      /* VFMADD231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 B8 /r */
      /* VFMADD231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 B9 /r */
      /* VFMADD231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 B9 /r */
      /* VFMSUB231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 BA /r */
      /* VFMSUB231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 BA /r */
      /* VFMSUB231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 BA /r */
      /* VFMSUB231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 BA /r */
      /* VFMSUB231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 BB /r */
      /* VFMSUB231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 BB /r */
      /* VFNMADD231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 BC /r */
      /* VFNMADD231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 BC /r */
      /* VFNMADD231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 BC /r */
      /* VFNMADD231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 BC /r */
      /* VFNMADD231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 BD /r */
      /* VFNMADD231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 BD /r */
      /* VFNMSUB231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 BE /r */
      /* VFNMSUB231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 BE /r */
      /* VFNMSUB231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 BE /r */
      /* VFNMSUB231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 BE /r */
      /* VFNMSUB231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 BF /r */
      /* VFNMSUB231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 BF /r */
      /* The whole FMA family (132/213/231 x {ADD,SUB,NMADD,NMSUB,
         ADDSUB,SUBADD} x {PS,PD,SS,SD}) shares a single decode path:
         dis_FMA distinguishes the variants from the opcode byte and
         the VEX.W / VEX.L bits itself.  All forms use vvvv. */
      if (have66noF2noF3(pfx)) {
         delta = dis_FMA( vbi, pfx, delta, opc );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   29037 
   case 0xDB:
   case 0xDC:
   case 0xDD:
   case 0xDE:
   case 0xDF:
      /* VAESIMC xmm2/m128, xmm1 = VEX.128.66.0F38.WIG DB /r */
      /* VAESENC xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DC /r */
      /* VAESENCLAST xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DD /r */
      /* VAESDEC xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DE /r */
      /* VAESDECLAST xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DF /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AESx( vbi, pfx, delta, True/*isAvx*/, opc );
         /* VAESIMC (0xDB) is a two-operand form, so it's the only one
            that does not consume vvvv. */
         if (opc != 0xDB) *uses_vvvv = True;
         goto decode_success;
      }
      break;
   29054 
   case 0xF2:
      /* ANDN r/m32, r32b, r32a = VEX.NDS.LZ.0F38.W0 F2 /r */
      /* ANDN r/m64, r64b, r64a = VEX.NDS.LZ.0F38.W1 F2 /r */
      /* BMI1: G = ~vvvv & E.  Operand size is chosen by VEX.W. */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
         Int     size = getRexW(pfx) ? 8 : 4;
         IRType  ty   = szToITy(size);
         IRTemp  dst  = newTemp(ty);
         IRTemp  src1 = newTemp(ty);   /* first source: vvvv register */
         IRTemp  src2 = newTemp(ty);   /* second source: r/m operand */
         UChar   rm   = getUChar(delta);

         assign( src1, getIRegV(size,pfx) );
         if (epartIsReg(rm)) {
            assign( src2, getIRegE(size,pfx,rm) );
            DIP("andn %s,%s,%s\n", nameIRegE(size,pfx,rm),
                nameIRegV(size,pfx), nameIRegG(size,pfx,rm));
            delta++;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( src2, loadLE(ty, mkexpr(addr)) );
            DIP("andn %s,%s,%s\n", dis_buf, nameIRegV(size,pfx),
                nameIRegG(size,pfx,rm));
            delta += alen;
         }

         /* dst = ~src1 & src2 */
         assign( dst, binop( mkSizedOp(ty,Iop_And8),
                             unop( mkSizedOp(ty,Iop_Not8), mkexpr(src1) ),
                             mkexpr(src2) ) );
         putIRegG( size, pfx, rm, mkexpr(dst) );
         /* Flag thunk: ANDN op with the result in DEP1, DEP2 unused. */
         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
                                               ? AMD64G_CC_OP_ANDN64
                                               : AMD64G_CC_OP_ANDN32)) );
         stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   29093 
   case 0xF3:
      /* BMI1 group F3: the instruction is selected by the reg field of
         the modrm byte (/1 BLSR, /2 BLSMSK, /3 BLSI).  All are NDD
         forms: the destination is the vvvv register. */
      /* BLSI r/m32, r32 = VEX.NDD.LZ.0F38.W0 F3 /3 */
      /* BLSI r/m64, r64 = VEX.NDD.LZ.0F38.W1 F3 /3 */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/
          && !haveREX(pfx) && gregLO3ofRM(getUChar(delta)) == 3) {
         Int     size = getRexW(pfx) ? 8 : 4;
         IRType  ty   = szToITy(size);
         IRTemp  src  = newTemp(ty);
         IRTemp  dst  = newTemp(ty);
         UChar   rm   = getUChar(delta);

         if (epartIsReg(rm)) {
            assign( src, getIRegE(size,pfx,rm) );
            DIP("blsi %s,%s\n", nameIRegE(size,pfx,rm),
                nameIRegV(size,pfx));
            delta++;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( src, loadLE(ty, mkexpr(addr)) );
            DIP("blsi %s,%s\n", dis_buf, nameIRegV(size,pfx));
            delta += alen;
         }

         /* dst = (-src) & src: isolates the lowest set bit of src. */
         assign( dst, binop(mkSizedOp(ty,Iop_And8),
                            binop(mkSizedOp(ty,Iop_Sub8), mkU(ty, 0),
                                  mkexpr(src)), mkexpr(src)) );
         putIRegV( size, pfx, mkexpr(dst) );
         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
                                               ? AMD64G_CC_OP_BLSI64
                                               : AMD64G_CC_OP_BLSI32)) );
         stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
         stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(src))) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* BLSMSK r/m32, r32 = VEX.NDD.LZ.0F38.W0 F3 /2 */
      /* BLSMSK r/m64, r64 = VEX.NDD.LZ.0F38.W1 F3 /2 */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/
          && !haveREX(pfx) && gregLO3ofRM(getUChar(delta)) == 2) {
         Int     size = getRexW(pfx) ? 8 : 4;
         IRType  ty   = szToITy(size);
         IRTemp  src  = newTemp(ty);
         IRTemp  dst  = newTemp(ty);
         UChar   rm   = getUChar(delta);

         if (epartIsReg(rm)) {
            assign( src, getIRegE(size,pfx,rm) );
            DIP("blsmsk %s,%s\n", nameIRegE(size,pfx,rm),
                nameIRegV(size,pfx));
            delta++;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( src, loadLE(ty, mkexpr(addr)) );
            DIP("blsmsk %s,%s\n", dis_buf, nameIRegV(size,pfx));
            delta += alen;
         }

         /* dst = (src - 1) ^ src: mask up to and including the lowest
            set bit of src. */
         assign( dst, binop(mkSizedOp(ty,Iop_Xor8),
                            binop(mkSizedOp(ty,Iop_Sub8), mkexpr(src),
                                  mkU(ty, 1)), mkexpr(src)) );
         putIRegV( size, pfx, mkexpr(dst) );
         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
                                               ? AMD64G_CC_OP_BLSMSK64
                                               : AMD64G_CC_OP_BLSMSK32)) );
         stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
         stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(src))) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* BLSR r/m32, r32 = VEX.NDD.LZ.0F38.W0 F3 /1 */
      /* BLSR r/m64, r64 = VEX.NDD.LZ.0F38.W1 F3 /1 */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/
          && !haveREX(pfx) && gregLO3ofRM(getUChar(delta)) == 1) {
         Int     size = getRexW(pfx) ? 8 : 4;
         IRType  ty   = szToITy(size);
         IRTemp  src  = newTemp(ty);
         IRTemp  dst  = newTemp(ty);
         UChar   rm   = getUChar(delta);

         if (epartIsReg(rm)) {
            assign( src, getIRegE(size,pfx,rm) );
            DIP("blsr %s,%s\n", nameIRegE(size,pfx,rm),
                nameIRegV(size,pfx));
            delta++;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( src, loadLE(ty, mkexpr(addr)) );
            DIP("blsr %s,%s\n", dis_buf, nameIRegV(size,pfx));
            delta += alen;
         }

         /* dst = (src - 1) & src: clears the lowest set bit of src. */
         assign( dst, binop(mkSizedOp(ty,Iop_And8),
                            binop(mkSizedOp(ty,Iop_Sub8), mkexpr(src),
                                  mkU(ty, 1)), mkexpr(src)) );
         putIRegV( size, pfx, mkexpr(dst) );
         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
                                               ? AMD64G_CC_OP_BLSR64
                                               : AMD64G_CC_OP_BLSR32)) );
         stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
         stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(src))) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   29198 
   case 0xF5:
      /* Group F5 splits on the mandatory prefix: no prefix = BZHI
         (BMI2), F2 = PDEP, F3 = PEXT. */
      /* BZHI r32b, r/m32, r32a = VEX.NDS.LZ.0F38.W0 F5 /r */
      /* BZHI r64b, r/m64, r64a = VEX.NDS.LZ.0F38.W1 F5 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
         Int     size  = getRexW(pfx) ? 8 : 4;
         IRType  ty    = szToITy(size);
         IRTemp  dst   = newTemp(ty);
         IRTemp  src1  = newTemp(ty);         /* value to truncate (r/m) */
         IRTemp  src2  = newTemp(ty);         /* bit index source (vvvv) */
         IRTemp  start = newTemp(Ity_I8);     /* low 8 bits of src2 */
         IRTemp  cond  = newTemp(Ity_I1);     /* start < operand width? */
         UChar   rm    = getUChar(delta);

         assign( src2, getIRegV(size,pfx) );
         if (epartIsReg(rm)) {
            assign( src1, getIRegE(size,pfx,rm) );
            DIP("bzhi %s,%s,%s\n", nameIRegV(size,pfx),
                nameIRegE(size,pfx,rm), nameIRegG(size,pfx,rm));
            delta++;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( src1, loadLE(ty, mkexpr(addr)) );
            DIP("bzhi %s,%s,%s\n", nameIRegV(size,pfx), dis_buf,
                nameIRegG(size,pfx,rm));
            delta += alen;
         }

         assign( start, narrowTo( Ity_I8, mkexpr(src2) ) );
         assign( cond, binop(Iop_CmpLT32U,
                             unop(Iop_8Uto32, mkexpr(start)),
                             mkU32(8*size)) );
         /* if (start < opsize) {
               if (start == 0)
                  dst = 0;
               else
                  dst = (src1 << (opsize-start)) u>> (opsize-start);
            } else {
               dst = src1;
            } */
         /* The start == 0 special case avoids a shift by the full
            operand width, which would be an out-of-range shift. */
         assign( dst,
                 IRExpr_ITE(
                    mkexpr(cond),
                    IRExpr_ITE(
                       binop(Iop_CmpEQ8, mkexpr(start), mkU8(0)),
                       mkU(ty, 0),
                       binop(
                          mkSizedOp(ty,Iop_Shr8),
                          binop(
                             mkSizedOp(ty,Iop_Shl8),
                             mkexpr(src1),
                             binop(Iop_Sub8, mkU8(8*size), mkexpr(start))
                          ),
                          binop(Iop_Sub8, mkU8(8*size), mkexpr(start))
                       )
                    ),
                    mkexpr(src1)
                 )
               );
         putIRegG( size, pfx, rm, mkexpr(dst) );
         /* NOTE(review): BZHI deliberately(?) reuses the BLSR thunk
            opcodes, with DEP2 holding the widened (start < opsize)
            condition rather than a source value — presumably the flag
            helper derives CF from DEP2 in a way that matches BZHI's
            CF = (start > opsize-1) semantics.  Also assumes widenUto64
            accepts an Ity_I1 argument.  Confirm both against
            guest_amd64_helpers.c. */
         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
                                               ? AMD64G_CC_OP_BLSR64
                                               : AMD64G_CC_OP_BLSR32)) );
         stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
         stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(cond))) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* PDEP r/m32, r32b, r32a = VEX.NDS.LZ.F2.0F38.W0 F5 /r */
      /* PDEP r/m64, r64b, r64a = VEX.NDS.LZ.F2.0F38.W1 F5 /r */
      /* Parallel bit deposit: computed out-of-line by a clean helper. */
      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
         Int     size = getRexW(pfx) ? 8 : 4;
         IRType  ty   = szToITy(size);
         IRTemp  src  = newTemp(ty);
         IRTemp  mask = newTemp(ty);
         UChar   rm   = getUChar(delta);

         assign( src, getIRegV(size,pfx) );
         if (epartIsReg(rm)) {
            assign( mask, getIRegE(size,pfx,rm) );
            DIP("pdep %s,%s,%s\n", nameIRegE(size,pfx,rm),
                nameIRegV(size,pfx), nameIRegG(size,pfx,rm));
            delta++;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( mask, loadLE(ty, mkexpr(addr)) );
            DIP("pdep %s,%s,%s\n", dis_buf, nameIRegV(size,pfx),
                nameIRegG(size,pfx,rm));
            delta += alen;
         }

         IRExpr** args = mkIRExprVec_2( widenUto64(mkexpr(src)),
                                        widenUto64(mkexpr(mask)) );
         putIRegG( size, pfx, rm,
                   narrowTo(ty, mkIRExprCCall(Ity_I64, 0/*regparms*/,
                                              "amd64g_calculate_pdep",
                                              &amd64g_calculate_pdep, args)) );
         *uses_vvvv = True;
         /* Flags aren't modified.  */
         goto decode_success;
      }
      /* PEXT r/m32, r32b, r32a = VEX.NDS.LZ.F3.0F38.W0 F5 /r */
      /* PEXT r/m64, r64b, r64a = VEX.NDS.LZ.F3.0F38.W1 F5 /r */
      /* Parallel bit extract: also computed by a clean helper. */
      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
         Int     size = getRexW(pfx) ? 8 : 4;
         IRType  ty   = szToITy(size);
         IRTemp  src  = newTemp(ty);
         IRTemp  mask = newTemp(ty);
         UChar   rm   = getUChar(delta);

         assign( src, getIRegV(size,pfx) );
         if (epartIsReg(rm)) {
            assign( mask, getIRegE(size,pfx,rm) );
            DIP("pext %s,%s,%s\n", nameIRegE(size,pfx,rm),
                nameIRegV(size,pfx), nameIRegG(size,pfx,rm));
            delta++;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( mask, loadLE(ty, mkexpr(addr)) );
            DIP("pext %s,%s,%s\n", dis_buf, nameIRegV(size,pfx),
                nameIRegG(size,pfx,rm));
            delta += alen;
         }

         /* First mask off bits not set in mask, they are ignored
            and it should be fine if they contain undefined values.  */
         IRExpr* masked = binop(mkSizedOp(ty,Iop_And8),
                                mkexpr(src), mkexpr(mask));
         IRExpr** args = mkIRExprVec_2( widenUto64(masked),
                                        widenUto64(mkexpr(mask)) );
         putIRegG( size, pfx, rm,
                   narrowTo(ty, mkIRExprCCall(Ity_I64, 0/*regparms*/,
                                              "amd64g_calculate_pext",
                                              &amd64g_calculate_pext, args)) );
         *uses_vvvv = True;
         /* Flags aren't modified.  */
         goto decode_success;
      }
      break;
   29337 
   case 0xF6:
      /* MULX r/m32, r32b, r32a = VEX.NDD.LZ.F2.0F38.W0 F6 /r */
      /* MULX r/m64, r64b, r64a = VEX.NDD.LZ.F2.0F38.W1 F6 /r */
      /* BMI2: widening unsigned multiply of RDX/EDX (implicit) by the
         r/m operand; low half goes to vvvv, high half to G.  Unlike
         MUL, no flags are written. */
      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
         Int     size = getRexW(pfx) ? 8 : 4;
         IRType  ty   = szToITy(size);
         IRTemp  src1 = newTemp(ty);   /* implicit RDX/EDX operand */
         IRTemp  src2 = newTemp(ty);   /* explicit r/m operand */
         IRTemp  res  = newTemp(size == 8 ? Ity_I128 : Ity_I64);
         UChar   rm   = getUChar(delta);

         assign( src1, getIRegRDX(size) );
         if (epartIsReg(rm)) {
            assign( src2, getIRegE(size,pfx,rm) );
            DIP("mulx %s,%s,%s\n", nameIRegE(size,pfx,rm),
                nameIRegV(size,pfx), nameIRegG(size,pfx,rm));
            delta++;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( src2, loadLE(ty, mkexpr(addr)) );
            DIP("mulx %s,%s,%s\n", dis_buf, nameIRegV(size,pfx),
                nameIRegG(size,pfx,rm));
            delta += alen;
         }

         /* Double-width product, then split into halves. */
         assign( res, binop(size == 8 ? Iop_MullU64 : Iop_MullU32,
                            mkexpr(src1), mkexpr(src2)) );
         putIRegV( size, pfx,
                   unop(size == 8 ? Iop_128to64 : Iop_64to32, mkexpr(res)) );
         putIRegG( size, pfx, rm,
                   unop(size == 8 ? Iop_128HIto64 : Iop_64HIto32,
                        mkexpr(res)) );
         *uses_vvvv = True;
         /* Flags aren't modified.  */
         goto decode_success;
      }
      break;
   29375 
   29376    case 0xF7:
   29377       /* SARX r32b, r/m32, r32a = VEX.NDS.LZ.F3.0F38.W0 F7 /r */
   29378       /* SARX r64b, r/m64, r64a = VEX.NDS.LZ.F3.0F38.W1 F7 /r */
   29379       if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
   29380          delta = dis_SHIFTX( uses_vvvv, vbi, pfx, delta, "sarx", Iop_Sar8 );
   29381          goto decode_success;
   29382       }
   29383       /* SHLX r32b, r/m32, r32a = VEX.NDS.LZ.66.0F38.W0 F7 /r */
   29384       /* SHLX r64b, r/m64, r64a = VEX.NDS.LZ.66.0F38.W1 F7 /r */
   29385       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
   29386          delta = dis_SHIFTX( uses_vvvv, vbi, pfx, delta, "shlx", Iop_Shl8 );
   29387          goto decode_success;
   29388       }
   29389       /* SHRX r32b, r/m32, r32a = VEX.NDS.LZ.F2.0F38.W0 F7 /r */
   29390       /* SHRX r64b, r/m64, r64a = VEX.NDS.LZ.F2.0F38.W1 F7 /r */
   29391       if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
   29392          delta = dis_SHIFTX( uses_vvvv, vbi, pfx, delta, "shrx", Iop_Shr8 );
   29393          goto decode_success;
   29394       }
   29395       /* BEXTR r32b, r/m32, r32a = VEX.NDS.LZ.0F38.W0 F7 /r */
   29396       /* BEXTR r64b, r/m64, r64a = VEX.NDS.LZ.0F38.W1 F7 /r */
   29397       if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
   29398          Int     size  = getRexW(pfx) ? 8 : 4;
   29399          IRType  ty    = szToITy(size);
   29400          IRTemp  dst   = newTemp(ty);
   29401          IRTemp  src1  = newTemp(ty);
   29402          IRTemp  src2  = newTemp(ty);
   29403          IRTemp  stle  = newTemp(Ity_I16);
   29404          IRTemp  start = newTemp(Ity_I8);
   29405          IRTemp  len   = newTemp(Ity_I8);
   29406          UChar   rm    = getUChar(delta);
   29407 
   29408          assign( src2, getIRegV(size,pfx) );
   29409          if (epartIsReg(rm)) {
   29410             assign( src1, getIRegE(size,pfx,rm) );
   29411             DIP("bextr %s,%s,%s\n", nameIRegV(size,pfx),
   29412                 nameIRegE(size,pfx,rm), nameIRegG(size,pfx,rm));
   29413             delta++;
   29414          } else {
   29415             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   29416             assign( src1, loadLE(ty, mkexpr(addr)) );
   29417             DIP("bextr %s,%s,%s\n", nameIRegV(size,pfx), dis_buf,
   29418                 nameIRegG(size,pfx,rm));
   29419             delta += alen;
   29420          }
   29421 
   29422          assign( stle, narrowTo( Ity_I16, mkexpr(src2) ) );
   29423          assign( start, unop( Iop_16to8, mkexpr(stle) ) );
   29424          assign( len, unop( Iop_16HIto8, mkexpr(stle) ) );
   29425          /* if (start+len < opsize) {
   29426                if (len != 0)
   29427                   dst = (src1 << (opsize-start-len)) u>> (opsize-len);
   29428                else
   29429                   dst = 0;
   29430             } else {
   29431                if (start < opsize)
   29432                   dst = src1 u>> start;
   29433                else
   29434                   dst = 0;
   29435             } */
   29436          assign( dst,
   29437                  IRExpr_ITE(
   29438                     binop(Iop_CmpLT32U,
   29439                           binop(Iop_Add32,
   29440                                 unop(Iop_8Uto32, mkexpr(start)),
   29441                                 unop(Iop_8Uto32, mkexpr(len))),
   29442                           mkU32(8*size)),
   29443                     IRExpr_ITE(
   29444                        binop(Iop_CmpEQ8, mkexpr(len), mkU8(0)),
   29445                        mkU(ty, 0),
   29446                        binop(mkSizedOp(ty,Iop_Shr8),
   29447                              binop(mkSizedOp(ty,Iop_Shl8), mkexpr(src1),
   29448                                    binop(Iop_Sub8,
   29449                                          binop(Iop_Sub8, mkU8(8*size),
   29450                                                mkexpr(start)),
   29451                                          mkexpr(len))),
   29452                              binop(Iop_Sub8, mkU8(8*size),
   29453                                    mkexpr(len)))
   29454                     ),
   29455                     IRExpr_ITE(
   29456                        binop(Iop_CmpLT32U,
   29457                              unop(Iop_8Uto32, mkexpr(start)),
   29458                              mkU32(8*size)),
   29459                        binop(mkSizedOp(ty,Iop_Shr8), mkexpr(src1),
   29460                              mkexpr(start)),
   29461                        mkU(ty, 0)
   29462                     )
   29463                  )
   29464                );
   29465          putIRegG( size, pfx, rm, mkexpr(dst) );
   29466          stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
   29467                                                ? AMD64G_CC_OP_ANDN64
   29468                                                : AMD64G_CC_OP_ANDN32)) );
   29469          stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
   29470          stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
   29471          *uses_vvvv = True;
   29472          goto decode_success;
   29473       }
   29474       break;
   29475 
   29476    default:
   29477       break;
   29478 
   29479    }
   29480 
   29481   //decode_failure:
   29482    return deltaIN;
   29483 
   29484   decode_success:
   29485    return delta;
   29486 }
   29487 
   29488 
   29489 /*------------------------------------------------------------*/
   29490 /*---                                                      ---*/
   29491 /*--- Top-level post-escape decoders: dis_ESC_0F3A__VEX    ---*/
   29492 /*---                                                      ---*/
   29493 /*------------------------------------------------------------*/
   29494 
   29495 static IRTemp math_VPERMILPS_128 ( IRTemp sV, UInt imm8 )
   29496 {
   29497    vassert(imm8 < 256);
   29498    IRTemp s3, s2, s1, s0;
   29499    s3 = s2 = s1 = s0 = IRTemp_INVALID;
   29500    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   29501 #  define SEL(_nn) (((_nn)==0) ? s0 : ((_nn)==1) ? s1 \
   29502                                     : ((_nn)==2) ? s2 : s3)
   29503    IRTemp res = newTemp(Ity_V128);
   29504    assign(res, mkV128from32s( SEL((imm8 >> 6) & 3),
   29505                               SEL((imm8 >> 4) & 3),
   29506                               SEL((imm8 >> 2) & 3),
   29507                               SEL((imm8 >> 0) & 3) ));
   29508 #  undef SEL
   29509    return res;
   29510 }
   29511 
   29512 __attribute__((noinline))
   29513 static
   29514 Long dis_ESC_0F3A__VEX (
   29515         /*MB_OUT*/DisResult* dres,
   29516         /*OUT*/   Bool*      uses_vvvv,
   29517         Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
   29518         Bool         resteerCisOk,
   29519         void*        callback_opaque,
   29520         VexArchInfo* archinfo,
   29521         VexAbiInfo*  vbi,
   29522         Prefix pfx, Int sz, Long deltaIN
   29523      )
   29524 {
   29525    IRTemp addr  = IRTemp_INVALID;
   29526    Int    alen  = 0;
   29527    HChar  dis_buf[50];
   29528    Long   delta = deltaIN;
   29529    UChar  opc   = getUChar(delta);
   29530    delta++;
   29531    *uses_vvvv = False;
   29532 
   29533    switch (opc) {
   29534 
   29535    case 0x00:
   29536    case 0x01:
   29537       /* VPERMQ imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.W1 00 /r ib */
   29538       /* VPERMPD imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.W1 01 /r ib */
   29539       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
   29540           && 1==getRexW(pfx)/*W1*/) {
   29541          UChar  modrm = getUChar(delta);
   29542          UInt   imm8  = 0;
   29543          UInt   rG    = gregOfRexRM(pfx, modrm);
   29544          IRTemp sV    = newTemp(Ity_V256);
   29545          const HChar *name  = opc == 0 ? "vpermq" : "vpermpd";
   29546          if (epartIsReg(modrm)) {
   29547             UInt rE = eregOfRexRM(pfx, modrm);
   29548             delta += 1;
   29549             imm8 = getUChar(delta);
   29550             DIP("%s $%u,%s,%s\n",
   29551                 name, imm8, nameYMMReg(rE), nameYMMReg(rG));
   29552             assign(sV, getYMMReg(rE));
   29553          } else {
   29554             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   29555             delta += alen;
   29556             imm8 = getUChar(delta);
   29557             DIP("%s $%u,%s,%s\n",
   29558                 name, imm8, dis_buf, nameYMMReg(rG));
   29559             assign(sV, loadLE(Ity_V256, mkexpr(addr)));
   29560          }
   29561          delta++;
   29562          IRTemp s[4];
   29563          s[3] = s[2] = s[1] = s[0] = IRTemp_INVALID;
   29564          breakupV256to64s(sV, &s[3], &s[2], &s[1], &s[0]);
   29565          IRTemp dV = newTemp(Ity_V256);
   29566          assign(dV, IRExpr_Qop(Iop_64x4toV256,
   29567                                mkexpr(s[(imm8 >> 6) & 3]),
   29568                                mkexpr(s[(imm8 >> 4) & 3]),
   29569                                mkexpr(s[(imm8 >> 2) & 3]),
   29570                                mkexpr(s[(imm8 >> 0) & 3])));
   29571          putYMMReg(rG, mkexpr(dV));
   29572          goto decode_success;
   29573       }
   29574       break;
   29575 
   29576    case 0x02:
   29577       /* VPBLENDD imm8, xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W0 02 /r ib */
   29578       if (have66noF2noF3(pfx)
   29579           && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
   29580          UChar  modrm = getUChar(delta);
   29581          UInt   imm8  = 0;
   29582          UInt   rG    = gregOfRexRM(pfx, modrm);
   29583          UInt   rV    = getVexNvvvv(pfx);
   29584          IRTemp sV    = newTemp(Ity_V128);
   29585          IRTemp dV    = newTemp(Ity_V128);
   29586          UInt   i;
   29587          IRTemp s[4], d[4];
   29588          assign(sV, getXMMReg(rV));
   29589          if (epartIsReg(modrm)) {
   29590             UInt rE = eregOfRexRM(pfx, modrm);
   29591             delta += 1;
   29592             imm8 = getUChar(delta);
   29593             DIP("vpblendd $%u,%s,%s,%s\n",
   29594                 imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
   29595             assign(dV, getXMMReg(rE));
   29596          } else {
   29597             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   29598             delta += alen;
   29599             imm8 = getUChar(delta);
   29600             DIP("vpblendd $%u,%s,%s,%s\n",
   29601                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
   29602             assign(dV, loadLE(Ity_V128, mkexpr(addr)));
   29603          }
   29604          delta++;
   29605          for (i = 0; i < 4; i++) {
   29606             s[i] = IRTemp_INVALID;
   29607             d[i] = IRTemp_INVALID;
   29608          }
   29609          breakupV128to32s( sV, &s[3], &s[2], &s[1], &s[0] );
   29610          breakupV128to32s( dV, &d[3], &d[2], &d[1], &d[0] );
   29611          for (i = 0; i < 4; i++)
   29612             putYMMRegLane32(rG, i, mkexpr((imm8 & (1<<i)) ? d[i] : s[i]));
   29613          putYMMRegLane128(rG, 1, mkV128(0));
   29614          *uses_vvvv = True;
   29615          goto decode_success;
   29616       }
   29617       /* VPBLENDD imm8, ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F3A.W0 02 /r ib */
   29618       if (have66noF2noF3(pfx)
   29619           && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
   29620          UChar  modrm = getUChar(delta);
   29621          UInt   imm8  = 0;
   29622          UInt   rG    = gregOfRexRM(pfx, modrm);
   29623          UInt   rV    = getVexNvvvv(pfx);
   29624          IRTemp sV    = newTemp(Ity_V256);
   29625          IRTemp dV    = newTemp(Ity_V256);
   29626          UInt   i;
   29627          IRTemp s[8], d[8];
   29628          assign(sV, getYMMReg(rV));
   29629          if (epartIsReg(modrm)) {
   29630             UInt rE = eregOfRexRM(pfx, modrm);
   29631             delta += 1;
   29632             imm8 = getUChar(delta);
   29633             DIP("vpblendd $%u,%s,%s,%s\n",
   29634                 imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
   29635             assign(dV, getYMMReg(rE));
   29636          } else {
   29637             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   29638             delta += alen;
   29639             imm8 = getUChar(delta);
   29640             DIP("vpblendd $%u,%s,%s,%s\n",
   29641                 imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
   29642             assign(dV, loadLE(Ity_V256, mkexpr(addr)));
   29643          }
   29644          delta++;
   29645          for (i = 0; i < 8; i++) {
   29646             s[i] = IRTemp_INVALID;
   29647             d[i] = IRTemp_INVALID;
   29648          }
   29649          breakupV256to32s( sV, &s[7], &s[6], &s[5], &s[4],
   29650                                &s[3], &s[2], &s[1], &s[0] );
   29651          breakupV256to32s( dV, &d[7], &d[6], &d[5], &d[4],
   29652                                &d[3], &d[2], &d[1], &d[0] );
   29653          for (i = 0; i < 8; i++)
   29654             putYMMRegLane32(rG, i, mkexpr((imm8 & (1<<i)) ? d[i] : s[i]));
   29655          *uses_vvvv = True;
   29656          goto decode_success;
   29657       }
   29658       break;
   29659 
   29660    case 0x04:
   29661       /* VPERMILPS imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.WIG 04 /r ib */
   29662       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   29663          UChar  modrm = getUChar(delta);
   29664          UInt   imm8  = 0;
   29665          UInt   rG    = gregOfRexRM(pfx, modrm);
   29666          IRTemp sV    = newTemp(Ity_V256);
   29667          if (epartIsReg(modrm)) {
   29668             UInt rE = eregOfRexRM(pfx, modrm);
   29669             delta += 1;
   29670             imm8 = getUChar(delta);
   29671             DIP("vpermilps $%u,%s,%s\n",
   29672                 imm8, nameYMMReg(rE), nameYMMReg(rG));
   29673             assign(sV, getYMMReg(rE));
   29674          } else {
   29675             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   29676             delta += alen;
   29677             imm8 = getUChar(delta);
   29678             DIP("vpermilps $%u,%s,%s\n",
   29679                 imm8, dis_buf, nameYMMReg(rG));
   29680             assign(sV, loadLE(Ity_V256, mkexpr(addr)));
   29681          }
   29682          delta++;
   29683          IRTemp  sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   29684          breakupV256toV128s( sV, &sVhi, &sVlo );
   29685          IRTemp  dVhi = math_VPERMILPS_128( sVhi, imm8 );
   29686          IRTemp  dVlo = math_VPERMILPS_128( sVlo, imm8 );
   29687          IRExpr* res  = binop(Iop_V128HLtoV256, mkexpr(dVhi), mkexpr(dVlo));
   29688          putYMMReg(rG, res);
   29689          goto decode_success;
   29690       }
   29691       /* VPERMILPS imm8, xmm2/m128, xmm1 = VEX.128.66.0F3A.WIG 04 /r ib */
   29692       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   29693          UChar  modrm = getUChar(delta);
   29694          UInt   imm8  = 0;
   29695          UInt   rG    = gregOfRexRM(pfx, modrm);
   29696          IRTemp sV    = newTemp(Ity_V128);
   29697          if (epartIsReg(modrm)) {
   29698             UInt rE = eregOfRexRM(pfx, modrm);
   29699             delta += 1;
   29700             imm8 = getUChar(delta);
   29701             DIP("vpermilps $%u,%s,%s\n",
   29702                 imm8, nameXMMReg(rE), nameXMMReg(rG));
   29703             assign(sV, getXMMReg(rE));
   29704          } else {
   29705             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   29706             delta += alen;
   29707             imm8 = getUChar(delta);
   29708             DIP("vpermilps $%u,%s,%s\n",
   29709                 imm8, dis_buf, nameXMMReg(rG));
   29710             assign(sV, loadLE(Ity_V128, mkexpr(addr)));
   29711          }
   29712          delta++;
   29713          putYMMRegLoAndZU(rG, mkexpr ( math_VPERMILPS_128 ( sV, imm8 ) ) );
   29714          goto decode_success;
   29715       }
   29716       break;
   29717 
   29718    case 0x05:
   29719       /* VPERMILPD imm8, xmm2/m128, xmm1 = VEX.128.66.0F3A.WIG 05 /r ib */
   29720       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   29721          UChar  modrm = getUChar(delta);
   29722          UInt   imm8  = 0;
   29723          UInt   rG    = gregOfRexRM(pfx, modrm);
   29724          IRTemp sV    = newTemp(Ity_V128);
   29725          if (epartIsReg(modrm)) {
   29726             UInt rE = eregOfRexRM(pfx, modrm);
   29727             delta += 1;
   29728             imm8 = getUChar(delta);
   29729             DIP("vpermilpd $%u,%s,%s\n",
   29730                 imm8, nameXMMReg(rE), nameXMMReg(rG));
   29731             assign(sV, getXMMReg(rE));
   29732          } else {
   29733             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   29734             delta += alen;
   29735             imm8 = getUChar(delta);
   29736             DIP("vpermilpd $%u,%s,%s\n",
   29737                 imm8, dis_buf, nameXMMReg(rG));
   29738             assign(sV, loadLE(Ity_V128, mkexpr(addr)));
   29739          }
   29740          delta++;
   29741          IRTemp s1 = newTemp(Ity_I64);
   29742          IRTemp s0 = newTemp(Ity_I64);
   29743          assign(s1, unop(Iop_V128HIto64, mkexpr(sV)));
   29744          assign(s0, unop(Iop_V128to64,   mkexpr(sV)));
   29745          IRTemp dV = newTemp(Ity_V128);
   29746          assign(dV, binop(Iop_64HLtoV128,
   29747                                mkexpr((imm8 & (1<<1)) ? s1 : s0),
   29748                                mkexpr((imm8 & (1<<0)) ? s1 : s0)));
   29749          putYMMRegLoAndZU(rG, mkexpr(dV));
   29750          goto decode_success;
   29751       }
   29752       /* VPERMILPD imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.WIG 05 /r ib */
   29753       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   29754          UChar  modrm = getUChar(delta);
   29755          UInt   imm8  = 0;
   29756          UInt   rG    = gregOfRexRM(pfx, modrm);
   29757          IRTemp sV    = newTemp(Ity_V256);
   29758          if (epartIsReg(modrm)) {
   29759             UInt rE = eregOfRexRM(pfx, modrm);
   29760             delta += 1;
   29761             imm8 = getUChar(delta);
   29762             DIP("vpermilpd $%u,%s,%s\n",
   29763                 imm8, nameYMMReg(rE), nameYMMReg(rG));
   29764             assign(sV, getYMMReg(rE));
   29765          } else {
   29766             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   29767             delta += alen;
   29768             imm8 = getUChar(delta);
   29769             DIP("vpermilpd $%u,%s,%s\n",
   29770                 imm8, dis_buf, nameYMMReg(rG));
   29771             assign(sV, loadLE(Ity_V256, mkexpr(addr)));
   29772          }
   29773          delta++;
   29774          IRTemp s3, s2, s1, s0;
   29775          s3 = s2 = s1 = s0 = IRTemp_INVALID;
   29776          breakupV256to64s(sV, &s3, &s2, &s1, &s0);
   29777          IRTemp dV = newTemp(Ity_V256);
   29778          assign(dV, IRExpr_Qop(Iop_64x4toV256,
   29779                                mkexpr((imm8 & (1<<3)) ? s3 : s2),
   29780                                mkexpr((imm8 & (1<<2)) ? s3 : s2),
   29781                                mkexpr((imm8 & (1<<1)) ? s1 : s0),
   29782                                mkexpr((imm8 & (1<<0)) ? s1 : s0)));
   29783          putYMMReg(rG, mkexpr(dV));
   29784          goto decode_success;
   29785       }
   29786       break;
   29787 
   29788    case 0x06:
   29789       /* VPERM2F128 imm8, ymm3/m256, ymm2, ymm1 = VEX.NDS.66.0F3A.W0 06 /r ib */
   29790       if (have66noF2noF3(pfx)
   29791           && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
   29792          UChar  modrm = getUChar(delta);
   29793          UInt   imm8  = 0;
   29794          UInt   rG    = gregOfRexRM(pfx, modrm);
   29795          UInt   rV    = getVexNvvvv(pfx);
   29796          IRTemp s00   = newTemp(Ity_V128);
   29797          IRTemp s01   = newTemp(Ity_V128);
   29798          IRTemp s10   = newTemp(Ity_V128);
   29799          IRTemp s11   = newTemp(Ity_V128);
   29800          assign(s00, getYMMRegLane128(rV, 0));
   29801          assign(s01, getYMMRegLane128(rV, 1));
   29802          if (epartIsReg(modrm)) {
   29803             UInt rE = eregOfRexRM(pfx, modrm);
   29804             delta += 1;
   29805             imm8 = getUChar(delta);
   29806             DIP("vperm2f128 $%u,%s,%s,%s\n",
   29807                 imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
   29808             assign(s10, getYMMRegLane128(rE, 0));
   29809             assign(s11, getYMMRegLane128(rE, 1));
   29810          } else {
   29811             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   29812             delta += alen;
   29813             imm8 = getUChar(delta);
   29814             DIP("vperm2f128 $%u,%s,%s,%s\n",
   29815                 imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
   29816             assign(s10, loadLE(Ity_V128, binop(Iop_Add64,
   29817                                                mkexpr(addr), mkU64(0))));
   29818             assign(s11, loadLE(Ity_V128, binop(Iop_Add64,
   29819                                                mkexpr(addr), mkU64(16))));
   29820          }
   29821          delta++;
   29822 #        define SEL(_nn) (((_nn)==0) ? s00 : ((_nn)==1) ? s01 \
   29823                                            : ((_nn)==2) ? s10 : s11)
   29824          putYMMRegLane128(rG, 0, mkexpr(SEL((imm8 >> 0) & 3)));
   29825          putYMMRegLane128(rG, 1, mkexpr(SEL((imm8 >> 4) & 3)));
   29826 #        undef SEL
   29827          if (imm8 & (1<<3)) putYMMRegLane128(rG, 0, mkV128(0));
   29828          if (imm8 & (1<<7)) putYMMRegLane128(rG, 1, mkV128(0));
   29829          *uses_vvvv = True;
   29830          goto decode_success;
   29831       }
   29832       break;
   29833 
   29834    case 0x08:
   29835       /* VROUNDPS imm8, xmm2/m128, xmm1 */
   29836       /* VROUNDPS = VEX.NDS.128.66.0F3A.WIG 08 ib */
   29837       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   29838          UChar  modrm = getUChar(delta);
   29839          UInt   rG    = gregOfRexRM(pfx, modrm);
   29840          IRTemp src   = newTemp(Ity_V128);
   29841          IRTemp s0    = IRTemp_INVALID;
   29842          IRTemp s1    = IRTemp_INVALID;
   29843          IRTemp s2    = IRTemp_INVALID;
   29844          IRTemp s3    = IRTemp_INVALID;
   29845          IRTemp rm    = newTemp(Ity_I32);
   29846          Int    imm   = 0;
   29847 
   29848          modrm = getUChar(delta);
   29849 
   29850          if (epartIsReg(modrm)) {
   29851             UInt rE = eregOfRexRM(pfx, modrm);
   29852             assign( src, getXMMReg( rE ) );
   29853             imm = getUChar(delta+1);
   29854             if (imm & ~15) break;
   29855             delta += 1+1;
   29856             DIP( "vroundps $%d,%s,%s\n", imm, nameXMMReg(rE), nameXMMReg(rG) );
   29857          } else {
   29858             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   29859             assign( src, loadLE(Ity_V128, mkexpr(addr) ) );
   29860             imm = getUChar(delta+alen);
   29861             if (imm & ~15) break;
   29862             delta += alen+1;
   29863             DIP( "vroundps $%d,%s,%s\n", imm, dis_buf, nameXMMReg(rG) );
   29864          }
   29865 
   29866          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   29867             that encoding is the same as the encoding for IRRoundingMode,
   29868             we can use that value directly in the IR as a rounding
   29869             mode. */
   29870          assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
   29871 
   29872          breakupV128to32s( src, &s3, &s2, &s1, &s0 );
   29873          putYMMRegLane128( rG, 1, mkV128(0) );
   29874 #        define CVT(s) binop(Iop_RoundF32toInt, mkexpr(rm), \
   29875                              unop(Iop_ReinterpI32asF32, mkexpr(s)))
   29876          putYMMRegLane32F( rG, 3, CVT(s3) );
   29877          putYMMRegLane32F( rG, 2, CVT(s2) );
   29878          putYMMRegLane32F( rG, 1, CVT(s1) );
   29879          putYMMRegLane32F( rG, 0, CVT(s0) );
   29880 #        undef CVT
   29881          goto decode_success;
   29882       }
   29883       /* VROUNDPS imm8, ymm2/m256, ymm1 */
   29884       /* VROUNDPS = VEX.NDS.256.66.0F3A.WIG 08 ib */
   29885       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   29886          UChar  modrm = getUChar(delta);
   29887          UInt   rG    = gregOfRexRM(pfx, modrm);
   29888          IRTemp src   = newTemp(Ity_V256);
   29889          IRTemp s0    = IRTemp_INVALID;
   29890          IRTemp s1    = IRTemp_INVALID;
   29891          IRTemp s2    = IRTemp_INVALID;
   29892          IRTemp s3    = IRTemp_INVALID;
   29893          IRTemp s4    = IRTemp_INVALID;
   29894          IRTemp s5    = IRTemp_INVALID;
   29895          IRTemp s6    = IRTemp_INVALID;
   29896          IRTemp s7    = IRTemp_INVALID;
   29897          IRTemp rm    = newTemp(Ity_I32);
   29898          Int    imm   = 0;
   29899 
   29900          modrm = getUChar(delta);
   29901 
   29902          if (epartIsReg(modrm)) {
   29903             UInt rE = eregOfRexRM(pfx, modrm);
   29904             assign( src, getYMMReg( rE ) );
   29905             imm = getUChar(delta+1);
   29906             if (imm & ~15) break;
   29907             delta += 1+1;
   29908             DIP( "vroundps $%d,%s,%s\n", imm, nameYMMReg(rE), nameYMMReg(rG) );
   29909          } else {
   29910             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   29911             assign( src, loadLE(Ity_V256, mkexpr(addr) ) );
   29912             imm = getUChar(delta+alen);
   29913             if (imm & ~15) break;
   29914             delta += alen+1;
   29915             DIP( "vroundps $%d,%s,%s\n", imm, dis_buf, nameYMMReg(rG) );
   29916          }
   29917 
   29918          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   29919             that encoding is the same as the encoding for IRRoundingMode,
   29920             we can use that value directly in the IR as a rounding
   29921             mode. */
   29922          assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
   29923 
   29924          breakupV256to32s( src, &s7, &s6, &s5, &s4, &s3, &s2, &s1, &s0 );
   29925 #        define CVT(s) binop(Iop_RoundF32toInt, mkexpr(rm), \
   29926                              unop(Iop_ReinterpI32asF32, mkexpr(s)))
   29927          putYMMRegLane32F( rG, 7, CVT(s7) );
   29928          putYMMRegLane32F( rG, 6, CVT(s6) );
   29929          putYMMRegLane32F( rG, 5, CVT(s5) );
   29930          putYMMRegLane32F( rG, 4, CVT(s4) );
   29931          putYMMRegLane32F( rG, 3, CVT(s3) );
   29932          putYMMRegLane32F( rG, 2, CVT(s2) );
   29933          putYMMRegLane32F( rG, 1, CVT(s1) );
   29934          putYMMRegLane32F( rG, 0, CVT(s0) );
   29935 #        undef CVT
   29936          goto decode_success;
   29937       }
   29938 
   29939    case 0x09:
   29940       /* VROUNDPD imm8, xmm2/m128, xmm1 */
   29941       /* VROUNDPD = VEX.NDS.128.66.0F3A.WIG 09 ib */
   29942       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   29943          UChar  modrm = getUChar(delta);
   29944          UInt   rG    = gregOfRexRM(pfx, modrm);
   29945          IRTemp src   = newTemp(Ity_V128);
   29946          IRTemp s0    = IRTemp_INVALID;
   29947          IRTemp s1    = IRTemp_INVALID;
   29948          IRTemp rm    = newTemp(Ity_I32);
   29949          Int    imm   = 0;
   29950 
   29951          modrm = getUChar(delta);
   29952 
   29953          if (epartIsReg(modrm)) {
   29954             UInt rE = eregOfRexRM(pfx, modrm);
   29955             assign( src, getXMMReg( rE ) );
   29956             imm = getUChar(delta+1);
   29957             if (imm & ~15) break;
   29958             delta += 1+1;
   29959             DIP( "vroundpd $%d,%s,%s\n", imm, nameXMMReg(rE), nameXMMReg(rG) );
   29960          } else {
   29961             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   29962             assign( src, loadLE(Ity_V128, mkexpr(addr) ) );
   29963             imm = getUChar(delta+alen);
   29964             if (imm & ~15) break;
   29965             delta += alen+1;
   29966             DIP( "vroundpd $%d,%s,%s\n", imm, dis_buf, nameXMMReg(rG) );
   29967          }
   29968 
   29969          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   29970             that encoding is the same as the encoding for IRRoundingMode,
   29971             we can use that value directly in the IR as a rounding
   29972             mode. */
   29973          assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
   29974 
   29975          breakupV128to64s( src, &s1, &s0 );
   29976          putYMMRegLane128( rG, 1, mkV128(0) );
   29977 #        define CVT(s) binop(Iop_RoundF64toInt, mkexpr(rm), \
   29978                              unop(Iop_ReinterpI64asF64, mkexpr(s)))
   29979          putYMMRegLane64F( rG, 1, CVT(s1) );
   29980          putYMMRegLane64F( rG, 0, CVT(s0) );
   29981 #        undef CVT
   29982          goto decode_success;
   29983       }
   29984       /* VROUNDPD imm8, ymm2/m256, ymm1 */
   29985       /* VROUNDPD = VEX.NDS.256.66.0F3A.WIG 09 ib */
   29986       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   29987          UChar  modrm = getUChar(delta);
   29988          UInt   rG    = gregOfRexRM(pfx, modrm);
   29989          IRTemp src   = newTemp(Ity_V256);
   29990          IRTemp s0    = IRTemp_INVALID;
   29991          IRTemp s1    = IRTemp_INVALID;
   29992          IRTemp s2    = IRTemp_INVALID;
   29993          IRTemp s3    = IRTemp_INVALID;
   29994          IRTemp rm    = newTemp(Ity_I32);
   29995          Int    imm   = 0;
   29996 
   29997          modrm = getUChar(delta);
   29998 
   29999          if (epartIsReg(modrm)) {
   30000             UInt rE = eregOfRexRM(pfx, modrm);
   30001             assign( src, getYMMReg( rE ) );
   30002             imm = getUChar(delta+1);
   30003             if (imm & ~15) break;
   30004             delta += 1+1;
   30005             DIP( "vroundpd $%d,%s,%s\n", imm, nameYMMReg(rE), nameYMMReg(rG) );
   30006          } else {
   30007             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30008             assign( src, loadLE(Ity_V256, mkexpr(addr) ) );
   30009             imm = getUChar(delta+alen);
   30010             if (imm & ~15) break;
   30011             delta += alen+1;
   30012             DIP( "vroundps $%d,%s,%s\n", imm, dis_buf, nameYMMReg(rG) );
   30013          }
   30014 
   30015          /* (imm & 3) contains an Intel-encoded rounding mode.  Because
   30016             that encoding is the same as the encoding for IRRoundingMode,
   30017             we can use that value directly in the IR as a rounding
   30018             mode. */
   30019          assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
   30020 
   30021          breakupV256to64s( src, &s3, &s2, &s1, &s0 );
   30022 #        define CVT(s) binop(Iop_RoundF64toInt, mkexpr(rm), \
   30023                              unop(Iop_ReinterpI64asF64, mkexpr(s)))
   30024          putYMMRegLane64F( rG, 3, CVT(s3) );
   30025          putYMMRegLane64F( rG, 2, CVT(s2) );
   30026          putYMMRegLane64F( rG, 1, CVT(s1) );
   30027          putYMMRegLane64F( rG, 0, CVT(s0) );
   30028 #        undef CVT
   30029          goto decode_success;
   30030       }
   30031 
   case 0x0A:
   case 0x0B:
      /* VROUNDSS imm8, xmm3/m32, xmm2, xmm1 */
      /* VROUNDSS = VEX.NDS.128.66.0F3A.WIG 0A ib */
      /* VROUNDSD imm8, xmm3/m64, xmm2, xmm1 */
      /* VROUNDSD = VEX.NDS.128.66.0F3A.WIG 0B ib */
      /* Round the scalar F32/F64 in the low lane of the E operand and
         write it to lane 0 of xmm1.  The remaining lanes come from the
         V operand (xmm2) and the upper 128 bits of the YMM are zeroed. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         Bool   isD   = opc == 0x0B;   /* 0x0B is the F64 (SD) variant */
         IRTemp src   = newTemp(isD ? Ity_F64 : Ity_F32);
         IRTemp res   = newTemp(isD ? Ity_F64 : Ity_F32);
         Int    imm   = 0;

         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            assign( src,
                    isD ? getXMMRegLane64F(rE, 0) : getXMMRegLane32F(rE, 0) );
            imm = getUChar(delta+1);
            /* Only imm8[3:0] is defined; otherwise give up on decoding. */
            if (imm & ~15) break;
            delta += 1+1;
            DIP( "vrounds%c $%d,%s,%s,%s\n",
                 isD ? 'd' : 's',
                 imm, nameXMMReg( rE ), nameXMMReg( rV ), nameXMMReg( rG ) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
            imm = getUChar(delta+alen);
            if (imm & ~15) break;
            delta += alen+1;
            DIP( "vrounds%c $%d,%s,%s,%s\n",
                 isD ? 'd' : 's',
                 imm, dis_buf, nameXMMReg( rV ), nameXMMReg( rG ) );
         }

         /* (imm & 3) contains an Intel-encoded rounding mode.  Because
            that encoding is the same as the encoding for IRRoundingMode,
            we can use that value directly in the IR as a rounding
            mode. */
         assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
                           (imm & 4) ? get_sse_roundingmode()
                                     : mkU32(imm & 3),
                           mkexpr(src)) );

         if (isD)
            putXMMRegLane64F( rG, 0, mkexpr(res) );
         else {
            putXMMRegLane32F( rG, 0, mkexpr(res) );
            /* for the SS form, lane 1 (bits 63:32) comes from V */
            putXMMRegLane32F( rG, 1, getXMMRegLane32F( rV, 1 ) );
         }
         /* upper 64 bits of the XMM result always come from V */
         putXMMRegLane64F( rG, 1, getXMMRegLane64F( rV, 1 ) );
         /* VEX semantics: zero the upper 128 bits of the YMM register */
         putYMMRegLane128( rG, 1, mkV128(0) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   30089 
   case 0x0C:
      /* VBLENDPS imm8, ymm3/m256, ymm2, ymm1 */
      /* VBLENDPS = VEX.NDS.256.66.0F3A.WIG 0C /r ib */
      /* Per-32-bit-lane select between the E and V operands, controlled
         by the corresponding imm8 bit (see math_BLENDPS_256). */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar  modrm = getUChar(delta);
         UInt   imm8;
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp sV    = newTemp(Ity_V256);
         IRTemp sE    = newTemp(Ity_V256);
         assign ( sV, getYMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            imm8 = getUChar(delta);
            DIP("vblendps $%u,%s,%s,%s\n",
                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
            assign(sE, getYMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += alen;
            imm8 = getUChar(delta);
            DIP("vblendps $%u,%s,%s,%s\n",
                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
            assign(sE, loadLE(Ity_V256, mkexpr(addr)));
         }
         delta++;   /* skip the imm8 byte */
         putYMMReg( rG,
                    mkexpr( math_BLENDPS_256( sE, sV, imm8) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VBLENDPS imm8, xmm3/m128, xmm2, xmm1 */
      /* VBLENDPS = VEX.NDS.128.66.0F3A.WIG 0C /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   imm8;
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp sV    = newTemp(Ity_V128);
         IRTemp sE    = newTemp(Ity_V128);
         assign ( sV, getXMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            imm8 = getUChar(delta);
            DIP("vblendps $%u,%s,%s,%s\n",
                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
            assign(sE, getXMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += alen;
            imm8 = getUChar(delta);
            DIP("vblendps $%u,%s,%s,%s\n",
                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
            assign(sE, loadLE(Ity_V128, mkexpr(addr)));
         }
         delta++;   /* skip the imm8 byte */
         /* 128-bit form: write the low lane and zero the upper 128 bits */
         putYMMRegLoAndZU( rG,
                           mkexpr( math_BLENDPS_128( sE, sV, imm8) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   30154 
   case 0x0D:
      /* VBLENDPD imm8, ymm3/m256, ymm2, ymm1 */
      /* VBLENDPD = VEX.NDS.256.66.0F3A.WIG 0D /r ib */
      /* Per-64-bit-lane select between the E and V operands, controlled
         by the corresponding imm8 bit (see math_BLENDPD_256). */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar  modrm = getUChar(delta);
         UInt   imm8;
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp sV    = newTemp(Ity_V256);
         IRTemp sE    = newTemp(Ity_V256);
         assign ( sV, getYMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            imm8 = getUChar(delta);
            DIP("vblendpd $%u,%s,%s,%s\n",
                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
            assign(sE, getYMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += alen;
            imm8 = getUChar(delta);
            DIP("vblendpd $%u,%s,%s,%s\n",
                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
            assign(sE, loadLE(Ity_V256, mkexpr(addr)));
         }
         delta++;   /* skip the imm8 byte */
         putYMMReg( rG,
                    mkexpr( math_BLENDPD_256( sE, sV, imm8) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VBLENDPD imm8, xmm3/m128, xmm2, xmm1 */
      /* VBLENDPD = VEX.NDS.128.66.0F3A.WIG 0D /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   imm8;
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp sV    = newTemp(Ity_V128);
         IRTemp sE    = newTemp(Ity_V128);
         assign ( sV, getXMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            imm8 = getUChar(delta);
            DIP("vblendpd $%u,%s,%s,%s\n",
                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
            assign(sE, getXMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += alen;
            imm8 = getUChar(delta);
            DIP("vblendpd $%u,%s,%s,%s\n",
                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
            assign(sE, loadLE(Ity_V128, mkexpr(addr)));
         }
         delta++;   /* skip the imm8 byte */
         /* 128-bit form: write the low lane and zero the upper 128 bits */
         putYMMRegLoAndZU( rG,
                           mkexpr( math_BLENDPD_128( sE, sV, imm8) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   30219 
   case 0x0E:
      /* VPBLENDW imm8, xmm3/m128, xmm2, xmm1 */
      /* VPBLENDW = VEX.NDS.128.66.0F3A.WIG 0E /r ib */
      /* Per-16-bit-lane select between the E and V operands, controlled
         by the corresponding imm8 bit (see math_PBLENDW_128). */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   imm8;
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp sV    = newTemp(Ity_V128);
         IRTemp sE    = newTemp(Ity_V128);
         assign ( sV, getXMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            imm8 = getUChar(delta);
            DIP("vpblendw $%u,%s,%s,%s\n",
                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
            assign(sE, getXMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += alen;
            imm8 = getUChar(delta);
            DIP("vpblendw $%u,%s,%s,%s\n",
                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
            assign(sE, loadLE(Ity_V128, mkexpr(addr)));
         }
         delta++;   /* skip the imm8 byte */
         /* 128-bit form: write the low lane and zero the upper 128 bits */
         putYMMRegLoAndZU( rG,
                           mkexpr( math_PBLENDW_128( sE, sV, imm8) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPBLENDW imm8, ymm3/m256, ymm2, ymm1 */
      /* VPBLENDW = VEX.NDS.256.66.0F3A.WIG 0E /r ib */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar  modrm = getUChar(delta);
         UInt   imm8;
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp sV    = newTemp(Ity_V256);
         IRTemp sE    = newTemp(Ity_V256);
         IRTemp sVhi, sVlo, sEhi, sElo;
         sVhi = sVlo = sEhi = sElo = IRTemp_INVALID;
         assign ( sV, getYMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            imm8 = getUChar(delta);
            DIP("vpblendw $%u,%s,%s,%s\n",
                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
            assign(sE, getYMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += alen;
            imm8 = getUChar(delta);
            DIP("vpblendw $%u,%s,%s,%s\n",
                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
            assign(sE, loadLE(Ity_V256, mkexpr(addr)));
         }
         delta++;   /* skip the imm8 byte */
         /* 256-bit form: the same 8-bit word-select is applied to each
            128-bit half independently. */
         breakupV256toV128s( sV, &sVhi, &sVlo );
         breakupV256toV128s( sE, &sEhi, &sElo );
         putYMMReg( rG, binop( Iop_V128HLtoV256,
                               mkexpr( math_PBLENDW_128( sEhi, sVhi, imm8) ),
                               mkexpr( math_PBLENDW_128( sElo, sVlo, imm8) ) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   30289 
   30290    case 0x0F:
   30291       /* VPALIGNR imm8, xmm3/m128, xmm2, xmm1 */
   30292       /* VPALIGNR = VEX.NDS.128.66.0F3A.WIG 0F /r ib */
   30293       if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
   30294          UChar  modrm = getUChar(delta);
   30295          UInt   rG    = gregOfRexRM(pfx, modrm);
   30296          UInt   rV    = getVexNvvvv(pfx);
   30297          IRTemp sV    = newTemp(Ity_V128);
   30298          IRTemp dV    = newTemp(Ity_V128);
   30299          UInt   imm8;
   30300 
   30301          assign( dV, getXMMReg(rV) );
   30302 
   30303          if ( epartIsReg( modrm ) ) {
   30304             UInt   rE = eregOfRexRM(pfx, modrm);
   30305             assign( sV, getXMMReg(rE) );
   30306             imm8 = getUChar(delta+1);
   30307             delta += 1+1;
   30308             DIP("vpalignr $%d,%s,%s,%s\n", imm8, nameXMMReg(rE),
   30309                                            nameXMMReg(rV), nameXMMReg(rG));
   30310          } else {
   30311             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30312             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   30313             imm8 = getUChar(delta+alen);
   30314             delta += alen+1;
   30315             DIP("vpalignr $%d,%s,%s,%s\n", imm8, dis_buf,
   30316                                            nameXMMReg(rV), nameXMMReg(rG));
   30317          }
   30318 
   30319          IRTemp res = math_PALIGNR_XMM( sV, dV, imm8 );
   30320          putYMMRegLoAndZU( rG, mkexpr(res) );
   30321          *uses_vvvv = True;
   30322          goto decode_success;
   30323       }
   30324       /* VPALIGNR imm8, ymm3/m256, ymm2, ymm1 */
   30325       /* VPALIGNR = VEX.NDS.256.66.0F3A.WIG 0F /r ib */
   30326       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   30327          UChar  modrm = getUChar(delta);
   30328          UInt   rG    = gregOfRexRM(pfx, modrm);
   30329          UInt   rV    = getVexNvvvv(pfx);
   30330          IRTemp sV    = newTemp(Ity_V256);
   30331          IRTemp dV    = newTemp(Ity_V256);
   30332          IRTemp sHi, sLo, dHi, dLo;
   30333          sHi = sLo = dHi = dLo = IRTemp_INVALID;
   30334          UInt   imm8;
   30335 
   30336          assign( dV, getYMMReg(rV) );
   30337 
   30338          if ( epartIsReg( modrm ) ) {
   30339             UInt   rE = eregOfRexRM(pfx, modrm);
   30340             assign( sV, getYMMReg(rE) );
   30341             imm8 = getUChar(delta+1);
   30342             delta += 1+1;
   30343             DIP("vpalignr $%d,%s,%s,%s\n", imm8, nameYMMReg(rE),
   30344                                            nameYMMReg(rV), nameYMMReg(rG));
   30345          } else {
   30346             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30347             assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
   30348             imm8 = getUChar(delta+alen);
   30349             delta += alen+1;
   30350             DIP("vpalignr $%d,%s,%s,%s\n", imm8, dis_buf,
   30351                                            nameYMMReg(rV), nameYMMReg(rG));
   30352          }
   30353 
   30354          breakupV256toV128s( dV, &dHi, &dLo );
   30355          breakupV256toV128s( sV, &sHi, &sLo );
   30356          putYMMReg( rG, binop( Iop_V128HLtoV256,
   30357                                mkexpr( math_PALIGNR_XMM( sHi, dHi, imm8 ) ),
   30358                                mkexpr( math_PALIGNR_XMM( sLo, dLo, imm8 ) ) )
   30359                     );
   30360          *uses_vvvv = True;
   30361          goto decode_success;
   30362       }
   30363       break;
   30364 
   30365    case 0x14:
   30366       /* VPEXTRB imm8, xmm2, reg/m8 = VEX.128.66.0F3A.W0 14 /r ib */
   30367       if (have66noF2noF3(pfx)
   30368           && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
   30369          delta = dis_PEXTRB_128_GtoE( vbi, pfx, delta, False/*!isAvx*/ );
   30370          goto decode_success;
   30371       }
   30372       break;
   30373 
   case 0x15:
      /* VPEXTRW imm8, reg/m16, xmm2 */
      /* VPEXTRW = VEX.128.66.0F3A.W0 15 /r ib */
      /* Decoding and IR generation delegated to the shared SSE4.1
         helper, with isAvx selecting the VEX flavour. */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         delta = dis_PEXTRW( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      break;
   30383 
   case 0x16:
      /* VPEXTRD imm8, r32/m32, xmm2 */
      /* VPEXTRD = VEX.128.66.0F3A.W0 16 /r ib */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         delta = dis_PEXTRD( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VPEXTRQ = VEX.128.66.0F3A.W1 16 /r ib */
      /* REX.W=1 selects the 64-bit (Q) form of the same opcode. */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
         delta = dis_PEXTRQ( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      break;
   30399 
   case 0x17:
      /* VEXTRACTPS imm8, xmm1, r32/m32 = VEX.128.66.0F3A.WIG 17 /r ib */
      /* Decoding and IR generation delegated to the shared SSE4.1
         helper, with isAvx selecting the VEX flavour. */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_EXTRACTPS( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      break;
   30407 
   case 0x18:
      /* VINSERTF128 r/m, rV, rD
         ::: rD = insertinto(a lane in rV, 128 bits from r/m) */
      /* VINSERTF128 = VEX.NDS.256.66.0F3A.W0 18 /r ib */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   ib    = 0;
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp t128  = newTemp(Ity_V128);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            assign(t128, getXMMReg(rE));
            ib = getUChar(delta);
            DIP("vinsertf128 $%u,%s,%s,%s\n",
                ib, nameXMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign(t128, loadLE(Ity_V128, mkexpr(addr)));
            delta += alen;
            ib = getUChar(delta);
            DIP("vinsertf128 $%u,%s,%s,%s\n",
                ib, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
         }
         delta++;   /* skip the imm8 byte */
         /* Copy both lanes from rV, then overwrite the lane selected by
            imm8 bit 0 with the 128-bit E value. */
         putYMMRegLane128(rG, 0,   getYMMRegLane128(rV, 0));
         putYMMRegLane128(rG, 1,   getYMMRegLane128(rV, 1));
         putYMMRegLane128(rG, ib & 1, mkexpr(t128));
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   30442 
   30443    case 0x19:
   30444      /* VEXTRACTF128 $lane_no, rS, r/m
   30445         ::: r/m:V128 = a lane of rS:V256 (RM format) */
   30446      /* VEXTRACTF128 = VEX.256.66.0F3A.W0 19 /r ib */
   30447       if (have66noF2noF3(pfx)
   30448           && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
   30449          UChar  modrm = getUChar(delta);
   30450          UInt   ib    = 0;
   30451          UInt   rS    = gregOfRexRM(pfx, modrm);
   30452          IRTemp t128  = newTemp(Ity_V128);
   30453          if (epartIsReg(modrm)) {
   30454             UInt rD = eregOfRexRM(pfx, modrm);
   30455             delta += 1;
   30456             ib = getUChar(delta);
   30457             assign(t128, getYMMRegLane128(rS, ib & 1));
   30458             putYMMRegLoAndZU(rD, mkexpr(t128));
   30459             DIP("vextractf128 $%u,%s,%s\n",
   30460                 ib, nameXMMReg(rS), nameYMMReg(rD));
   30461          } else {
   30462             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30463             delta += alen;
   30464             ib = getUChar(delta);
   30465             assign(t128, getYMMRegLane128(rS, ib & 1));
   30466             storeLE(mkexpr(addr), mkexpr(t128));
   30467             DIP("vextractf128 $%u,%s,%s\n",
   30468                 ib, nameYMMReg(rS), dis_buf);
   30469          }
   30470          delta++;
   30471          /* doesn't use vvvv */
   30472          goto decode_success;
   30473       }
   30474       break;
   30475 
   case 0x20:
      /* VPINSRB r32/m8, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W0 20 /r ib */
      /* Insert a byte (from the low 8 bits of a GPR, or from memory)
         into the byte lane of xmm2 selected by imm8[3:0]; result goes
         to xmm1 with the upper YMM bits zeroed. */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm  = getUChar(delta);
         UInt   rG     = gregOfRexRM(pfx, modrm);
         UInt   rV     = getVexNvvvv(pfx);
         Int    imm8;
         IRTemp src_u8 = newTemp(Ity_I8);

         if ( epartIsReg( modrm ) ) {
            UInt rE = eregOfRexRM(pfx,modrm);
            imm8 = (Int)(getUChar(delta+1) & 15);   /* 16 byte lanes */
            assign( src_u8, unop(Iop_32to8, getIReg32( rE )) );
            delta += 1+1;
            DIP( "vpinsrb $%d,%s,%s,%s\n",
                 imm8, nameIReg32(rE), nameXMMReg(rV), nameXMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            imm8 = (Int)(getUChar(delta+alen) & 15);
            assign( src_u8, loadLE( Ity_I8, mkexpr(addr) ) );
            delta += alen+1;
            DIP( "vpinsrb $%d,%s,%s,%s\n",
                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
         }

         IRTemp src_vec = newTemp(Ity_V128);
         assign(src_vec, getXMMReg( rV ));
         IRTemp res_vec = math_PINSRB_128( src_vec, src_u8, imm8 );
         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   30510 
   case 0x21:
      /* VINSERTPS imm8, xmm3/m32, xmm2, xmm1
         = VEX.NDS.128.66.0F3A.WIG 21 /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         UInt   imm8;
         IRTemp d2ins = newTemp(Ity_I32); /* comes from the E part */
         const IRTemp inval = IRTemp_INVALID;

         if ( epartIsReg( modrm ) ) {
            UInt   rE = eregOfRexRM(pfx, modrm);
            IRTemp vE = newTemp(Ity_V128);
            assign( vE, getXMMReg(rE) );
            IRTemp dsE[4] = { inval, inval, inval, inval };
            breakupV128to32s( vE, &dsE[3], &dsE[2], &dsE[1], &dsE[0] );
            imm8 = getUChar(delta+1);
            /* imm8[7:6] selects which 32-bit lane of the E register to
               insert.  NOTE(review): this reassignment abandons the
               temp allocated by newTemp above -- harmless, but wasteful. */
            d2ins = dsE[(imm8 >> 6) & 3]; /* "imm8_count_s" */
            delta += 1+1;
            /* NOTE(review): prints "insertps" without the 'v' prefix and
               omits the rV operand, unlike the other VEX cases here --
               looks like an oversight; confirm against the non-VEX
               INSERTPS decoder. */
            DIP( "insertps $%u, %s,%s\n",
                 imm8, nameXMMReg(rE), nameXMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( d2ins, loadLE( Ity_I32, mkexpr(addr) ) );
            imm8 = getUChar(delta+alen);
            delta += alen+1;
            DIP( "insertps $%u, %s,%s\n",
                 imm8, dis_buf, nameXMMReg(rG) );
         }

         IRTemp vV = newTemp(Ity_V128);
         assign( vV, getXMMReg(rV) );

         /* math_INSERTPS handles the imm8[5:4] destination lane and the
            imm8[3:0] zero mask; upper YMM bits are zeroed. */
         putYMMRegLoAndZU( rG, mkexpr(math_INSERTPS( vV, d2ins, imm8 )) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   30550 
   30551    case 0x22:
   30552       /* VPINSRD r32/m32, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W0 22 /r ib */
   30553       if (have66noF2noF3(pfx)
   30554           && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
   30555          UChar  modrm = getUChar(delta);
   30556          UInt   rG    = gregOfRexRM(pfx, modrm);
   30557          UInt   rV    = getVexNvvvv(pfx);
   30558          Int    imm8_10;
   30559          IRTemp src_u32 = newTemp(Ity_I32);
   30560 
   30561          if ( epartIsReg( modrm ) ) {
   30562             UInt rE = eregOfRexRM(pfx,modrm);
   30563             imm8_10 = (Int)(getUChar(delta+1) & 3);
   30564             assign( src_u32, getIReg32( rE ) );
   30565             delta += 1+1;
   30566             DIP( "vpinsrd $%d,%s,%s,%s\n",
   30567                  imm8_10, nameIReg32(rE), nameXMMReg(rV), nameXMMReg(rG) );
   30568          } else {
   30569             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30570             imm8_10 = (Int)(getUChar(delta+alen) & 3);
   30571             assign( src_u32, loadLE( Ity_I32, mkexpr(addr) ) );
   30572             delta += alen+1;
   30573             DIP( "vpinsrd $%d,%s,%s,%s\n",
   30574                  imm8_10, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
   30575          }
   30576 
   30577          IRTemp src_vec = newTemp(Ity_V128);
   30578          assign(src_vec, getXMMReg( rV ));
   30579          IRTemp res_vec = math_PINSRD_128( src_vec, src_u32, imm8_10 );
   30580          putYMMRegLoAndZU( rG, mkexpr(res_vec) );
   30581          *uses_vvvv = True;
   30582          goto decode_success;
   30583       }
   30584       /* VPINSRQ r64/m64, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W1 22 /r ib */
   30585       if (have66noF2noF3(pfx)
   30586           && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
   30587          UChar  modrm = getUChar(delta);
   30588          UInt   rG    = gregOfRexRM(pfx, modrm);
   30589          UInt   rV    = getVexNvvvv(pfx);
   30590          Int    imm8_0;
   30591          IRTemp src_u64 = newTemp(Ity_I64);
   30592 
   30593          if ( epartIsReg( modrm ) ) {
   30594             UInt rE = eregOfRexRM(pfx,modrm);
   30595             imm8_0 = (Int)(getUChar(delta+1) & 1);
   30596             assign( src_u64, getIReg64( rE ) );
   30597             delta += 1+1;
   30598             DIP( "vpinsrq $%d,%s,%s,%s\n",
   30599                  imm8_0, nameIReg64(rE), nameXMMReg(rV), nameXMMReg(rG) );
   30600          } else {
   30601             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30602             imm8_0 = (Int)(getUChar(delta+alen) & 1);
   30603             assign( src_u64, loadLE( Ity_I64, mkexpr(addr) ) );
   30604             delta += alen+1;
   30605             DIP( "vpinsrd $%d,%s,%s,%s\n",
   30606                  imm8_0, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
   30607          }
   30608 
   30609          IRTemp src_vec = newTemp(Ity_V128);
   30610          assign(src_vec, getXMMReg( rV ));
   30611          IRTemp res_vec = math_PINSRQ_128( src_vec, src_u64, imm8_0 );
   30612          putYMMRegLoAndZU( rG, mkexpr(res_vec) );
   30613          *uses_vvvv = True;
   30614          goto decode_success;
   30615       }
   30616       break;
   30617 
   case 0x38:
      /* VINSERTI128 r/m, rV, rD
         ::: rD = insertinto(a lane in rV, 128 bits from r/m) */
      /* VINSERTI128 = VEX.NDS.256.66.0F3A.W0 38 /r ib */
      /* Integer twin of VINSERTF128 (case 0x18); identical IR. */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   ib    = 0;
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp t128  = newTemp(Ity_V128);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            assign(t128, getXMMReg(rE));
            ib = getUChar(delta);
            DIP("vinserti128 $%u,%s,%s,%s\n",
                ib, nameXMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign(t128, loadLE(Ity_V128, mkexpr(addr)));
            delta += alen;
            ib = getUChar(delta);
            DIP("vinserti128 $%u,%s,%s,%s\n",
                ib, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
         }
         delta++;   /* skip the imm8 byte */
         /* Copy both lanes from rV, then overwrite the lane selected by
            imm8 bit 0 with the 128-bit E value. */
         putYMMRegLane128(rG, 0,   getYMMRegLane128(rV, 0));
         putYMMRegLane128(rG, 1,   getYMMRegLane128(rV, 1));
         putYMMRegLane128(rG, ib & 1, mkexpr(t128));
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   30652 
   30653    case 0x39:
   30654       /* VEXTRACTI128 $lane_no, rS, r/m
   30655          ::: r/m:V128 = a lane of rS:V256 (RM format) */
   30656       /* VEXTRACTI128 = VEX.256.66.0F3A.W0 39 /r ib */
   30657       if (have66noF2noF3(pfx)
   30658           && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
   30659          UChar  modrm = getUChar(delta);
   30660          UInt   ib    = 0;
   30661          UInt   rS    = gregOfRexRM(pfx, modrm);
   30662          IRTemp t128  = newTemp(Ity_V128);
   30663          if (epartIsReg(modrm)) {
   30664             UInt rD = eregOfRexRM(pfx, modrm);
   30665             delta += 1;
   30666             ib = getUChar(delta);
   30667             assign(t128, getYMMRegLane128(rS, ib & 1));
   30668             putYMMRegLoAndZU(rD, mkexpr(t128));
   30669             DIP("vextracti128 $%u,%s,%s\n",
   30670                 ib, nameXMMReg(rS), nameYMMReg(rD));
   30671          } else {
   30672             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30673             delta += alen;
   30674             ib = getUChar(delta);
   30675             assign(t128, getYMMRegLane128(rS, ib & 1));
   30676             storeLE(mkexpr(addr), mkexpr(t128));
   30677             DIP("vextracti128 $%u,%s,%s\n",
   30678                 ib, nameYMMReg(rS), dis_buf);
   30679          }
   30680          delta++;
   30681          /* doesn't use vvvv */
   30682          goto decode_success;
   30683       }
   30684       break;
   30685 
   case 0x40:
      /* VDPPS imm8, xmm3/m128,xmm2,xmm1 = VEX.NDS.128.66.0F3A.WIG 40 /r ib */
      /* Dot product of packed singles; imm8 selects which input lanes
         participate and which output lanes receive the sum (see
         math_DPPS_128). */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm   = getUChar(delta);
         UInt   rG      = gregOfRexRM(pfx, modrm);
         UInt   rV      = getVexNvvvv(pfx);
         IRTemp dst_vec = newTemp(Ity_V128);
         Int    imm8;
         if (epartIsReg( modrm )) {
            UInt rE = eregOfRexRM(pfx,modrm);
            imm8 = (Int)getUChar(delta+1);
            assign( dst_vec, getXMMReg( rE ) );
            delta += 1+1;
            DIP( "vdpps $%d,%s,%s,%s\n",
                 imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            imm8 = (Int)getUChar(delta+alen);
            assign( dst_vec, loadLE( Ity_V128, mkexpr(addr) ) );
            delta += alen+1;
            DIP( "vdpps $%d,%s,%s,%s\n",
                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
         }

         IRTemp src_vec = newTemp(Ity_V128);
         assign(src_vec, getXMMReg( rV ));
         IRTemp res_vec = math_DPPS_128( src_vec, dst_vec, imm8 );
         /* write the low lane and zero the upper 128 bits of the YMM */
         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
         *uses_vvvv = True;
         goto decode_success;
      }
   30717       /* VDPPS imm8, ymm3/m128,ymm2,ymm1 = VEX.NDS.256.66.0F3A.WIG 40 /r ib */
   30718       if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
   30719          UChar  modrm   = getUChar(delta);
   30720          UInt   rG      = gregOfRexRM(pfx, modrm);
   30721          UInt   rV      = getVexNvvvv(pfx);
   30722          IRTemp dst_vec = newTemp(Ity_V256);
   30723          Int    imm8;
   30724          if (epartIsReg( modrm )) {
   30725             UInt rE = eregOfRexRM(pfx,modrm);
   30726             imm8 = (Int)getUChar(delta+1);
   30727             assign( dst_vec, getYMMReg( rE ) );
   30728             delta += 1+1;
   30729             DIP( "vdpps $%d,%s,%s,%s\n",
   30730                  imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG) );
   30731          } else {
   30732             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
   30733             imm8 = (Int)getUChar(delta+alen);
   30734             assign( dst_vec, loadLE( Ity_V256, mkexpr(addr) ) );
   30735             delta += alen+1;
   30736             DIP( "vdpps $%d,%s,%s,%s\n",
   30737                  imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
   30738          }
   30739 
   30740          IRTemp src_vec = newTemp(Ity_V256);
   30741          assign(src_vec, getYMMReg( rV ));
   30742          IRTemp s0, s1, d0, d1;
   30743          s0 = s1 = d0 = d1 = IRTemp_INVALID;
   30744          breakupV256toV128s( dst_vec, &d1, &d0 );
   30745          breakupV256toV128s( src_vec, &s1, &s0 );
   30746          putYMMReg( rG, binop( Iop_V128HLtoV256,
   30747                                mkexpr( math_DPPS_128(s1, d1, imm8) ),
   30748                                mkexpr( math_DPPS_128(s0, d0, imm8) ) ) );
   30749          *uses_vvvv = True;
   30750          goto decode_success;
   30751       }
   30752       break;
   30753 
   case 0x41:
      /* Dot product of packed double-precision floats (128-bit form
         only; there is no 256-bit VDPPD). */
      /* VDPPD imm8, xmm3/m128,xmm2,xmm1 = VEX.NDS.128.66.0F3A.WIG 41 /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm   = getUChar(delta);
         UInt   rG      = gregOfRexRM(pfx, modrm);
         UInt   rV      = getVexNvvvv(pfx);
         /* NOTE: despite the name, dst_vec holds the xmm3/m128 (E)
            operand; src_vec below holds the vvvv (V) operand. */
         IRTemp dst_vec = newTemp(Ity_V128);
         Int    imm8;
         if (epartIsReg( modrm )) {
            UInt rE = eregOfRexRM(pfx,modrm);
            imm8 = (Int)getUChar(delta+1);
            assign( dst_vec, getXMMReg( rE ) );
            delta += 1+1;
            DIP( "vdppd $%d,%s,%s,%s\n",
                 imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            imm8 = (Int)getUChar(delta+alen);
            assign( dst_vec, loadLE( Ity_V128, mkexpr(addr) ) );
            delta += alen+1;
            DIP( "vdppd $%d,%s,%s,%s\n",
                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
         }

         IRTemp src_vec = newTemp(Ity_V128);
         assign(src_vec, getXMMReg( rV ));
         IRTemp res_vec = math_DPPD_128( src_vec, dst_vec, imm8 );
         /* XMM-destination write zeroes the upper YMM lane. */
         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   30786 
   case 0x42:
      /* Multiple sums of absolute byte differences; the imm8 bit
         fields (block selectors) are interpreted by math_MPSADBW_128. */
      /* VMPSADBW imm8, xmm3/m128,xmm2,xmm1 */
      /* VMPSADBW = VEX.NDS.128.66.0F3A.WIG 42 /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm   = getUChar(delta);
         Int    imm8;
         IRTemp src_vec = newTemp(Ity_V128);
         IRTemp dst_vec = newTemp(Ity_V128);
         UInt   rG      = gregOfRexRM(pfx, modrm);
         UInt   rV      = getVexNvvvv(pfx);

         /* Here dst_vec holds the vvvv (first source) operand and
            src_vec holds the xmm3/m128 (second source) operand. */
         assign( dst_vec, getXMMReg(rV) );

         if ( epartIsReg( modrm ) ) {
            UInt rE = eregOfRexRM(pfx, modrm);

            imm8 = (Int)getUChar(delta+1);
            assign( src_vec, getXMMReg(rE) );
            delta += 1+1;
            DIP( "vmpsadbw $%d, %s,%s,%s\n", imm8,
                 nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
                             1/* imm8 is 1 byte after the amode */ );
            assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
            imm8 = (Int)getUChar(delta+alen);
            delta += alen+1;
            DIP( "vmpsadbw $%d, %s,%s,%s\n", imm8,
                 dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
         }

         putYMMRegLoAndZU( rG, mkexpr( math_MPSADBW_128(dst_vec,
                                                        src_vec, imm8) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VMPSADBW imm8, ymm3/m256,ymm2,ymm1 */
      /* VMPSADBW = VEX.NDS.256.66.0F3A.WIG 42 /r ib */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar  modrm   = getUChar(delta);
         Int    imm8;
         IRTemp src_vec = newTemp(Ity_V256);
         IRTemp dst_vec = newTemp(Ity_V256);
         UInt   rG      = gregOfRexRM(pfx, modrm);
         UInt   rV      = getVexNvvvv(pfx);
         IRTemp sHi, sLo, dHi, dLo;
         sHi = sLo = dHi = dLo = IRTemp_INVALID;

         assign( dst_vec, getYMMReg(rV) );

         if ( epartIsReg( modrm ) ) {
            UInt rE = eregOfRexRM(pfx, modrm);

            imm8 = (Int)getUChar(delta+1);
            assign( src_vec, getYMMReg(rE) );
            delta += 1+1;
            DIP( "vmpsadbw $%d, %s,%s,%s\n", imm8,
                 nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
                             1/* imm8 is 1 byte after the amode */ );
            assign( src_vec, loadLE( Ity_V256, mkexpr(addr) ) );
            imm8 = (Int)getUChar(delta+alen);
            delta += alen+1;
            DIP( "vmpsadbw $%d, %s,%s,%s\n", imm8,
                 dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
         }

         breakupV256toV128s( dst_vec, &dHi, &dLo );
         breakupV256toV128s( src_vec, &sHi, &sLo );
         /* The high 128-bit lane is controlled by imm8[5:3]; shifting
            right by 3 moves those bits into the [2:0] positions that
            math_MPSADBW_128 interprets. */
         putYMMReg( rG, binop( Iop_V128HLtoV256,
                               mkexpr( math_MPSADBW_128(dHi, sHi, imm8 >> 3) ),
                               mkexpr( math_MPSADBW_128(dLo, sLo, imm8) ) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   30864 
   case 0x44:
      /* VPCLMULQDQ imm8, xmm3/m128,xmm2,xmm1 */
      /* VPCLMULQDQ = VEX.NDS.128.66.0F3A.WIG 44 /r ib */
      /* 66 0F 3A 44 /r ib = PCLMULQDQ xmm1, xmm2/m128, imm8
       * Carry-less multiplication of selected XMM quadwords into XMM
       * registers (a.k.a multiplication of polynomials over GF(2))
       */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         Int imm8;
         /* dV is the vvvv (first source) operand, sV the xmm3/m128
            (second source) operand; imm8 selects which 64-bit half of
            each participates (interpreted in math_PCLMULQDQ). */
         IRTemp sV    = newTemp(Ity_V128);
         IRTemp dV    = newTemp(Ity_V128);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);

         assign( dV, getXMMReg(rV) );

         if ( epartIsReg( modrm ) ) {
            UInt rE = eregOfRexRM(pfx, modrm);
            imm8 = (Int)getUChar(delta+1);
            assign( sV, getXMMReg(rE) );
            delta += 1+1;
            DIP( "vpclmulqdq $%d, %s,%s,%s\n", imm8,
                 nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
                             1/* imm8 is 1 byte after the amode */ );
            assign( sV, loadLE( Ity_V128, mkexpr(addr) ) );
            imm8 = (Int)getUChar(delta+alen);
            delta += alen+1;
            DIP( "vpclmulqdq $%d, %s,%s,%s\n",
                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
         }

         /* XMM-destination write zeroes the upper YMM lane. */
         putYMMRegLoAndZU( rG, mkexpr( math_PCLMULQDQ(dV, sV, imm8) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   30904 
   case 0x46:
      /* VPERM2I128 imm8, ymm3/m256, ymm2, ymm1 = VEX.NDS.66.0F3A.W0 46 /r ib */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   imm8  = 0;
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         /* The four candidate 128-bit lanes: s00/s01 are the low/high
            lanes of the vvvv operand, s10/s11 those of the E operand. */
         IRTemp s00   = newTemp(Ity_V128);
         IRTemp s01   = newTemp(Ity_V128);
         IRTemp s10   = newTemp(Ity_V128);
         IRTemp s11   = newTemp(Ity_V128);
         assign(s00, getYMMRegLane128(rV, 0));
         assign(s01, getYMMRegLane128(rV, 1));
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            imm8 = getUChar(delta);
            DIP("vperm2i128 $%u,%s,%s,%s\n",
                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
            assign(s10, getYMMRegLane128(rE, 0));
            assign(s11, getYMMRegLane128(rE, 1));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            delta += alen;
            imm8 = getUChar(delta);
            DIP("vperm2i128 $%u,%s,%s,%s\n",
                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
            /* Memory form: load the 256-bit E operand as two 128-bit
               little-endian loads at addr+0 and addr+16. */
            assign(s10, loadLE(Ity_V128, binop(Iop_Add64,
                                               mkexpr(addr), mkU64(0))));
            assign(s11, loadLE(Ity_V128, binop(Iop_Add64,
                                               mkexpr(addr), mkU64(16))));
         }
         delta++;
         /* Each 2-bit imm8 field picks one of the four source lanes. */
#        define SEL(_nn) (((_nn)==0) ? s00 : ((_nn)==1) ? s01 \
                                           : ((_nn)==2) ? s10 : s11)
         putYMMRegLane128(rG, 0, mkexpr(SEL((imm8 >> 0) & 3)));
         putYMMRegLane128(rG, 1, mkexpr(SEL((imm8 >> 4) & 3)));
#        undef SEL
         /* imm8 bits 3 and 7 force the corresponding dest lane to zero,
            overriding the selection just made. */
         if (imm8 & (1<<3)) putYMMRegLane128(rG, 0, mkV128(0));
         if (imm8 & (1<<7)) putYMMRegLane128(rG, 1, mkV128(0));
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   30950 
   case 0x4A:
      /* VBLENDVPS xmmG, xmmE/memE, xmmV, xmmIS4
         ::: xmmG:V128 = PBLEND(xmmE, xmmV, xmmIS4) (RMVR) */
      /* VBLENDVPS = VEX.NDS.128.66.0F3A.WIG 4A /r /is4 */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         /* Element size 4 bytes; Iop_SarN32x4 sign-extends each
            element's MSB into a full-width blend mask. */
         delta = dis_VBLENDV_128 ( vbi, pfx, delta,
                                   "vblendvps", 4, Iop_SarN32x4 );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VBLENDVPS ymmG, ymmE/memE, ymmV, ymmIS4
         ::: ymmG:V256 = PBLEND(ymmE, ymmV, ymmIS4) (RMVR) */
      /* VBLENDVPS = VEX.NDS.256.66.0F3A.WIG 4A /r /is4 */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VBLENDV_256 ( vbi, pfx, delta,
                                   "vblendvps", 4, Iop_SarN32x4 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   30971 
   case 0x4B:
      /* VBLENDVPD xmmG, xmmE/memE, xmmV, xmmIS4
         ::: xmmG:V128 = PBLEND(xmmE, xmmV, xmmIS4) (RMVR) */
      /* VBLENDVPD = VEX.NDS.128.66.0F3A.WIG 4B /r /is4 */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         /* Element size 8 bytes; Iop_SarN64x2 sign-extends each
            element's MSB into a full-width blend mask. */
         delta = dis_VBLENDV_128 ( vbi, pfx, delta,
                                   "vblendvpd", 8, Iop_SarN64x2 );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VBLENDVPD ymmG, ymmE/memE, ymmV, ymmIS4
         ::: ymmG:V256 = PBLEND(ymmE, ymmV, ymmIS4) (RMVR) */
      /* VBLENDVPD = VEX.NDS.256.66.0F3A.WIG 4B /r /is4 */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VBLENDV_256 ( vbi, pfx, delta,
                                   "vblendvpd", 8, Iop_SarN64x2 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   30992 
   case 0x4C:
      /* VPBLENDVB xmmG, xmmE/memE, xmmV, xmmIS4
         ::: xmmG:V128 = PBLEND(xmmE, xmmV, xmmIS4) (RMVR) */
      /* VPBLENDVB = VEX.NDS.128.66.0F3A.WIG 4C /r /is4 */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         /* Element size 1 byte; Iop_SarN8x16 sign-extends each
            byte's MSB into a full-width blend mask. */
         delta = dis_VBLENDV_128 ( vbi, pfx, delta,
                                   "vpblendvb", 1, Iop_SarN8x16 );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPBLENDVB ymmG, ymmE/memE, ymmV, ymmIS4
         ::: ymmG:V256 = PBLEND(ymmE, ymmV, ymmIS4) (RMVR) */
      /* VPBLENDVB = VEX.NDS.256.66.0F3A.WIG 4C /r /is4 */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VBLENDV_256 ( vbi, pfx, delta,
                                   "vpblendvb", 1, Iop_SarN8x16 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;
   31013 
   case 0x60:
   case 0x61:
   case 0x62:
   case 0x63:
      /* VEX.128.66.0F3A.WIG 63 /r ib = VPCMPISTRI imm8, xmm2/m128, xmm1
         VEX.128.66.0F3A.WIG 62 /r ib = VPCMPISTRM imm8, xmm2/m128, xmm1
         VEX.128.66.0F3A.WIG 61 /r ib = VPCMPESTRI imm8, xmm2/m128, xmm1
         VEX.128.66.0F3A.WIG 60 /r ib = VPCMPESTRM imm8, xmm2/m128, xmm1
         (selected special cases that actually occur in glibc,
          not by any means a complete implementation.)
      */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Long delta0 = delta;
         delta = dis_PCMPxSTRx( vbi, pfx, delta, True/*isAvx*/, opc );
         /* dis_PCMPxSTRx advances delta only if it recognised the
            imm8 variant; an unchanged delta means it refused. */
         if (delta > delta0) goto decode_success;
         /* else fall though; dis_PCMPxSTRx failed to decode it */
      }
      break;
   31032 
   case 0xDF:
      /* VAESKEYGENASSIST imm8, xmm2/m128, xmm1 = VEX.128.66.0F3A.WIG DF /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         /* This is the VEX decode path, so the AVX form is requested;
            the argument comment previously read "!isAvx", contradicting
            the True actually passed. */
         delta = dis_AESKEYGENASSIST( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      break;
   31040 
   case 0xF0:
      /* RORX (BMI2): rotate right by immediate, without reading or
         writing any flags. */
      /* RORX imm8, r/m32, r32a = VEX.LZ.F2.0F3A.W0 F0 /r /i */
      /* RORX imm8, r/m64, r64a = VEX.LZ.F2.0F3A.W1 F0 /r /i */
      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
         /* VEX.W selects 64- vs 32-bit operation. */
         Int     size = getRexW(pfx) ? 8 : 4;
         IRType  ty   = szToITy(size);
         IRTemp  src  = newTemp(ty);
         UChar   rm   = getUChar(delta);
         UChar   imm8;

         if (epartIsReg(rm)) {
            imm8 = getUChar(delta+1);
            assign( src, getIRegE(size,pfx,rm) );
            DIP("rorx %d,%s,%s\n", imm8, nameIRegE(size,pfx,rm),
                                   nameIRegG(size,pfx,rm));
            delta += 2;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            imm8 = getUChar(delta+alen);
            assign( src, loadLE(ty, mkexpr(addr)) );
            DIP("rorx %d,%s,%s\n", imm8, dis_buf, nameIRegG(size,pfx,rm));
            delta += alen + 1;
         }
         /* The rotate count is taken modulo the operand width. */
         imm8 &= 8*size-1;

         /* dst = (src >>u imm8) | (src << (size-imm8)) */
         /* The imm8 == 0 special case avoids a shift by the full
            operand width (8*size-0) in the OR expression below. */
         putIRegG( size, pfx, rm,
                   imm8 == 0 ? mkexpr(src)
                   : binop( mkSizedOp(ty,Iop_Or8),
                            binop( mkSizedOp(ty,Iop_Shr8), mkexpr(src),
                                   mkU8(imm8) ),
                            binop( mkSizedOp(ty,Iop_Shl8), mkexpr(src),
                                   mkU8(8*size-imm8) ) ) );
         /* Flags aren't modified.  */
         goto decode_success;
      }
      break;
   31078 
   31079    default:
   31080       break;
   31081 
   31082    }
   31083 
   31084   //decode_failure:
   31085    return deltaIN;
   31086 
   31087   decode_success:
   31088    return delta;
   31089 }
   31090 
   31091 
   31092 /*------------------------------------------------------------*/
   31093 /*---                                                      ---*/
   31094 /*--- Disassemble a single instruction                     ---*/
   31095 /*---                                                      ---*/
   31096 /*------------------------------------------------------------*/
   31097 
   31098 /* Disassemble a single instruction into IR.  The instruction is
   31099    located in host memory at &guest_code[delta]. */
   31100 
   31101 static
   31102 DisResult disInstr_AMD64_WRK (
   31103              /*OUT*/Bool* expect_CAS,
   31104              Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
   31105              Bool         resteerCisOk,
   31106              void*        callback_opaque,
   31107              Long         delta64,
   31108              VexArchInfo* archinfo,
   31109              VexAbiInfo*  vbi,
   31110              Bool         sigill_diag
   31111           )
   31112 {
   31113    IRTemp    t1, t2, t3, t4, t5, t6;
   31114    UChar     pre;
   31115    Int       n, n_prefixes;
   31116    DisResult dres;
   31117 
   31118    /* The running delta */
   31119    Long delta = delta64;
   31120 
   31121    /* Holds eip at the start of the insn, so that we can print
   31122       consistent error messages for unimplemented insns. */
   31123    Long delta_start = delta;
   31124 
   31125    /* sz denotes the nominal data-op size of the insn; we change it to
   31126       2 if an 0x66 prefix is seen and 8 if REX.W is 1.  In case of
   31127       conflict REX.W takes precedence. */
   31128    Int sz = 4;
   31129 
   31130    /* pfx holds the summary of prefixes. */
   31131    Prefix pfx = PFX_EMPTY;
   31132 
   31133    /* Holds the computed opcode-escape indication. */
   31134    Escape esc = ESC_NONE;
   31135 
   31136    /* Set result defaults. */
   31137    dres.whatNext    = Dis_Continue;
   31138    dres.len         = 0;
   31139    dres.continueAt  = 0;
   31140    dres.jk_StopHere = Ijk_INVALID;
   31141    *expect_CAS = False;
   31142 
   31143    vassert(guest_RIP_next_assumed == 0);
   31144    vassert(guest_RIP_next_mustcheck == False);
   31145 
   31146    t1 = t2 = t3 = t4 = t5 = t6 = IRTemp_INVALID;
   31147 
   31148    DIP("\t0x%llx:  ", guest_RIP_bbstart+delta);
   31149 
   31150    /* Spot "Special" instructions (see comment at top of file). */
   31151    {
   31152       UChar* code = (UChar*)(guest_code + delta);
   31153       /* Spot the 16-byte preamble:
   31154          48C1C703   rolq $3,  %rdi
   31155          48C1C70D   rolq $13, %rdi
   31156          48C1C73D   rolq $61, %rdi
   31157          48C1C733   rolq $51, %rdi
   31158       */
   31159       if (code[ 0] == 0x48 && code[ 1] == 0xC1 && code[ 2] == 0xC7
   31160                                                && code[ 3] == 0x03 &&
   31161           code[ 4] == 0x48 && code[ 5] == 0xC1 && code[ 6] == 0xC7
   31162                                                && code[ 7] == 0x0D &&
   31163           code[ 8] == 0x48 && code[ 9] == 0xC1 && code[10] == 0xC7
   31164                                                && code[11] == 0x3D &&
   31165           code[12] == 0x48 && code[13] == 0xC1 && code[14] == 0xC7
   31166                                                && code[15] == 0x33) {
   31167          /* Got a "Special" instruction preamble.  Which one is it? */
   31168          if (code[16] == 0x48 && code[17] == 0x87
   31169                               && code[18] == 0xDB /* xchgq %rbx,%rbx */) {
   31170             /* %RDX = client_request ( %RAX ) */
   31171             DIP("%%rdx = client_request ( %%rax )\n");
   31172             delta += 19;
   31173             jmp_lit(&dres, Ijk_ClientReq, guest_RIP_bbstart+delta);
   31174             vassert(dres.whatNext == Dis_StopHere);
   31175             goto decode_success;
   31176          }
   31177          else
   31178          if (code[16] == 0x48 && code[17] == 0x87
   31179                               && code[18] == 0xC9 /* xchgq %rcx,%rcx */) {
   31180             /* %RAX = guest_NRADDR */
   31181             DIP("%%rax = guest_NRADDR\n");
   31182             delta += 19;
   31183             putIRegRAX(8, IRExpr_Get( OFFB_NRADDR, Ity_I64 ));
   31184             goto decode_success;
   31185          }
   31186          else
   31187          if (code[16] == 0x48 && code[17] == 0x87
   31188                               && code[18] == 0xD2 /* xchgq %rdx,%rdx */) {
   31189             /* call-noredir *%RAX */
   31190             DIP("call-noredir *%%rax\n");
   31191             delta += 19;
   31192             t1 = newTemp(Ity_I64);
   31193             assign(t1, getIRegRAX(8));
   31194             t2 = newTemp(Ity_I64);
   31195             assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
   31196             putIReg64(R_RSP, mkexpr(t2));
   31197             storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta));
   31198             jmp_treg(&dres, Ijk_NoRedir, t1);
   31199             vassert(dres.whatNext == Dis_StopHere);
   31200             goto decode_success;
   31201          }
   31202          else
   31203          if (code[16] == 0x48 && code[17] == 0x87
   31204                               && code[18] == 0xff /* xchgq %rdi,%rdi */) {
   31205            /* IR injection */
   31206             DIP("IR injection\n");
   31207             vex_inject_ir(irsb, Iend_LE);
   31208 
   31209             // Invalidate the current insn. The reason is that the IRop we're
   31210             // injecting here can change. In which case the translation has to
   31211             // be redone. For ease of handling, we simply invalidate all the
   31212             // time.
   31213             stmt(IRStmt_Put(OFFB_CMSTART, mkU64(guest_RIP_curr_instr)));
   31214             stmt(IRStmt_Put(OFFB_CMLEN,   mkU64(19)));
   31215 
   31216             delta += 19;
   31217 
   31218             stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_bbstart + delta) ) );
   31219             dres.whatNext    = Dis_StopHere;
   31220             dres.jk_StopHere = Ijk_InvalICache;
   31221             goto decode_success;
   31222          }
   31223          /* We don't know what it is. */
   31224          goto decode_failure;
   31225          /*NOTREACHED*/
   31226       }
   31227    }
   31228 
   31229    /* Eat prefixes, summarising the result in pfx and sz, and rejecting
   31230       as many invalid combinations as possible. */
   31231    n_prefixes = 0;
   31232    while (True) {
   31233       if (n_prefixes > 7) goto decode_failure;
   31234       pre = getUChar(delta);
   31235       switch (pre) {
   31236          case 0x66: pfx |= PFX_66; break;
   31237          case 0x67: pfx |= PFX_ASO; break;
   31238          case 0xF2: pfx |= PFX_F2; break;
   31239          case 0xF3: pfx |= PFX_F3; break;
   31240          case 0xF0: pfx |= PFX_LOCK; *expect_CAS = True; break;
   31241          case 0x2E: pfx |= PFX_CS; break;
   31242          case 0x3E: pfx |= PFX_DS; break;
   31243          case 0x26: pfx |= PFX_ES; break;
   31244          case 0x64: pfx |= PFX_FS; break;
   31245          case 0x65: pfx |= PFX_GS; break;
   31246          case 0x36: pfx |= PFX_SS; break;
   31247          case 0x40 ... 0x4F:
   31248             pfx |= PFX_REX;
   31249             if (pre & (1<<3)) pfx |= PFX_REXW;
   31250             if (pre & (1<<2)) pfx |= PFX_REXR;
   31251             if (pre & (1<<1)) pfx |= PFX_REXX;
   31252             if (pre & (1<<0)) pfx |= PFX_REXB;
   31253             break;
   31254          default:
   31255             goto not_a_legacy_prefix;
   31256       }
   31257       n_prefixes++;
   31258       delta++;
   31259    }
   31260 
   31261    not_a_legacy_prefix:
   31262    /* We've used up all the non-VEX prefixes.  Parse and validate a
   31263       VEX prefix if that's appropriate. */
   31264    if (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX) {
   31265       /* Used temporarily for holding VEX prefixes. */
   31266       UChar vex0 = getUChar(delta);
   31267       if (vex0 == 0xC4) {
   31268          /* 3-byte VEX */
   31269          UChar vex1 = getUChar(delta+1);
   31270          UChar vex2 = getUChar(delta+2);
   31271          delta += 3;
   31272          pfx |= PFX_VEX;
   31273          /* Snarf contents of byte 1 */
   31274          /* R */ pfx |= (vex1 & (1<<7)) ? 0 : PFX_REXR;
   31275          /* X */ pfx |= (vex1 & (1<<6)) ? 0 : PFX_REXX;
   31276          /* B */ pfx |= (vex1 & (1<<5)) ? 0 : PFX_REXB;
   31277          /* m-mmmm */
   31278          switch (vex1 & 0x1F) {
   31279             case 1: esc = ESC_0F;   break;
   31280             case 2: esc = ESC_0F38; break;
   31281             case 3: esc = ESC_0F3A; break;
   31282             /* Any other m-mmmm field will #UD */
   31283             default: goto decode_failure;
   31284          }
   31285          /* Snarf contents of byte 2 */
   31286          /* W */    pfx |= (vex2 & (1<<7)) ? PFX_REXW : 0;
   31287          /* ~v3 */  pfx |= (vex2 & (1<<6)) ? 0 : PFX_VEXnV3;
   31288          /* ~v2 */  pfx |= (vex2 & (1<<5)) ? 0 : PFX_VEXnV2;
   31289          /* ~v1 */  pfx |= (vex2 & (1<<4)) ? 0 : PFX_VEXnV1;
   31290          /* ~v0 */  pfx |= (vex2 & (1<<3)) ? 0 : PFX_VEXnV0;
   31291          /* L */    pfx |= (vex2 & (1<<2)) ? PFX_VEXL : 0;
   31292          /* pp */
   31293          switch (vex2 & 3) {
   31294             case 0: break;
   31295             case 1: pfx |= PFX_66; break;
   31296             case 2: pfx |= PFX_F3; break;
   31297             case 3: pfx |= PFX_F2; break;
   31298             default: vassert(0);
   31299          }
   31300       }
   31301       else if (vex0 == 0xC5) {
   31302          /* 2-byte VEX */
   31303          UChar vex1 = getUChar(delta+1);
   31304          delta += 2;
   31305          pfx |= PFX_VEX;
   31306          /* Snarf contents of byte 1 */
   31307          /* R */    pfx |= (vex1 & (1<<7)) ? 0 : PFX_REXR;
   31308          /* ~v3 */  pfx |= (vex1 & (1<<6)) ? 0 : PFX_VEXnV3;
   31309          /* ~v2 */  pfx |= (vex1 & (1<<5)) ? 0 : PFX_VEXnV2;
   31310          /* ~v1 */  pfx |= (vex1 & (1<<4)) ? 0 : PFX_VEXnV1;
   31311          /* ~v0 */  pfx |= (vex1 & (1<<3)) ? 0 : PFX_VEXnV0;
   31312          /* L */    pfx |= (vex1 & (1<<2)) ? PFX_VEXL : 0;
   31313          /* pp */
   31314          switch (vex1 & 3) {
   31315             case 0: break;
   31316             case 1: pfx |= PFX_66; break;
   31317             case 2: pfx |= PFX_F3; break;
   31318             case 3: pfx |= PFX_F2; break;
   31319             default: vassert(0);
   31320          }
   31321          /* implied: */
   31322          esc = ESC_0F;
   31323       }
   31324       /* Can't have both VEX and REX */
   31325       if ((pfx & PFX_VEX) && (pfx & PFX_REX))
   31326          goto decode_failure; /* can't have both */
   31327    }
   31328 
   31329    /* Dump invalid combinations */
   31330    n = 0;
   31331    if (pfx & PFX_F2) n++;
   31332    if (pfx & PFX_F3) n++;
   31333    if (n > 1)
   31334       goto decode_failure; /* can't have both */
   31335 
   31336    n = 0;
   31337    if (pfx & PFX_CS) n++;
   31338    if (pfx & PFX_DS) n++;
   31339    if (pfx & PFX_ES) n++;
   31340    if (pfx & PFX_FS) n++;
   31341    if (pfx & PFX_GS) n++;
   31342    if (pfx & PFX_SS) n++;
   31343    if (n > 1)
   31344       goto decode_failure; /* multiple seg overrides == illegal */
   31345 
   31346    /* We have a %fs prefix.  Reject it if there's no evidence in 'vbi'
   31347       that we should accept it. */
   31348    if ((pfx & PFX_FS) && !vbi->guest_amd64_assume_fs_is_zero)
   31349       goto decode_failure;
   31350 
   31351    /* Ditto for %gs prefixes. */
   31352    if ((pfx & PFX_GS) && !vbi->guest_amd64_assume_gs_is_0x60)
   31353       goto decode_failure;
   31354 
   31355    /* Set up sz. */
   31356    sz = 4;
   31357    if (pfx & PFX_66) sz = 2;
   31358    if ((pfx & PFX_REX) && (pfx & PFX_REXW)) sz = 8;
   31359 
   31360    /* Now we should be looking at the primary opcode byte or the
   31361       leading escapes.  Check that any LOCK prefix is actually
   31362       allowed. */
   31363    if (haveLOCK(pfx)) {
   31364       if (can_be_used_with_LOCK_prefix( (UChar*)&guest_code[delta] )) {
   31365          DIP("lock ");
   31366       } else {
   31367          *expect_CAS = False;
   31368          goto decode_failure;
   31369       }
   31370    }
   31371 
   31372    /* Eat up opcode escape bytes, until we're really looking at the
   31373       primary opcode byte.  But only if there's no VEX present. */
   31374    if (!(pfx & PFX_VEX)) {
   31375       vassert(esc == ESC_NONE);
   31376       pre = getUChar(delta);
   31377       if (pre == 0x0F) {
   31378          delta++;
   31379          pre = getUChar(delta);
   31380          switch (pre) {
   31381             case 0x38: esc = ESC_0F38; delta++; break;
   31382             case 0x3A: esc = ESC_0F3A; delta++; break;
   31383             default:   esc = ESC_0F; break;
   31384          }
   31385       }
   31386    }
   31387 
   31388    /* So now we're really really looking at the primary opcode
   31389       byte. */
   31390    Long delta_at_primary_opcode = delta;
   31391 
   31392    if (!(pfx & PFX_VEX)) {
   31393       /* Handle non-VEX prefixed instructions.  "Legacy" (non-VEX) SSE
   31394          instructions preserve the upper 128 bits of YMM registers;
   31395          iow we can simply ignore the presence of the upper halves of
   31396          these registers. */
   31397       switch (esc) {
   31398          case ESC_NONE:
   31399             delta = dis_ESC_NONE( &dres, expect_CAS,
   31400                                   resteerOkFn, resteerCisOk, callback_opaque,
   31401                                   archinfo, vbi, pfx, sz, delta );
   31402             break;
   31403          case ESC_0F:
   31404             delta = dis_ESC_0F  ( &dres, expect_CAS,
   31405                                   resteerOkFn, resteerCisOk, callback_opaque,
   31406                                   archinfo, vbi, pfx, sz, delta );
   31407             break;
   31408          case ESC_0F38:
   31409             delta = dis_ESC_0F38( &dres,
   31410                                   resteerOkFn, resteerCisOk, callback_opaque,
   31411                                   archinfo, vbi, pfx, sz, delta );
   31412             break;
   31413          case ESC_0F3A:
   31414             delta = dis_ESC_0F3A( &dres,
   31415                                   resteerOkFn, resteerCisOk, callback_opaque,
   31416                                   archinfo, vbi, pfx, sz, delta );
   31417             break;
   31418          default:
   31419             vassert(0);
   31420       }
   31421    } else {
   31422       /* VEX prefixed instruction */
   31423       /* Sloppy Intel wording: "An instruction encoded with a VEX.128
   31424          prefix that loads a YMM register operand ..." zeroes out bits
   31425          128 and above of the register. */
   31426       Bool uses_vvvv = False;
   31427       switch (esc) {
   31428          case ESC_0F:
   31429             delta = dis_ESC_0F__VEX ( &dres, &uses_vvvv,
   31430                                       resteerOkFn, resteerCisOk,
   31431                                       callback_opaque,
   31432                                       archinfo, vbi, pfx, sz, delta );
   31433             break;
   31434          case ESC_0F38:
   31435             delta = dis_ESC_0F38__VEX ( &dres, &uses_vvvv,
   31436                                         resteerOkFn, resteerCisOk,
   31437                                         callback_opaque,
   31438                                         archinfo, vbi, pfx, sz, delta );
   31439             break;
   31440          case ESC_0F3A:
   31441             delta = dis_ESC_0F3A__VEX ( &dres, &uses_vvvv,
   31442                                         resteerOkFn, resteerCisOk,
   31443                                         callback_opaque,
   31444                                         archinfo, vbi, pfx, sz, delta );
   31445             break;
   31446          case ESC_NONE:
   31447             /* The presence of a VEX prefix, by Intel definition,
   31448                always implies at least an 0F escape. */
   31449             goto decode_failure;
   31450          default:
   31451             vassert(0);
   31452       }
   31453       /* If the insn doesn't use VEX.vvvv then it must be all ones.
   31454          Check this. */
   31455       if (!uses_vvvv) {
   31456          if (getVexNvvvv(pfx) != 0)
   31457             goto decode_failure;
   31458       }
   31459    }
   31460 
   31461    vassert(delta - delta_at_primary_opcode >= 0);
   31462    vassert(delta - delta_at_primary_opcode < 16/*let's say*/);
   31463 
   31464    /* Use delta == delta_at_primary_opcode to denote decode failure.
   31465       This implies that any successful decode must use at least one
   31466       byte up. */
   31467    if (delta == delta_at_primary_opcode)
   31468       goto decode_failure;
   31469    else
   31470       goto decode_success; /* \o/ */
   31471 
   31472 #if 0 /* XYZZY */
   31473 
   31474    /* ---------------------------------------------------- */
   31475    /* --- The SSE/SSE2 decoder.                        --- */
   31476    /* ---------------------------------------------------- */
   31477 
   31478    /* What did I do to deserve SSE ?  Perhaps I was really bad in a
   31479       previous life? */
   31480 
   31481    /* Note, this doesn't handle SSE3 right now.  All amd64s support
   31482       SSE2 as a minimum so there is no point distinguishing SSE1 vs
   31483       SSE2. */
   31484 
   31485    insn = (UChar*)&guest_code[delta];
   31486 
   31487    /* FXSAVE is spuriously at the start here only because it is
   31488       thusly placed in guest-x86/toIR.c. */
   31489 
   31490    /* ------ SSE decoder main ------ */
   31491 
   31492    /* ---------------------------------------------------- */
   31493    /* --- end of the SSE decoder.                      --- */
   31494    /* ---------------------------------------------------- */
   31495 
   31496    /* ---------------------------------------------------- */
   31497    /* --- start of the SSE2 decoder.                   --- */
   31498    /* ---------------------------------------------------- */
   31499 
   31500    /* ---------------------------------------------------- */
   31501    /* --- end of the SSE/SSE2 decoder.                 --- */
   31502    /* ---------------------------------------------------- */
   31503 
   31504    /* ---------------------------------------------------- */
   31505    /* --- start of the SSE3 decoder.                   --- */
   31506    /* ---------------------------------------------------- */
   31507 
   31508    /* ---------------------------------------------------- */
   31509    /* --- end of the SSE3 decoder.                     --- */
   31510    /* ---------------------------------------------------- */
   31511 
   31512    /* ---------------------------------------------------- */
   31513    /* --- start of the SSSE3 decoder.                  --- */
   31514    /* ---------------------------------------------------- */
   31515 
   31516    /* ---------------------------------------------------- */
   31517    /* --- end of the SSSE3 decoder.                    --- */
   31518    /* ---------------------------------------------------- */
   31519 
   31520    /* ---------------------------------------------------- */
   31521    /* --- start of the SSE4 decoder                    --- */
   31522    /* ---------------------------------------------------- */
   31523 
   31524    /* ---------------------------------------------------- */
   31525    /* --- end of the SSE4 decoder                      --- */
   31526    /* ---------------------------------------------------- */
   31527 
   31528    /*after_sse_decoders:*/
   31529 
   31530    /* Get the primary opcode. */
   31531    opc = getUChar(delta); delta++;
   31532 
   31533    /* We get here if the current insn isn't SSE, or this CPU doesn't
   31534       support SSE. */
   31535 
   31536    switch (opc) {
   31537 
   31538    /* ------------------------ Control flow --------------- */
   31539 
   31540    /* ------------------------ CWD/CDQ -------------------- */
   31541 
   31542    /* ------------------------ FPU ops -------------------- */
   31543 
   31544    /* ------------------------ INT ------------------------ */
   31545 
   31546    case 0xCD: { /* INT imm8 */
   31547       IRJumpKind jk = Ijk_Boring;
   31548       if (have66orF2orF3(pfx)) goto decode_failure;
   31549       d64 = getUChar(delta); delta++;
   31550       switch (d64) {
   31551          case 32: jk = Ijk_Sys_int32; break;
   31552          default: goto decode_failure;
   31553       }
   31554       guest_RIP_next_mustcheck = True;
   31555       guest_RIP_next_assumed = guest_RIP_bbstart + delta;
   31556       jmp_lit(jk, guest_RIP_next_assumed);
   31557       /* It's important that all ArchRegs carry their up-to-date value
   31558          at this point.  So we declare an end-of-block here, which
   31559          forces any TempRegs caching ArchRegs to be flushed. */
   31560       vassert(dres.whatNext == Dis_StopHere);
   31561       DIP("int $0x%02x\n", (UInt)d64);
   31562       break;
   31563    }
   31564 
   31565    /* ------------------------ Jcond, byte offset --------- */
   31566 
   31567    /* ------------------------ IMUL ----------------------- */
   31568 
   31569    /* ------------------------ MOV ------------------------ */
   31570 
   31571    /* ------------------------ MOVx ------------------------ */
   31572 
   31573    /* ------------------------ opl imm, A ----------------- */
   31574 
   31575    /* ------------------------ opl Ev, Gv ----------------- */
   31576 
   31577    /* ------------------------ opl Gv, Ev ----------------- */
   31578 
   31579    /* ------------------------ POP ------------------------ */
   31580 
   31581    /* ------------------------ PUSH ----------------------- */
   31582 
   31583    /* ------ AE: SCAS variants ------ */
   31584 
   31585    /* ------ A6, A7: CMPS variants ------ */
   31586 
   31587    /* ------ AA, AB: STOS variants ------ */
   31588 
   31589    /* ------ A4, A5: MOVS variants ------ */
   31590 
   31591    /* ------------------------ XCHG ----------------------- */
   31592 
   31593    /* ------------------------ IN / OUT ----------------------- */
   31594 
   31595    /* ------------------------ (Grp1 extensions) ---------- */
   31596 
   31597    /* ------------------------ (Grp2 extensions) ---------- */
   31598 
   31599    /* ------------------------ (Grp3 extensions) ---------- */
   31600 
   31601    /* ------------------------ (Grp4 extensions) ---------- */
   31602 
   31603    /* ------------------------ (Grp5 extensions) ---------- */
   31604 
   31605    /* ------------------------ Escapes to 2-byte opcodes -- */
   31606 
   31607    case 0x0F: {
   31608       opc = getUChar(delta); delta++;
   31609       switch (opc) {
   31610 
   31611       /* =-=-=-=-=-=-=-=-=- Grp8 =-=-=-=-=-=-=-=-=-=-=-= */
   31612 
   31613       /* =-=-=-=-=-=-=-=-=- BSF/BSR -=-=-=-=-=-=-=-=-=-= */
   31614 
   31615       /* =-=-=-=-=-=-=-=-=- BSWAP -=-=-=-=-=-=-=-=-=-=-= */
   31616 
   31617       /* =-=-=-=-=-=-=-=-=- BT/BTS/BTR/BTC =-=-=-=-=-=-= */
   31618 
   31619       /* =-=-=-=-=-=-=-=-=- CMOV =-=-=-=-=-=-=-=-=-=-=-= */
   31620 
   31621       /* =-=-=-=-=-=-=-=-=- CMPXCHG -=-=-=-=-=-=-=-=-=-= */
   31622 
   31623       /* =-=-=-=-=-=-=-=-=- CPUID -=-=-=-=-=-=-=-=-=-=-= */
   31624 
   31625       /* =-=-=-=-=-=-=-=-=- MOVZX, MOVSX =-=-=-=-=-=-=-= */
   31626 
   31627       /* =-=-=-=-=-=-=-=-=- MUL/IMUL =-=-=-=-=-=-=-=-=-= */
   31628 
   31629       /* =-=-=-=-=-=-=-=-=- NOPs =-=-=-=-=-=-=-=-=-=-=-= */
   31630 
   31631       /* =-=-=-=-=-=-=-=-=- Jcond d32 -=-=-=-=-=-=-=-=-= */
   31632 
   31633       /* =-=-=-=-=-=-=-=-=- PREFETCH =-=-=-=-=-=-=-=-=-= */
   31634 
   31635       /* =-=-=-=-=-=-=-=-=- RDTSC -=-=-=-=-=-=-=-=-=-=-= */
   31636 
   31637       /* =-=-=-=-=-=-=-=-=- SETcc Eb =-=-=-=-=-=-=-=-=-= */
   31638 
   31639       /* =-=-=-=-=-=-=-=-=- SHLD/SHRD -=-=-=-=-=-=-=-=-= */
   31640 
   31641       /* =-=-=-=-=-=-=-=-=- SYSCALL -=-=-=-=-=-=-=-=-=-= */
   31642 
   31643       /* =-=-=-=-=-=-=-=-=- XADD -=-=-=-=-=-=-=-=-=-= */
   31644 
   31645       /* =-=-=-=-=-=-=-=-=- SGDT and SIDT =-=-=-=-=-=-=-=-=-=-= */
   31646 
   31647       /* =-=-=-=-=-=-=-=-=- unimp2 =-=-=-=-=-=-=-=-=-=-= */
   31648 
   31649       default:
   31650          goto decode_failure;
   31651    } /* switch (opc) for the 2-byte opcodes */
   31652    goto decode_success;
   31653    } /* case 0x0F: of primary opcode */
   31654 
   31655    /* ------------------------ ??? ------------------------ */
   31656 #endif /* XYZZY */
   31657 
   31658      //default:
   31659   decode_failure:
   31660    /* All decode failures end up here. */
   31661    if (sigill_diag) {
   31662       vex_printf("vex amd64->IR: unhandled instruction bytes: "
   31663                  "0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
   31664                  (Int)getUChar(delta_start+0),
   31665                  (Int)getUChar(delta_start+1),
   31666                  (Int)getUChar(delta_start+2),
   31667                  (Int)getUChar(delta_start+3),
   31668                  (Int)getUChar(delta_start+4),
   31669                  (Int)getUChar(delta_start+5),
   31670                  (Int)getUChar(delta_start+6),
   31671                  (Int)getUChar(delta_start+7) );
   31672       vex_printf("vex amd64->IR:   REX=%d REX.W=%d REX.R=%d REX.X=%d REX.B=%d\n",
   31673                  haveREX(pfx) ? 1 : 0, getRexW(pfx), getRexR(pfx),
   31674                  getRexX(pfx), getRexB(pfx));
   31675       vex_printf("vex amd64->IR:   VEX=%d VEX.L=%d VEX.nVVVV=0x%x ESC=%s\n",
   31676                  haveVEX(pfx) ? 1 : 0, getVexL(pfx),
   31677                  getVexNvvvv(pfx),
   31678                  esc==ESC_NONE ? "NONE" :
   31679                    esc==ESC_0F ? "0F" :
   31680                    esc==ESC_0F38 ? "0F38" :
   31681                    esc==ESC_0F3A ? "0F3A" : "???");
   31682       vex_printf("vex amd64->IR:   PFX.66=%d PFX.F2=%d PFX.F3=%d\n",
   31683                  have66(pfx) ? 1 : 0, haveF2(pfx) ? 1 : 0,
   31684                  haveF3(pfx) ? 1 : 0);
   31685    }
   31686 
   31687    /* Tell the dispatcher that this insn cannot be decoded, and so has
   31688       not been executed, and (is currently) the next to be executed.
   31689       RIP should be up-to-date since it made so at the start of each
   31690       insn, but nevertheless be paranoid and update it again right
   31691       now. */
   31692    stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_curr_instr) ) );
   31693    jmp_lit(&dres, Ijk_NoDecode, guest_RIP_curr_instr);
   31694    vassert(dres.whatNext == Dis_StopHere);
   31695    dres.len = 0;
   31696    /* We also need to say that a CAS is not expected now, regardless
   31697       of what it might have been set to at the start of the function,
   31698       since the IR that we've emitted just above (to synthesis a
   31699       SIGILL) does not involve any CAS, and presumably no other IR has
   31700       been emitted for this (non-decoded) insn. */
   31701    *expect_CAS = False;
   31702    return dres;
   31703 
   31704    //   } /* switch (opc) for the main (primary) opcode switch. */
   31705 
   31706   decode_success:
   31707    /* All decode successes end up here. */
   31708    switch (dres.whatNext) {
   31709       case Dis_Continue:
   31710          stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_bbstart + delta) ) );
   31711          break;
   31712       case Dis_ResteerU:
   31713       case Dis_ResteerC:
   31714          stmt( IRStmt_Put( OFFB_RIP, mkU64(dres.continueAt) ) );
   31715          break;
   31716       case Dis_StopHere:
   31717          break;
   31718       default:
   31719          vassert(0);
   31720    }
   31721 
   31722    DIP("\n");
   31723    dres.len = (Int)toUInt(delta - delta_start);
   31724    return dres;
   31725 }
   31726 
   31727 #undef DIP
   31728 #undef DIS
   31729 
   31730 
   31731 /*------------------------------------------------------------*/
   31732 /*--- Top-level fn                                         ---*/
   31733 /*------------------------------------------------------------*/
   31734 
   31735 /* Disassemble a single instruction into IR.  The instruction
   31736    is located in host memory at &guest_code[delta]. */
   31737 
   31738 DisResult disInstr_AMD64 ( IRSB*        irsb_IN,
   31739                            Bool         (*resteerOkFn) ( void*, Addr64 ),
   31740                            Bool         resteerCisOk,
   31741                            void*        callback_opaque,
   31742                            UChar*       guest_code_IN,
   31743                            Long         delta,
   31744                            Addr64       guest_IP,
   31745                            VexArch      guest_arch,
   31746                            VexArchInfo* archinfo,
   31747                            VexAbiInfo*  abiinfo,
   31748                            Bool         host_bigendian_IN,
   31749                            Bool         sigill_diag_IN )
   31750 {
   31751    Int       i, x1, x2;
   31752    Bool      expect_CAS, has_CAS;
   31753    DisResult dres;
   31754 
   31755    /* Set globals (see top of this file) */
   31756    vassert(guest_arch == VexArchAMD64);
   31757    guest_code           = guest_code_IN;
   31758    irsb                 = irsb_IN;
   31759    host_is_bigendian    = host_bigendian_IN;
   31760    guest_RIP_curr_instr = guest_IP;
   31761    guest_RIP_bbstart    = guest_IP - delta;
   31762 
   31763    /* We'll consult these after doing disInstr_AMD64_WRK. */
   31764    guest_RIP_next_assumed   = 0;
   31765    guest_RIP_next_mustcheck = False;
   31766 
   31767    x1 = irsb_IN->stmts_used;
   31768    expect_CAS = False;
   31769    dres = disInstr_AMD64_WRK ( &expect_CAS, resteerOkFn,
   31770                                resteerCisOk,
   31771                                callback_opaque,
   31772                                delta, archinfo, abiinfo, sigill_diag_IN );
   31773    x2 = irsb_IN->stmts_used;
   31774    vassert(x2 >= x1);
   31775 
   31776    /* If disInstr_AMD64_WRK tried to figure out the next rip, check it
   31777       got it right.  Failure of this assertion is serious and denotes
   31778       a bug in disInstr. */
   31779    if (guest_RIP_next_mustcheck
   31780        && guest_RIP_next_assumed != guest_RIP_curr_instr + dres.len) {
   31781       vex_printf("\n");
   31782       vex_printf("assumed next %%rip = 0x%llx\n",
   31783                  guest_RIP_next_assumed );
   31784       vex_printf(" actual next %%rip = 0x%llx\n",
   31785                  guest_RIP_curr_instr + dres.len );
   31786       vpanic("disInstr_AMD64: disInstr miscalculated next %rip");
   31787    }
   31788 
   31789    /* See comment at the top of disInstr_AMD64_WRK for meaning of
   31790       expect_CAS.  Here, we (sanity-)check for the presence/absence of
   31791       IRCAS as directed by the returned expect_CAS value. */
   31792    has_CAS = False;
   31793    for (i = x1; i < x2; i++) {
   31794       if (irsb_IN->stmts[i]->tag == Ist_CAS)
   31795          has_CAS = True;
   31796    }
   31797 
   31798    if (expect_CAS != has_CAS) {
   31799       /* inconsistency detected.  re-disassemble the instruction so as
   31800          to generate a useful error message; then assert. */
   31801       vex_traceflags |= VEX_TRACE_FE;
   31802       dres = disInstr_AMD64_WRK ( &expect_CAS, resteerOkFn,
   31803                                   resteerCisOk,
   31804                                   callback_opaque,
   31805                                   delta, archinfo, abiinfo, sigill_diag_IN );
   31806       for (i = x1; i < x2; i++) {
   31807          vex_printf("\t\t");
   31808          ppIRStmt(irsb_IN->stmts[i]);
   31809          vex_printf("\n");
   31810       }
   31811       /* Failure of this assertion is serious and denotes a bug in
   31812          disInstr. */
   31813       vpanic("disInstr_AMD64: inconsistency in LOCK prefix handling");
   31814    }
   31815 
   31816    return dres;
   31817 }
   31818 
   31819 
   31820 /*------------------------------------------------------------*/
   31821 /*--- Unused stuff                                         ---*/
   31822 /*------------------------------------------------------------*/
   31823 
   31824 // A potentially more Memcheck-friendly version of gen_LZCNT, if
   31825 // this should ever be needed.
   31826 //
   31827 //static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
   31828 //{
   31829 //   /* Scheme is simple: propagate the most significant 1-bit into all
   31830 //      lower positions in the word.  This gives a word of the form
   31831 //      0---01---1.  Now invert it, giving a word of the form
   31832 //      1---10---0, then do a population-count idiom (to count the 1s,
   31833 //      which is the number of leading zeroes, or the word size if the
    31834 //      original word was 0).
   31835 //   */
   31836 //   Int i;
   31837 //   IRTemp t[7];
   31838 //   for (i = 0; i < 7; i++) {
   31839 //      t[i] = newTemp(ty);
   31840 //   }
   31841 //   if (ty == Ity_I64) {
   31842 //      assign(t[0], binop(Iop_Or64, mkexpr(src),
   31843 //                                   binop(Iop_Shr64, mkexpr(src),  mkU8(1))));
   31844 //      assign(t[1], binop(Iop_Or64, mkexpr(t[0]),
   31845 //                                   binop(Iop_Shr64, mkexpr(t[0]), mkU8(2))));
   31846 //      assign(t[2], binop(Iop_Or64, mkexpr(t[1]),
   31847 //                                   binop(Iop_Shr64, mkexpr(t[1]), mkU8(4))));
   31848 //      assign(t[3], binop(Iop_Or64, mkexpr(t[2]),
   31849 //                                   binop(Iop_Shr64, mkexpr(t[2]), mkU8(8))));
   31850 //      assign(t[4], binop(Iop_Or64, mkexpr(t[3]),
   31851 //                                   binop(Iop_Shr64, mkexpr(t[3]), mkU8(16))));
   31852 //      assign(t[5], binop(Iop_Or64, mkexpr(t[4]),
   31853 //                                   binop(Iop_Shr64, mkexpr(t[4]), mkU8(32))));
   31854 //      assign(t[6], unop(Iop_Not64, mkexpr(t[5])));
   31855 //      return gen_POPCOUNT(ty, t[6]);
   31856 //   }
   31857 //   if (ty == Ity_I32) {
   31858 //      assign(t[0], binop(Iop_Or32, mkexpr(src),
   31859 //                                   binop(Iop_Shr32, mkexpr(src),  mkU8(1))));
   31860 //      assign(t[1], binop(Iop_Or32, mkexpr(t[0]),
   31861 //                                   binop(Iop_Shr32, mkexpr(t[0]), mkU8(2))));
   31862 //      assign(t[2], binop(Iop_Or32, mkexpr(t[1]),
   31863 //                                   binop(Iop_Shr32, mkexpr(t[1]), mkU8(4))));
   31864 //      assign(t[3], binop(Iop_Or32, mkexpr(t[2]),
   31865 //                                   binop(Iop_Shr32, mkexpr(t[2]), mkU8(8))));
   31866 //      assign(t[4], binop(Iop_Or32, mkexpr(t[3]),
   31867 //                                   binop(Iop_Shr32, mkexpr(t[3]), mkU8(16))));
   31868 //      assign(t[5], unop(Iop_Not32, mkexpr(t[4])));
   31869 //      return gen_POPCOUNT(ty, t[5]);
   31870 //   }
   31871 //   if (ty == Ity_I16) {
   31872 //      assign(t[0], binop(Iop_Or16, mkexpr(src),
   31873 //                                   binop(Iop_Shr16, mkexpr(src),  mkU8(1))));
   31874 //      assign(t[1], binop(Iop_Or16, mkexpr(t[0]),
   31875 //                                   binop(Iop_Shr16, mkexpr(t[0]), mkU8(2))));
   31876 //      assign(t[2], binop(Iop_Or16, mkexpr(t[1]),
   31877 //                                   binop(Iop_Shr16, mkexpr(t[1]), mkU8(4))));
   31878 //      assign(t[3], binop(Iop_Or16, mkexpr(t[2]),
   31879 //                                   binop(Iop_Shr16, mkexpr(t[2]), mkU8(8))));
   31880 //      assign(t[4], unop(Iop_Not16, mkexpr(t[3])));
   31881 //      return gen_POPCOUNT(ty, t[4]);
   31882 //   }
   31883 //   vassert(0);
   31884 //}
   31885 
   31886 
   31887 /*--------------------------------------------------------------------*/
   31888 /*--- end                                       guest_amd64_toIR.c ---*/
   31889 /*--------------------------------------------------------------------*/
   31890